# Config file for automatic testing at travis-ci.org

language: python
python: 2.7

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
  - pip install Scrapy docopt

# command to run tests, e.g. python setup.py test
script:
  - nosetests tests

notifications:
  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
FourmiCrawler/items.py  (+1 -3)

-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field
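
The Result item itself is not part of this diff, but the pipelines below read the fields 'attribute', 'value' and 'conditions' from it. A minimal sketch of what such an item definition could look like (an assumed field set, not the project's actual items.py):

    from scrapy.item import Item, Field


    class Result(Item):
        # Fields referenced by the pipelines in this diff; the real Result may define more.
        attribute = Field()
        value = Field()
        conditions = Field()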
FourmiCrawler/pipelines.py  (+26 -9)

-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
+
from scrapy.exceptions import DropItem


+class RemoveNonePipeline(object):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def process_item(item, spider):
+        """
+        Processes the items so that None values are replaced by empty strings.
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: The item, with every None value replaced by ""
+        """
+        for key in item:
+            if item[key] is None:
+                item[key] = ""
+        return item
+
+
class DuplicatePipeline(object):
    def __init__(self):
        self.known_values = set()
...
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
        else:
            self.known_values.add(value)
            return item


class AttributeSelectionPipeline(object):
    def __init__(self):
-        pass;
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider;
        items that don't match the selected attributes are dropped.
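
For a quick feel of what these pipelines do, they can be exercised outside Scrapy's engine (a standalone sketch; in a real crawl the engine calls process_item in the priority order configured in settings.py, shown below):

    # Hypothetical, direct use of the pipelines above on a dict-like item.
    nones = RemoveNonePipeline()
    duplicates = DuplicatePipeline()

    item = {'attribute': 'Melting point', 'value': '0 C', 'conditions': None}
    item = nones.process_item(item, spider=None)       # 'conditions' becomes ""
    item = duplicates.process_item(item, spider=None)  # first occurrence passes through
    # Feeding an identical item through DuplicatePipeline a second time would raise DropItem.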
FourmiCrawler/settings.py  (+4 -3)

...
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'FourmiCrawler'
...
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
-    'FourmiCrawler.pipelines.DuplicatePipeline': 200,
+    'FourmiCrawler.pipelines.RemoveNonePipeline': 100,
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
}
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'
FourmiCrawler/sources/ChemSpider.py  (+6 -5)

-from source import Source
+import re
+
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
+
+from source import Source
from FourmiCrawler.items import Result
-import re

# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
...
        prop_conditions = ''

        # Test for properties without values, with one hardcoded exception
-        if (not re.match(r'^\d', prop_value) or
-                (prop_name == 'Polarizability' and
-                 prop_value == '10-24cm3')):
+        if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
            continue

        # Match for condition in parentheses
FourmiCrawler/sources/source.py

...
    _spider = None

    def __init__(self):
+        """
+        Initiation of a new Source.
+        """
        pass

+    def parse(self, response):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param response: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
+        """
+        This function should return a Scrapy Request for the given compound name.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
        # return Request(url=self.website[:-1] + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
+        """
+        A function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
        self._spider = spider
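
The docstrings above spell out the contract a concrete source has to fulfil: a website regex used for URL dispatch, a parse() that turns a matching Response into Result items (and possibly follow-up Requests), and a new_compound_request() that builds the first Request for a compound. A minimal sketch of such a subclass (a hypothetical ExampleSource, not part of this diff; the Result fields mirror the ones used by the pipelines):

    # Hypothetical source, assumed to live in FourmiCrawler/sources/ next to the other sources.
    from scrapy.http import Request

    from source import Source
    from FourmiCrawler.items import Result


    class ExampleSource(Source):
        # The regex that incoming response URLs are matched against (illustrative URL).
        website = "http://example.com/.*"

        def parse(self, response):
            # Turn a matching page into one or more Result items.
            result = Result()
            result['attribute'] = 'Molecular weight'
            result['value'] = '18.02 g/mol'
            result['conditions'] = ''
            return [result]

        def new_compound_request(self, compound):
            # Build the first Request for a compound; the response comes back to parse().
            return Request(url="http://example.com/" + compound, callback=self.parse)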
FourmiCrawler/spider.py  (+54 -20)

+import re
+
from scrapy.spider import Spider
from scrapy import log
-import re


class FourmiSpider(Spider):
+    """
+    A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
    name = "FourmiSpider"
-    __parsers = []
-    synonyms = []
+    _sources = []
+    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.synonyms.add(compound)
+        self.selected_attributes = selected_attributes

-    def parse(self, reponse):
-        for parser in self.__parsers:
-            if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(reponse)
+    def parse(self, response):
+        """
+        The function that is called when a response to a request is available. It distributes the response to a
+        source which should be able to handle parsing the data.
+        :param response: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Requests to be handled by the Scrapy core.
+        """
+        for source in self._sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
        return None

    def get_synonym_requests(self, compound):
+        """
+        A function that generates new Scrapy Requests for each source, given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
        requests = []
-        for parser in self.__parsers:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
        return requests

    def start_requests(self):
+        """
+        The function called by Scrapy for its first Requests.
+        :return: A list of Scrapy Requests generated from the known synonyms using the available sources.
+        """
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests

-    def add_parsers(self, parsers):
-        for parser in parsers:
-            self.add_parser(parser)
+    def add_sources(self, sources):
+        """
+        A function to add new Source objects to the list of available sources.
+        :param sources: A list of Source objects.
+        """
+        for parser in sources:
+            self.add_source(parser)

-    def add_parser(self, parser):
-        self.__parsers.append(parser)
-        parser.set_spider(self)
+    def add_source(self, source):
+        """
+        A function to add a new Source object to the list of available sources.
+        :param source: A Source object
+        """
+        self._sources.append(source)
+        source.set_spider(self)
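
Outside a real crawl, the wiring between the spider and its sources can be sketched roughly like this (hypothetical, reusing the ExampleSource sketch from the source.py section above; actual runs go through setup_crawler() in fourmi.py below):

    from scrapy.http import Response

    spider = FourmiSpider(compound="water")
    spider.add_sources([ExampleSource()])

    # parse() picks the source whose website regex matches the response URL
    # and hands the response over to that source.
    items = spider.parse(Response(url="http://example.com/water"))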
README.md  (+4)

# Fourmi

+**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)
+
+**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)
+
Fourmi is a web scraper for chemical substances. The program is designed to be
used as a search engine to search multiple chemical databases for a specific
substance. The program will produce all available attributes of the substance
fourmi.py  (+28 -6)

#!/usr/bin/env python
"""
Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).
...
from sourceloader import SourceLoader


-def setup_crawler(searchable, settings, source_loader, attributes):
-    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
-    spider.add_parsers(source_loader.sources)
+def setup_crawler(compound, settings, source_loader, attributes):
+    """
+    This function prepares and starts the crawler, which starts the actual search on the internet.
+    :param compound: The compound which should be searched
+    :param settings: A Scrapy settings object
+    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
+    :param attributes: A list of regular expressions which the attribute names should match.
+    """
+    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
+    spider.add_sources(source_loader.sources)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
...
def scrapy_settings_manipulation(docopt_arguments):
+    """
+    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
+    project these are command line arguments.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
    settings = get_project_settings()
-    # [todo] - add at least a warning for files that already exist
+
    if docopt_arguments["--output"] != 'result.*format*':
        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
    elif docopt_arguments["--format"] == "jsonlines":
...
def start_log(docopt_arguments):
+    """
+    This function starts the logging functionality of Scrapy, using the settings given by the CLI.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
    if docopt_arguments["--log"] is not None:
        if docopt_arguments["--verbose"]:
            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
...
def search(docopt_arguments, source_loader):
+    """
+    The function that facilitates the search for a specific compound.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
+    """
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
    reactor.run()


+# The start of the Fourmi command line interface.
if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
    loader = SourceLoader()

    if arguments["--include"]:
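
The docopt dictionary that drives search() can also be built by hand, which makes the flow above easy to follow (illustrative only; the authoritative usage string is the docopt docstring at the top of fourmi.py, which this diff does not show, and a real docopt dictionary would contain further keys such as "--include"):

    # Hypothetical direct call, bypassing the command line parser.
    arguments = {
        "<compound>": "Methane",
        "--attributes": ".*",
        "--output": "result.*format*",   # the default sentinel, so FEED_URI is left alone
        "--format": "jsonlines",
        "--log": None,
        "--verbose": False,
    }
    search(arguments, SourceLoader())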
setup.py  (+18)

import sys
from cx_Freeze import setup, Executable

# After running the setup file (python setup.py build), the scrapy/VERSION file has to be put into library.zip by
# hand, and the FourmiCrawler folder has to be copied to both the library and the exe.win32-2.7 folder. After
# putting the files in the library, the library has to be zipped again and replace the old library.zip.
# Dependencies are automatically detected, but they might need fine-tuning.
build_exe_options = {
    "packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"],
    "excludes": []}

# GUI applications require a different base on Windows (the default is for a
# console application).
base = None

setup(name="Scrapy",
      version="0.1",
      description="My GUI application!",
      options={"build_exe": build_exe_options},
      executables=[Executable("fourmi.py", base=base)])
sourceloader.py  (+23 -4)

import inspect
+import sys
import os
import re
+
from FourmiCrawler.sources.source import Source

...
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
-        path = os.path.dirname(os.path.abspath(__file__))
+
+        if hasattr(sys, 'frozen'):
+            path = os.path.dirname(sys.executable)
+        else:
+            path = os.path.dirname(os.path.abspath(__file__))
+
        path += "/" + rel_dir
        known_parser = set()

        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
-                    known_parser.add(cls)
+                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    # known_parser.add(cls)

    def include(self, source_names):
+        """
+        This function keeps only the sources whose class names match one of the given regular expressions.
+        :param source_names: A list of regular expressions (strings)
+        """
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
+        """
+        This function excludes all sources whose class names match one of the given regular expressions.
+        :param source_names: A list of regular expressions (strings)
+        """
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
+        """
+        This function returns a string with all sources currently available in the SourceLoader.
+        :return: a string with all available sources.
+        """
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
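
The include/exclude filters above work on source class names, so the loader can be narrowed down roughly like this (a sketch; it assumes the ChemSpider source shown earlier in this diff defines a class named ChemSpider and that the FourmiCrawler/sources directory is reachable from the working directory):

    loader = SourceLoader()
    loader.include(['ChemSpider'])   # keep only sources whose class name matches this regex
    print(loader)                    # __str__ lists the sources that are still selected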