···
 #Python Specific ignores
 *.pyc

+#may contain authentication information
+sources.cfg
+#Another of our config files
+GUI.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
+23
.travis.yml
# Config file for automatic testing at travis-ci.org

language: python
python: 2.7

before_install:
  - "export DISPLAY=:99.0"
  - "sh -e /etc/init.d/xvfb start"

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
  - pip install Scrapy docopt
  - pip install coveralls

# command to run tests, e.g. python setup.py test
script:
  - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests

notifications:
  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM

after_success:
  coveralls --verbose
+20
Changelog.md
### v0.6.0
- FEATURE: Added a Graphical User Interface
- FEATURE: Automatic config file creation from the config samples
- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
- FIX: A lot of bug fixes in the PubChem plugin, as it wasn't working as it should
- FIX: Using absolute paths for configuration files
- DEV: General code cleanup in the documentation

### v0.5.3
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
- FIX: Logging is now "actually" disabled if not using the verbose option.
- FEATURE: Added support for PubChem

### v0.5.2
- FIX: Signatures used to contain untracked and older files; the current signature
should be correct.

### v0.5.1
- UPDATED: Logging functionality from the command line
- DEV: Code cleanup and extra tests
-31
Fourmi.py
#!/usr/bin/env python
"""
Fourmi - An internet webcrawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spider import FourmiSpider
from scrapy.utils.project import get_project_settings


def setup_crawler(searchable):
    # [TODO] - Initiate all parsers for the different websites and get
    # allowed URLs.
    spider = FourmiSpider(compound=searchable)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    setup_crawler("Methane")
    log.start()
    reactor.run()

start()
+1-3
FourmiCrawler/items.py
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html

 from scrapy.item import Item, Field
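The hunk above only shows the changed comment; the Result item definition itself sits further down in items.py and is not part of this diff. Based on how Result is instantiated throughout this changeset (ChemSpider, NIST, PubChem, the Wikipedia parser and the pipelines all use the same five keys), its fields presumably look like the sketch below — a reconstruction for orientation, not the literal file contents:

```python
# Hypothetical reconstruction of the Result item, inferred from the five keys
# ('attribute', 'value', 'source', 'reliability', 'conditions') used throughout this diff.
from scrapy.item import Item, Field


class Result(Item):
    attribute = Field()
    value = Field()
    source = Field()
    reliability = Field()
    conditions = Field()
```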
FourmiCrawler/parsers/__init__.py
This is a binary file and will not be displayed.
-9
FourmiCrawler/parsers/parser.py
from scrapy import log


class Parser:
    website = "http://localhost/*"

    def parse(self, reponse):
        log.msg("The parse function of the empty parser was used.", level=log.Warning)
        pass
+43-7
FourmiCrawler/pipelines.py
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import re
+
 from scrapy.exceptions import DropItem


-class FourmiPipeline(object):
+class RemoveNonePipeline(object):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def process_item(item, spider):
+        """
+        Processes the items so that None values are replaced by empty strings.
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: the item, with None values replaced by empty strings
+        """
+        for key in item:
+            if item[key] is None:
+                item[key] = ""
+        return item

+
+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()

···
         :param spider: The spider which scraped the item
         :return: :raise DropItem: Returns the item if unique or drops it if it is already known
         """
-        value = item['attribute'], item['value']
+        value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
             return item
+
+
+class AttributeSelectionPipeline(object):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def process_item(item, spider):
+        """
+        The items are processed using the selected attribute list available in the spider;
+        items that don't match the selected attributes are dropped.
+        :param item: The incoming item
+        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
+        :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
+        """
+        if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
+            return item
+        else:
+            raise DropItem("Attribute not selected by user: %s" % item)
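A minimal sketch of how a single scraped value passes through the three pipelines in the order configured in FourmiCrawler/settings.py below (RemoveNone at 100, AttributeSelection at 200, Duplicate at 300). The plain dict and FakeSpider are stand-ins for a real Result item and the FourmiSpider; this is illustration only:

```python
# Illustration only: a dict stands in for a Result item, FakeSpider for the real spider.
from FourmiCrawler.pipelines import (RemoveNonePipeline, AttributeSelectionPipeline,
                                     DuplicatePipeline)


class FakeSpider(object):
    selected_attributes = ["Boiling.*"]

spider = FakeSpider()
item = {'attribute': 'Boiling Point', 'value': '329.8 K', 'source': 'NIST',
        'reliability': 'High', 'conditions': None}

item = RemoveNonePipeline.process_item(item, spider)          # 'conditions' becomes ''
item = AttributeSelectionPipeline.process_item(item, spider)  # kept: matches "Boiling.*"
duplicates = DuplicatePipeline()
item = duplicates.process_item(item, spider)                  # first occurrence passes
# Feeding the same attribute/value/conditions combination again would raise DropItem here.
```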
+7-3
FourmiCrawler/settings.py
···
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #

 BOT_NAME = 'FourmiCrawler'
···
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.FourmiPipeline': 100
+    "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
 }
+FEED_URI = 'results.json'
+FEED_FORMAT = 'jsonlines'

 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Fourmi'
+298
FourmiCrawler/sources/ChemSpider.py
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

class ChemSpider(Source):
    """
    ChemSpider scraper for synonyms and properties
    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    website = 'http://www\\.chemspider\\.com/.*'

    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

    def __init__(self, config=None):
        """
        Initialization of ChemSpider scraper
        :param config: a dictionary of settings for this scraper, must contain
        'reliability' key
        """
        Source.__init__(self, config)
        self.ignore_list = []
        if 'token' not in self.cfg or self.cfg['token'] == '':
            log.msg('ChemSpider token not set or empty, search/MassSpec API '
                    'not available', level=log.WARNING)
            self.cfg['token'] = ''
        self.search += self.cfg['token']
        self.extendedinfo += self.cfg['token']

    def parse(self, response):
        """
        This function is called when a Response matching the variable
        'website' is available for parsing the Response object.
        :param response: the Scrapy Response object to be parsed
        :return: a list of Result items and Request objects
        """
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
        requests.extend(requests_synonyms)
        requests_properties = self.parse_properties(sel)
        requests.extend(requests_properties)

        return requests

    def parse_properties(self, sel):
        """
        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        properties.extend(self.parse_acdlabstab(sel))
        properties.extend(self.parse_experimentaldatatab(sel))

        return properties

    def parse_acdlabstab(self, sel):
        """
        This function scrapes the 'Predicted ACD/Labs' tab under Properties
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # [:-1] is to remove the colon at the end, [TODO] - test for colon
            prop_name = prop_name.extract().encode('utf-8')[:-1]
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Test for properties without values, with one hardcoded exception
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                continue

            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = self.newresult(
                attribute=prop_name,
                value=prop_value,
                source='ChemSpider Predicted - ACD/Labs Tab',
                conditions=prop_conditions
            )
            properties.append(new_prop)

        return properties

    def parse_experimentaldatatab(self, sel):
        """
        This function scrapes the Experimental Data tab, Physico-chemical
        properties in particular.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: property name followed by a list of values
        property_name = scraped_list.pop(0).xpath(
            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                new_prop = self.newresult(
                    attribute=property_name[:-1],
                    value=line.xpath('text()').extract()[0].rstrip(),
                    source=line.xpath('strong/text()').extract()[0].rstrip(),
                )
                properties.append(new_prop)

        return properties

    def parse_synonyms(self, sel):
        """
        This function scrapes the list of Names and Identifiers
        :param sel: a Selector object of the whole page
        :return: a list of Requests
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                self._spider.get_synonym_requests(syn['name'])

        return requests

    def new_synonym(self, sel, name, category):
        """
        This function scrapes for a single synonym at a given HTML tag
        :param sel: a Selector object of the given HTML tag
        :param name: the name of the synonym in the tag
        :param category: the name of the category the synonym is labeled as
        :return: a dictionary containing data on the synonym
        """
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If language is not given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                'name': refname.extract()[0][1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': ref.xpath('@title').extract()[0],
                'URI': ref.xpath('@href').extract()[0]
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    def parse_extendedinfo(self, response):
        """
        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
        API, if a token is present in the configuration settings
        :param response: a Response object to be parsed
        :return: a list of Result items
        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = self.newresult(
                attribute=name,
                value=value,  # These values have no unit!
                source='ChemSpider ExtendedCompoundInfo',
            )
            if result['value']:
                properties.append(result)
        return properties

    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
        """
        This function abstracts from the Result item and provides default
        values.
        :param attribute: the name of the attribute
        :param value: the value of the attribute
        :param conditions: optional conditions regarding the value
        :param source: the name of the source if it is not ChemSpider
        :return: A Result item
        """
        return Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': self.cfg['reliability'],
            'conditions': conditions
        })

    def parse_searchrequest(self, response):
        """
        This function parses the initial response of the ChemSpider Search API.
        Requires a valid token to function.
        :param response: the Response object to be parsed
        :return: A Request for the information page and a Request for the
        extendedinfo API call
        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csids = sel.xpath('.//cs:int/text()').extract()
        if len(csids) == 0:
            log.msg('ChemSpider found nothing', level=log.ERROR)
            return
        elif len(csids) > 1:
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        """
        This function is called when a new synonym is returned to the spider
        to generate new requests
        :param compound: the name of the compound to search for
        """
        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)
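Both here and in the other sources, request URLs are built from the `website` regex by chopping off the trailing `.*` and unescaping the backslashes. A small sketch of that idiom, with a placeholder token and compound name (not real credentials):

```python
# Sketch of the URL-building idiom used above; 'YOUR-TOKEN' and 'methane' are placeholders.
website = 'http://www\\.chemspider\\.com/.*'
search = 'Search.asmx/SimpleSearch?query=%s&token=' + 'YOUR-TOKEN'

base = website[:-2].replace("\\", "")   # -> 'http://www.chemspider.com/'
searchurl = base + search % 'methane'
# -> 'http://www.chemspider.com/Search.asmx/SimpleSearch?query=methane&token=YOUR-TOKEN'
```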
+334
FourmiCrawler/sources/NIST.py
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
# Result item, but should be included eventually.

class NIST(Source):
    """
    NIST Scraper plugin
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    website = "http://webbook\\.nist\\.gov/.*"

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    def __init__(self, config=None):
        """
        Initialization of NIST scraper
        :param config: configuration variables for this scraper, must contain
        'reliability' key.
        """
        Source.__init__(self, config)
        self.ignore_list = set()

    def parse(self, response):
        """
        This function is called when a Response matching the variable
        'website' is available for parsing the Response object.
        :param response: The Scrapy Response object to be parsed
        :return: a list of Result items and Request objects
        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
        if title == 'Name Not Found':
            log.msg('NIST: Chemical not found!', level=log.ERROR)
            return
        if title not in self.ignore_list:
            self.ignore_list.update(title)
            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
            self._spider.get_synonym_requests(title)

        requests = []

        requests.extend(self.parse_generic_info(sel))

        symbol_table = {}
        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
            symbol = ''.join(symbol_td.xpath('node()').extract())
            name = name_td.xpath('text()').extract()[0]
            symbol_table[symbol] = name
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

        requests.extend(self.parse_tables(sel, symbol_table))

        return requests

    def parse_tables(self, sel, symbol_table):
        """
        This function identifies and distributes parsing of tables to other
        functions below.
        :param sel: A Selector object of the whole page
        :param symbol_table: a dictionary containing translations of raw HTML
        tags to human readable names
        :return: a list of Result items and Requests
        """
        requests = []

        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
                log.msg('NIST table: Aggregate data', level=log.DEBUG)
                requests.extend(
                    self.parse_aggregate_data(table, symbol_table))
            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                log.msg('NIST table; Enthalpy/entropy of phase transition',
                        level=log.DEBUG)
                requests.extend(self.parse_transition_data(table, summary))
            elif table.xpath('tr[1]/td'):
                log.msg('NIST table: Horizontal table', level=log.DEBUG)
            elif summary == 'Antoine Equation Parameters':
                log.msg('NIST table: Antoine Equation Parameters',
                        level=log.DEBUG)
                requests.extend(self.parse_antoine_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 5:
                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Method Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            elif len(table.xpath('tr[1]/th')) == 4:
                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Reference Comment
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
        """
        This function parses: synonyms, chemical formula, molecular weight,
        InChI, InChIKey, CAS number
        :param sel: A Selector object of the entire page in the original
        response
        :return: a list of Result items
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        for synonym in raw_synonyms[0].strip().split(';\n'):
            log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
            self.ignore_list.update(synonym)
            self._spider.get_synonym_requests(synonym)

        data = {}

        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()

        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()

        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()

        requests = []
        for key, value in data.iteritems():
            result = self.newresult(
                attribute=key,
                value=value
            )
            requests.append(result)

        return requests

    def parse_aggregate_data(self, table, symbol_table):
        """
        This function parses the table(s) which contain possible links to
        individual data points
        :param table: a Selector object of the table to be parsed
        :param symbol_table: a dictionary containing translations of raw HTML
        tags to human readable names
        :return: a list of Result items and Request objects
        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []
            for td in tr.xpath('td'):
                data.append(''.join(td.xpath('node()').extract()))

            name = symbol_table[data[0]]
            condition = ''

            m = re.match(r'(.*) at (.*)', name)
            if m:
                name = m.group(1)
                condition = m.group(2)

            result = self.newresult(
                attribute=name,
                value=data[1] + ' ' + data[2],
                conditions=condition
            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

    def parse_transition_data(self, table, summary):
        """
        This function parses the table containing properties regarding phase
        changes
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            )
            results.append(result)

        return results

    def parse_generic_data(self, table, summary):
        """
        Parses the common tables of 4 and 5 columns. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K' % tds[1]
            )
            results.append(result)
        return results

    def parse_antoine_data(self, table, summary):
        """
        This function parses the table containing parameters for the Antoine
        equation
        :param table: a Selector object of the table to be parsed
        :param summary: the name of the property
        :return: a list of Result items
        """
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = self.newresult(
                attribute=summary,
                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                conditions='%s K' % tds[0]
            )
            results.append(result)

        return results

    def parse_individual_datapoints(self, response):
        """
        This function parses the 'individual data points' page linked from
        the aggregate data table(s)
        :param response: the Scrapy Response object to be parsed
        :return: a list of Result items
        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]

        results = []

        name = table.xpath('@summary').extract()[0]
        condition = ''
        m = re.match(r'(.*) at (.*)', name)
        if m:
            name = m.group(1)
            condition = m.group(2)

        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            uncertainty = ''
            m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
            if m:
                uncertainty = '+- %s ' % m.group(1)
            # [TODO]: get the plusminus sign working in here
            result = self.newresult(
                attribute=name,
                value='%s %s%s' % (tds[0], uncertainty, unit),
                conditions=condition
            )
            results.append(result)

        return results

    @staticmethod
    def get_unit(table):
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        unit = '!'
        if m:
            unit = m.group(1)

        return unit

    def newresult(self, attribute, value, conditions=''):
        """
        This function abstracts from the Result item and provides default
        values
        :param attribute: the name of the attribute
        :param value: the value of the attribute
        :param conditions: optional conditions regarding the value
        :return: A Result item
        """
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'NIST',
                'reliability': self.cfg['reliability'],
                'conditions': conditions
            })

    def new_compound_request(self, compound):
        """
        This function is called when a new synonym is returned to the spider
        to generate new requests
        :param compound: the name of the compound to search for
        """
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                           callback=self.parse)
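For reference, the A, B and C values that parse_antoine_data() stores in a single Result value belong to the Antoine vapour-pressure equation, which the NIST WebBook lists (with P in bar and T in K) roughly as:

```latex
% Antoine equation relating vapour pressure P to temperature T via the scraped A, B, C parameters
\log_{10} P = A - \frac{B}{T + C}
```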
+149
FourmiCrawler/sources/PubChem.py
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class PubChem(Source):
    """ PubChem scraper for chemical properties

    This parser parses the part on PubChem pages that gives the Chemical and Physical properties of a substance,
    including the sources of the values of the properties.
    """

    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs are used
    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
    website_www = 'http://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

    __spider = None
    searched_compounds = set()

    def __init__(self, config):
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """
        Parses the PubChem compound page: emits synonym requests and schedules scraping of the properties page.
        :param response: The incoming Response with the compound page
        :return: Returns the new requests if the compound is new, or None if it is already known
        """
        requests = []
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)

        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
        if compound in self.searched_compounds:
            return None

        self.searched_compounds.update(compound)
        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
        for synonym in raw_synonyms.strip().split(', '):
            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
            self.searched_compounds.update(synonym)
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
            # the separate html page which contains the properties and their values

            # using this cid to get the right url and scrape it
            requests.append(
                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
        return requests

    def parse_data(self, response):
        """
        Parse data found in the 'Chemical and Physical properties' part of a substance page.
        :param response: The response with the page to parse
        :return: requests: Returns a list of properties with their values, source, etc.
        """
        log.msg('parsing data', level=log.DEBUG)
        requests = []

        sel = Selector(response)
        props = sel.xpath('//div')

        for prop in props:
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of the property that is being parsed
            if prop.xpath('a'):  # parsing for a single value in the property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
                    'attribute': prop_name,
                    'value': prop_value,
                    'source': prop_source,
                    'reliability': self.cfg['reliability'],
                    'conditions': ''
                })
                log.msg('PubChem prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
            elif prop.xpath('ul'):  # parsing for multiple values (list) in the property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
                    new_prop = Result({
                        'attribute': prop_name,
                        'value': prop_value,
                        'source': prop_source,
                        'reliability': self.cfg['reliability'],
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
                            (new_prop['attribute'], new_prop['value'],
                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)

        return requests

    def parse_searchrequest(self, response):
        """
        This function parses the response to the new_compound_request Request
        :param response: the Response object to be parsed
        :return: A Request for the compound page, or what self.parse returns in
        case the search request was forwarded to the compound page
        """

        # check if pubchem forwarded straight to the compound page
        m = re.match(self.website_pubchem, response.url)
        if m:
            log.msg('PubChem search forwarded to compound page',
                    level=log.DEBUG)
            return self.parse(response)

        sel = Selector(response)

        results = sel.xpath('//div[@class="rsltcont"]')
        if results:
            url = results[0].xpath('div/p/a[1]/@href')
        else:
            log.msg('PubChem search found nothing or xpath failed',
                    level=log.DEBUG)
            return None

        if url:
            url = 'http:' + ''.join(url[0].extract())
            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
        else:
            log.msg('PubChem search found results, but no url in first result',
                    level=log.DEBUG)
            return None

        return Request(url=url, callback=self.parse)

    def new_compound_request(self, compound):
        return Request(url=self.website_www[:-1] + self.search % compound,
                       callback=self.parse_searchrequest)
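A short sketch of how parse() derives the properties-page request from the compound id (cid) embedded in the page URL; the URL below is only an example of the pattern the regex expects:

```python
# Illustrative only: the example URL is not taken from a real crawl.
import re

website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297'
n = re.search(r'cid=(\d+)', url)
if n:
    properties_url = website_pubchem[:-2].replace("\\", "") + data_url % n.group(1)
    # -> 'http://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297'
```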
+169
FourmiCrawler/sources/WikipediaParser.py
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties

    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
    It also returns requests for other external sources which contain information on the parsed subject.
    """

    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None
    searched_compounds = []

    def __init__(self, config=None):
        Source.__init__(self, config)

    def parse(self, response):
        """
        Parses the Wikipedia page and scrapes its infobox if the compound has not been seen before.
        :param response: The incoming Response with the Wikipedia page
        :return: Returns the found properties if the compound is new, or None if it is already known
        """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use the main page
        if compound in self.searched_compounds:
            return None
        else:
            items = self.parse_infobox(sel)
            self.searched_compounds.append(compound)
            return items

    def parse_infobox(self, sel):
        """
        Scrape data from the infobox on Wikipedia.

        Data from two types of infoboxes, class="infobox bordered" and class="infobox", is scraped.
        :param sel: The selector with the html-information of the page to parse
        :return: item_list: Returns a list of properties with their values, source, etc.
        """

        items = []

        # scrape the chembox (wikipedia template)
        items = self.parse_chembox(sel, items)

        # scrape the drugbox (wikipedia template)
        items = self.parse_drugbox(sel, items)

        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
        item_list = self.clean_items(items)

        identifiers = self.get_identifiers(sel)

        # add extra sources to scrape from as requests
        for i, identifier in enumerate(identifiers):
            request = None
            # discard internal wikipedia links
            if re.match('//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
            # fix links starting with '//www.'
            elif re.match('/{2}', identifier):
                identifier = re.sub("/{2}", "http://", identifier)
                request = Request(identifier)
            else:
                request = Request(identifier)
                log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(request)

        return item_list

    def parse_chembox(self, sel, items):
        """
        Scrape data from the chembox infobox on Wikipedia.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list in which the results have to be stored
        :return: items: the list of items with the newly found items added
        """
        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
            item = self.newresult(
                attribute=prop_name.extract().encode('utf-8'),
                value=prop_values[i].extract().encode('utf-8')
            )
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
        return items

    def parse_drugbox(self, sel, items):
        """
        Scrape data from the drugbox infobox on Wikipedia.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list in which the results have to be stored
        :return: items: the list of items with the newly found items added
        """
        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('this: %s' % tr_list2, level=log.DEBUG)
        for tablerow in tr_list2:
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                    'normalize-space(string())'):
                item = self.newresult(
                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                )
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def new_compound_request(self, compound):
        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """
        Clean up properties using regular expressions; makes it possible to split the values from the units.

        Almost not in use; only cleans J/K/mol values and boiling/melting points.

        :param items: List of properties with their values, source, etc.
        :return: items: List of the now cleaned up items
        """
        for item in items:
            value = item['value']
            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
            if m:
                item['value'] = m.group(1) + " K"
            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """
        Find external links, named 'Identifiers', to different sources.

        :param sel: The selector with the html-information of the page to parse
        :return: links: New links which can be used to expand the crawler's search
        """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links

    def newresult(self, attribute, value):
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'Wikipedia',
                'reliability': self.cfg['reliability'],
                'conditions': ''
            })
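A quick check of what the two clean_items() regexes do, using made-up values in the formats Wikipedia infoboxes typically use:

```python
# Made-up example values; only demonstrates the two regexes in clean_items().
import re

value = '-182.5 C (-296.5 F; 90.7 K)'           # typical melting point cell
m = re.search('F;\s(\d+[\.,]?\d*)', value)
if m:
    value = m.group(1) + " K"                    # -> '90.7 K'

value = '186.3 J K-1 mol-1'                      # typical molar entropy cell
m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
if m:
    value = m.group(1) + " J/K/mol"              # -> '186.3 J/K/mol'
```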
FourmiCrawler/sources/__init__.py
This is a binary file and will not be displayed.
+41
FourmiCrawler/sources/source.py
from scrapy import log
# from scrapy.http import Request


class Source:
    website = "http://something/.*"  # Regex of URI's the source is able to parse
    _spider = None

    def __init__(self, config=None):
        """
        Initiation of a new Source
        """
        self.cfg = {}
        if config is not None:
            self.cfg = config
        pass

    def parse(self, response):
        """
        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
        :param response: A Scrapy Response object
        :return: A list of Result items and new Scrapy Requests
        """
        log.msg("The parse function of the empty source was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
        """
        This function should return a Scrapy Request for the given compound request.
        :param compound: A compound name.
        :return: A new Scrapy Request
        """
        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
        """
        A Function to save the associated spider.
        :param spider: A FourmiSpider object
        """
        self._spider = spider
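Source defines the contract every scraper plugin follows: a `website` regex that the spider matches response URLs against, a `parse()` that turns a Response into Result items and Requests, and a `new_compound_request()` that starts a search for a compound. A hypothetical minimal subclass, purely to illustrate that contract:

```python
# ExampleSource is hypothetical and not part of the project; it only shows the plugin contract.
from scrapy.http import Request

from source import Source
from FourmiCrawler.items import Result


class ExampleSource(Source):
    website = "http://example\\.org/.*"  # URLs this source claims to be able to parse

    def parse(self, response):
        # Return Result items (and/or new Requests) scraped from the response.
        return [Result({'attribute': 'Example attribute',
                        'value': 'Example value',
                        'source': 'Example',
                        'reliability': self.cfg.get('reliability', ''),
                        'conditions': ''})]

    def new_compound_request(self, compound):
        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
```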
+72-10
FourmiCrawler/spider.py
+import re
+
 from scrapy.spider import Spider
+from scrapy import log


 class FourmiSpider(Spider):
-    name = "FourmiSpider"
+    """
+    A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
+    name = "FourmiSpider"

-    def __init__(self, compound=None, *args, **kwargs):
-        super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms = [compound]
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
+        self._sources = []
+        self.synonyms = set()
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.synonyms.add(compound)
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes

+    def parse(self, response):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        source which should be able to handle parsing the data.
+        :param response: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Requests to be handled by the scrapy core.
+        """
+        for source in self._sources:
+            if re.match(source.website, response.url):
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
+        return None

-    def parse(self, reponse):
-        # [TODO] - This function should delegate it's functionality to other
-        # parsers.
-        pass
+    def get_synonym_requests(self, compound, force=False):
+        """
+        A function that generates new Scrapy Requests for each source given a new synonym of a compound.
+        :param compound: A compound name
+        :param force: if True, generate requests even if the synonym is already known
+        :return: A list of Scrapy Request objects
+        """
+        requests = []
+        if force or compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
+        return requests
+
+    def start_requests(self):
+        """
+        The function called by Scrapy for its first Requests
+        :return: A list of Scrapy Requests generated from the known synonyms using the available sources.
+        """
+        requests = []
+        for synonym in self.synonyms:
+            requests.extend(self.get_synonym_requests(synonym, force=True))
+        return requests

+    def add_sources(self, sources):
+        """
+        A function to add new Source objects to the list of available sources.
+        :param sources: A list of Source objects.
+        """
+        for parser in sources:
+            self.add_source(parser)

-    def add_parser(self, parser):
-        self.parsers.add(parser)
+    def add_source(self, source):
+        """
+        A function to add a new Source object to the list of available sources.
+        :param source: A Source object
+        """
+        self._sources.append(source)
+        source.set_spider(self)
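For orientation, a rough sketch of how the spider and the sources could be wired together through Scrapy, mirroring what the removed Fourmi.py did; this is not the project's actual entry point:

```python
# Rough wiring sketch (the real CLI/entry point is not part of this changeset).
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings

from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.WikipediaParser import WikipediaParser

spider = FourmiSpider(compound="Methane", selected_attributes=[".*"])
spider.add_sources([WikipediaParser({'reliability': 'Medium'})])

crawler = Crawler(get_project_settings())
crawler.configure()
crawler.crawl(spider)  # start_requests() emits one request per source per known synonym
```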
+30
GUI/configImporter.py
import ConfigParser


class ConfigImporter():
    def __init__(self, filename):
        """Read the filename into the parser."""
        self.filename = filename
        self.parser = ConfigParser.ConfigParser()
        self.parser.read(self.filename)

    def load_common_attributes(self):
        """Loads common attributes from the initialized file."""
        try:
            return self.parser.get('GUI', 'CommonParameters')
        except:
            return 'One, Two, Three'

    def load_output_types(self):
        """Loads output types from the initialized file."""
        try:
            return self.parser.get('GUI', 'OutputTypes')
        except:
            return 'csv'

    def load_always_attributes(self):
        """Loads attributes that are always searched for from the initialized file."""
        try:
            return self.parser.get('GUI', 'AlwaysParameters')
        except:
            return 'Name, Weight'
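A short usage example of ConfigImporter, assuming a GUI.cfg with the [GUI] section shown in GUI.cfg.sample further below; each loader falls back to a hard-coded default when the option cannot be read:

```python
# Assumes GUI.cfg resembles GUI.cfg.sample shown later in this changeset.
from configImporter import ConfigImporter

config = ConfigImporter('GUI.cfg')
print config.load_common_attributes()  # e.g. 'Weight, Polarity, Viscosity, Solubility, Name'
print config.load_output_types()       # e.g. 'csv, json, jsonlines, xml'
print config.load_always_attributes()  # e.g. 'Name'; falls back to 'Name, Weight' on errors
```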
+196
GUI/gui.py
from Tkinter import *
import os
import shutil
import sys  # needed for sys.exit() when no configuration can be created
from tkFileDialog import asksaveasfilename

from configImporter import *


class GUI():
    def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
        """Boots the window and the configuration."""
        if not in_source:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            config_file = current_dir + '../' + config_file
        if not os.path.isfile(config_file):
            try:
                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
            except IOError:
                print "GUI configuration couldn't be found and couldn't be created."
                sys.exit()
        self.configurator = ConfigImporter(config_file)
        self.sourceloader = sourceloader
        self.finish_with_search = False
        self.values = {}
        self.required_variables = ['substance']
        self.search = search
        self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())

    def load_common_attributes(self):
        """Calls the configuration parser for common attributes."""
        return [x.strip() for x in self.configurator.load_common_attributes().split(',')]

    def load_output_types(self):
        """Calls the configuration parser for output types."""
        return [x.strip() for x in self.configurator.load_output_types().split(',')]

    def load_always_attributes(self):
        """Calls the configuration parser for attributes that are always used."""
        return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])

    def set_output(self):
        self.variable_output_name.set(asksaveasfilename())
        self.button_output_name.config(text=self.variable_output_name.get())

    def generate_window(self, common_attributes, output_types):
        """Creates all widgets and variables in the window."""
        window = Tk()
        window.wm_title("Fourmi Crawler")

        variables = {}

        variable_substance = StringVar(window)
        frame_substance = Frame(window)
        label_substance = Label(frame_substance, text="Substance: ")
        input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
        variables.update({"substance": variable_substance})
        frame_substance.pack(side=TOP)
        label_substance.pack()
        input_substance.pack()
        input_substance.focus()

        frame_all_attributes = Frame(window)
        frame_selecting_attributes = Frame(frame_all_attributes)
        frame_new_attributes = Frame(frame_selecting_attributes)
        label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
        input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
        variables.update({"new_attributes": input_new_attributes})
        frame_new_attributes.pack(side=LEFT)
        label_new_attributes.pack()
        input_new_attributes.pack()

        frame_common_attributes = Frame(frame_selecting_attributes)
        label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
        input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
        scrollbar_common_attributes = Scrollbar(frame_common_attributes)
        input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
        scrollbar_common_attributes.config(command=input_common_attributes.yview)
        if common_attributes and len(common_attributes) > 0:
            input_common_attributes.insert(END, *common_attributes)
        variables.update({"common_attributes": input_common_attributes})
        frame_common_attributes.pack(side=RIGHT)
        label_common_attributes.pack(side=TOP)
        input_common_attributes.pack(side=LEFT)
        scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
        frame_selecting_attributes.pack()

        frame_last = Frame(window)
        search_button = Button(frame_last, text="Start search", command=self.prepare_search)
        cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
        frame_last.pack(side=BOTTOM)
        search_button.pack(side=LEFT)
        cancel_button.pack(side=RIGHT)

        frame_name = Frame(window)
        frame_output_name = Frame(frame_name)
        label_output_name = Label(frame_output_name, text='Output file:')
        self.variable_output_name = StringVar()
        self.variable_output_name.set('results.csv')
        variables.update({'output_name': self.variable_output_name})
        self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
        frame_output_name.pack(side=LEFT)
        label_output_name.pack()
        self.button_output_name.pack()
        frame_name.pack(side=BOTTOM)

        frame_checkboxes = Frame(window)
        frame_checkbox_attributes = Frame(frame_checkboxes)
        variable_all_attributes = BooleanVar()
        variable_all_attributes.set(True)
        input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
                                           variable=variable_all_attributes)
        variables.update({"all_attributes": variable_all_attributes})
        frame_checkbox_attributes.pack(side=LEFT)
        input_all_attributes.pack()

        frame_logging = Frame(frame_checkboxes)
        variable_logging = BooleanVar()
        variable_logging.set(False)
        input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
        variables.update({'logging': variable_logging})
        frame_logging.pack(side=RIGHT)
        frame_checkboxes.pack(side=BOTTOM)
        input_logging.pack()
        frame_all_attributes.pack()

        return window, variables

    def prepare_search(self):
        """Saves the values from the window for later retrieval."""
        variables = self.variables
        values = {}

        values.update({"Always attributes": self.load_always_attributes()})
        for name, var in variables.iteritems():
            if var.__class__ is StringVar:
                values.update({name: var.get()})
            elif var.__class__ is BooleanVar:
                values.update({name: var.get()})
            elif var.__class__ is Text:
                values.update({name: str(var.get("1.0", END)).strip()})
            elif var.__class__ is Listbox:
                values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
            else:
                print "No known class, {}, {}".format(name, var)

        values.update({'output_name': self.variable_output_name.get()})
        values.update({'output_type': self.check_output_type(values.get('output_name'))})

        self.values = values
        if all([values.get(i) != '' for i in self.required_variables]):
            self.finish_with_search = True
            self.window.destroy()
        else:
            self.finish_with_search = False
            # tkMessageBox.showinfo('Not all required information was entered!')

    def execute_search(self):
        """Calls the Fourmi crawler with the values from the GUI."""
        if self.values.get('all_attributes'):
            attributes = ".*"
        else:
            attribute_types = ['attributes', 'Common attributes', 'Always attributes']
            attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
        output_file = "file://" + str(self.values.get('output_name'))  # Dealing with absolute paths

        arguments = {'--attributes': attributes,
                     '--exclude': None,
                     '--format': self.values.get('output_type'),
                     '--help': False,
                     '--include': None,
                     '--log': 'log.txt',
                     '--output': output_file,
                     '-v': 0 if self.values.get('logging') else 3,
                     '--version': False,
                     '<compound>': self.values.get('substance'),
                     'list': False,
                     'search': True}

        self.search(arguments, self.sourceloader)

    def run(self):
        """Starts the window and the search."""
        self.window.mainloop()
        if self.finish_with_search:
            self.execute_search()

    def check_output_type(self, filename):
        parts = str(filename).split('.')
        output_types = self.load_output_types()
        extension = parts[-1]

        for type in output_types:
            if extension == type:
                return extension
        return output_types[0]
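A hypothetical launch sketch showing how the GUI class is intended to be driven; the real search callback and source loader live in the CLI entry point, which is not part of this changeset:

```python
# Hypothetical: the module path and the search stub are assumptions, not project code.
from GUI.gui import GUI


def search(arguments, sourceloader):
    print "Would start a crawl for %s" % arguments['<compound>']

gui = GUI(search, config_file='GUI.cfg', sourceloader=None, in_source=True)
gui.run()  # blocks in the Tk main loop; calls search() if "Start search" was clicked
```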
+10
GUI.cfg.sample
···11+[GUI]
22+# Personalize options in your User Interface
33+44+# Commonly used parameters are listed in the GUI for easy selection
55+CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
66+77+# Parameters that are always used in the search
88+AlwaysParameters = Name
99+1010+OutputTypes = csv, json, jsonlines, xml
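A minimal sketch of reading this sample with Python's standard ConfigParser module, assuming the file has been copied to GUI.cfg; the project's own loader lives in GUI/configImporter.py and may differ in detail:

```python
# Minimal sketch (assumption): reading GUI.cfg with the standard ConfigParser module.
import ConfigParser

config = ConfigParser.ConfigParser()
config.read('GUI.cfg')  # assumes the sample above has been copied to GUI.cfg
common = [p.strip() for p in config.get('GUI', 'CommonParameters').split(',')]
always = [p.strip() for p in config.get('GUI', 'AlwaysParameters').split(',')]
output_types = [t.strip() for t in config.get('GUI', 'OutputTypes').split(',')]
print common        # ['Weight', 'Polarity', 'Viscosity', 'Solubility', 'Name']
print always        # ['Name']
print output_types  # ['csv', 'json', 'jsonlines', 'xml']
```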
+21
LICENSE
···11+The MIT License (MIT)
22+33+Copyright (c) 2014 Ivo B. Rietveld
44+55+Permission is hereby granted, free of charge, to any person obtaining a copy
66+of this software and associated documentation files (the "Software"), to deal
77+in the Software without restriction, including without limitation the rights
88+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
99+copies of the Software, and to permit persons to whom the Software is
1010+furnished to do so, subject to the following conditions:
1111+1212+The above copyright notice and this permission notice shall be included in all
1313+copies or substantial portions of the Software.
1414+1515+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1616+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1717+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121+SOFTWARE.
+80
README.md
···11+# Fourmi
22+33+**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
44+55+**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
77+Fourmi is a web scraper for chemical substances. The program is designed to be
88+used as a search engine to search multiple chemical databases for a specific
99+substance. The program will produce all available attributes of the substance
1010+and conditions associated with the attributes. Fourmi also attempts to estimate
1111+the reliability of each data point to assist the user in deciding which data
1212+should be used.
1414+The Fourmi project is an open source project licensed under the MIT license. Feel
1515+free to contribute!
1616+1717+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
1818+web scraping framework for Python. Most of the functionality of this project can
1919+be traced to this framework. Should the documentation for this application fall
2020+short, we suggest you take a close look at the
2121+[Scrapy architecture](http://doc.scrapy.org/en/latest/topics/architecture.html) and the
2222+[Scrapy documentation](http://doc.scrapy.org/en/latest/index.html).
2323+2424+### Installing
2525+2626+If you're installing Fourmi, please take a look at our installation guides
2727+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
2828+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
2929+3030+### Using the Source
3232+To use the Fourmi source code, multiple dependencies are required. Take a look at
3333+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) for a step-by-step guide
3434+on using the application source code.
3535+3636+When developing for the Fourmi project, keep in mind that code readability is a
3737+must. To maintain readability, code should conform to the
3838+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
3939+code. More information about the different structures and principles of the
4040+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
4141+4242+### To Do
4444+The Fourmi project has the following goals for the near future:
4545+4646+__Main goals:__
4747+4848+- Build a graphical user interface (GUI) as an alternative to the command line
4949+interface (CLI). (Assignee: Harmen)
5050+- Compile the source into a Windows executable. (Assignee: Bas)
5151+5252+__Side goals:__
5353+5454+- Clean and unify data.
5555+- Extensive reliability analysis using statistical tests.
5656+- Test data with Descartes 1.
5757+5858+### Project Origin
5959+6060+The Fourmi project was started in February of 2014 as part of a software
6161+engineering course at the Radboud University for students studying Computer
6262+Science, Information Science or Artificial Intelligence. Students participate in
6363+a real software development project as part of the
6464+[Giphouse](http://www.giphouse.nl/).
6565+6666+This particular project was started on behalf of Ivo B. Rietveld. As a chemist
6767+he was in need of an application to automatically search for information on chemical
6868+substances and create a phase diagram. The so-called "Descartes" project was
6969+split into two teams, each creating a different application that covers part of the
7070+functionality. We are team Descartes 2, and as we were responsible for
7171+creating a web crawler, we've named our application Fourmi (English: ants).
7272+7373+The following people were part of the original team:
7474+7575+- [Jip J. Dekker](http://jip.dekker.li)
7676+- Rob ten Berge
7777+- Harmen Prins
7878+- Bas van Berkel
7979+- Nout van Deijck
8080+- Michail Kuznetcov
-16
README.rst
···11-We are the team Descartes 2.
22-----------------------------
33-44-Our team members are:
55-66-+ Rob ten Berge
77-88-+ Bas van Berkel
99-1010-+ Nout van Deijck
1111-1212-+ Jip J. Dekker
1313-1414-+ Michail Kuznetcov
1515-1616-+ Harmen Prins
+108
SIGNED.md
···11+##### Signed by https://keybase.io/jdekker
22+```
33+-----BEGIN PGP SIGNATURE-----
44+Version: GnuPG v1.4.11 (GNU/Linux)
55+66+iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa
77+flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt
88+ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+
99++mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U
1010+IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2
1111+Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb
1212+RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq
1313+WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ
1414+vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH
1515+V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK
1616+TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD
1717+jUY6EwEuOzDCexWP4eM8
1818+=h6TK
1919+-----END PGP SIGNATURE-----
2020+2121+```
2222+2323+<!-- END SIGNATURES -->
2424+2525+### Begin signed statement
2626+2727+#### Expect
2828+2929+```
3030+size exec file contents
3131+ ./
3232+412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614
3333+548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546
3434+846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed
3535+ FourmiCrawler/
3636+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3737+304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
3838+2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
3939+677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94
4040+ sources/
4141+12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce
4242+12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee
4343+6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481
4444+6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870
4545+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
4646+1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc
4747+3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3
4848+ GUI/
4949+11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3
5050+940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271
5151+8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5
5252+299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483
5353+1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
5454+3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac
5555+3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a
5656+261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
5757+416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70
5858+ tests/
5959+107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670
6060+2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d
6161+1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8
6262+1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
6363+1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
6464+2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
6565+ utils/
6666+40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a
6767+4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5
6868+2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
6969+```
7070+7171+#### Ignore
7272+7373+```
7474+/SIGNED.md
7575+```
7676+7777+#### Presets
7878+7979+```
8080+git # ignore .git and anything as described by .gitignore files
8181+dropbox # ignore .dropbox-cache and other Dropbox-related files
8282+kb # ignore anything as described by .kbignore files
8383+```
8484+8585+<!-- summarize version = 0.0.9 -->
8686+8787+### End signed statement
8888+8989+<hr>
9090+9191+#### Notes
9292+9393+With keybase you can sign any directory's contents, whether it's a git repo,
9494+source code distribution, or a personal documents folder. It aims to replace the drudgery of:
9595+9696+ 1. comparing a zipped file to a detached statement
9797+ 2. downloading a public key
9898+ 3. confirming it is in fact the author's by reviewing public statements they've made, using it
9999+100100+All in one simple command:
101101+102102+```bash
103103+keybase dir verify
104104+```
105105+106106+There are lots of options, including assertions for automating your checks.
107107+108108+For more info, check out https://keybase.io/docs/command_line/code_signing
+89
fourmi.py
···11+#!/usr/bin/env python
22+"""
33+Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
44+55+Usage:
66+ fourmi
77+ fourmi search <compound>
88+ fourmi [options] search <compound>
99+ fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
1010+ fourmi list
1111+ fourmi [--include=<sourcename> | --exclude=<sourcename>] list
1212+ fourmi -h | --help
1313+ fourmi --version
1414+1515+Options:
1616+ --attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
1717+ -h --help Show this screen.
1818+ --version Show version.
1919+ -v Verbose logging output. (Multiple occurrences increase logging level)
2020+ --log=<file> Save the log to a file.
2121+ -o <file> --output=<file> Output file [default: <compound>.*format*]
2222+ -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
2323+ --include=<regex> Include only sources that match these regular expressions split by a comma.
2424+ --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
2525+"""
2626+2727+from twisted.internet import reactor
2828+from scrapy.crawler import Crawler
2929+from scrapy import signals, log
3030+import docopt
3131+3232+from FourmiCrawler.spider import FourmiSpider
3333+from utils.configurator import Configurator
3434+from utils.sourceloader import SourceLoader
3535+from GUI import gui
3636+3737+3838+def setup_crawler(compound, settings, source_loader, attributes):
3939+ """
3939+ This function prepares and starts the crawler, which performs the actual search on the internet.
4040+ :param compound: The compound which should be searched for
4242+ :param settings: A scrapy settings object
4343+ :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
4444+ :param attributes: A list of regular expressions which the attribute names should match.
4545+ """
4646+ spider = FourmiSpider(compound=compound, selected_attributes=attributes)
4747+ spider.add_sources(source_loader.sources)
4848+ crawler = Crawler(settings)
4949+ crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
5050+ crawler.configure()
5151+ crawler.crawl(spider)
5252+ crawler.start()
5353+5454+5555+def search(docopt_arguments, source_loader):
5656+ """
5757+ The function that facilitates the search for a specific compound.
5858+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
5959+ :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
6060+ """
6161+ conf = Configurator()
6262+ conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
6363+ conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
6464+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
6565+ source_loader, docopt_arguments["--attributes"].split(','))
6666+ if conf.scrapy_settings.getbool("LOG_ENABLED"):
6767+ log.start(conf.scrapy_settings.get("LOG_FILE"),
6868+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
6969+ reactor.run()
7070+7171+7272+# The start for the Fourmi Command Line interface.
7373+if __name__ == '__main__':
7474+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
7575+ loader = SourceLoader()
7676+7777+ if arguments["--include"]:
7878+ loader.include(arguments["--include"].split(','))
7979+ elif arguments["--exclude"]:
8080+ loader.exclude(arguments["--exclude"].split(','))
8181+8282+ if arguments["search"]:
8383+ search(arguments, loader)
8484+ elif arguments["list"]:
8585+ print "-== Available Sources ==-"
8686+ print str(loader)
8787+ else:
8888+ gui_window = gui.GUI(search, sourceloader=SourceLoader())
8989+ gui_window.run()
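Besides the command line, search() can also be driven programmatically with a docopt-style dictionary; this is exactly what the GUI does in execute_search. A minimal sketch, assuming it is run from the project root (the argument values below are only examples):

```python
# Minimal sketch (assumption): run from the project root so that SourceLoader can
# resolve its relative source directory and sources.cfg.
from utils.sourceloader import SourceLoader
from fourmi import search

arguments = {'--attributes': '.*',       # attribute regular expressions, comma separated
             '--exclude': None,
             '--format': 'csv',
             '--help': False,
             '--include': None,
             '--log': 'log.txt',
             '--output': 'methane.csv',  # any name other than the default pattern
             '-v': 0,                    # 0 disables logging, 1-3 increase verbosity
             '--version': False,
             '<compound>': 'Methane',
             'list': False,
             'search': True}

search(arguments, SourceLoader())  # blocks until the crawl has finished
```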
+19
sources.cfg.sample
···11+[DEFAULT]
22+reliability = Unknown
33+44+#For each source listed in FourmiCrawler/sources there should be a section
55+#named exactly as the filename in here. If not present, the DEFAULT value is
66+#used for reliability of that source.
77+88+[ChemSpider]
99+reliability = High
1010+#token=Paste ChemSpider API token here and remove the hashtag
1111+1212+[NIST]
1313+reliability = High
1414+1515+[WikipediaParser]
1616+reliability = Medium
1717+1818+[PubChem]
1919+reliability = High
···11+import ConfigParser
22+import os
33+import shutil
44+55+from scrapy.utils.project import get_project_settings
66+77+88+class Configurator:
99+ """
1010+ A helper class in the fourmi class. This class is used to process the settings as set
1111+ from one of the Fourmi applications.
1212+ """
1313+1414+ def __init__(self):
1515+ self.scrapy_settings = get_project_settings()
1616+1717+ def set_output(self, filename, fileformat, compound):
1818+ """
1919+ This function manipulates the Scrapy output file settings that normally would be set in the settings file.
2020+ In the Fourmi project these are command line arguments.
2121+ :param filename: The filename of the file where the output will be put.
2222+ :param fileformat: The format in which the output will be.
2323+ """
2424+2525+ if filename != '<compound>.*format*':
2626+ self.scrapy_settings.overrides["FEED_URI"] = filename
2727+ elif fileformat == "jsonlines":
2828+ self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
2929+ elif fileformat is not None:
3030+ self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
3131+3232+ if fileformat is not None:
3333+ self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
3434+3535+ def set_logging(self, logfile=None, verbose=0):
3636+ """
3737+ This function changes the default settings of Scrapy's logging functionality
3838+ using the settings given by the CLI.
3939+ :param logfile: The location where the logfile will be saved.
4040+ :param verbose: An integer value used to switch between log levels.
4141+ """
4242+ if verbose != 0:
4343+ self.scrapy_settings.overrides["LOG_ENABLED"] = True
4444+ else:
4545+ self.scrapy_settings.overrides["LOG_ENABLED"] = False
4646+4747+ if verbose == 1:
4848+ self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
4949+ elif verbose == 2:
5050+ self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
5151+ else:
5252+ self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
5353+5454+ if verbose > 1:
5555+ self.scrapy_settings.overrides["LOG_STDOUT"] = False
5656+ else:
5757+ self.scrapy_settings.overrides["LOG_STDOUT"] = True
5858+5959+ if logfile is not None:
6060+ self.scrapy_settings.overrides["LOG_FILE"] = logfile
6161+ else:
6262+ self.scrapy_settings.overrides["LOG_FILE"] = None
6363+6464+ @staticmethod
6565+ def read_sourceconfiguration():
6666+ """
6767+ This function reads sources.cfg in the main folder for configuration
6868+ variables for sources
6969+ :return: a ConfigParser object of sources.cfg
7070+ """
7171+ current_dir = os.path.dirname(os.path.abspath(__file__))
7272+ config_path = current_dir + '/../sources.cfg'
7373+ # [TODO]: location of sources.cfg should be softcoded eventually
7474+ if not os.path.isfile(config_path):
7575+ try:
7676+ shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
7777+ except IOError:
7878+ print "WARNING: Source configuration couldn't be found and couldn't be created."
7979+ config = ConfigParser.ConfigParser()
8080+ config.read(config_path)
8181+ return config
8282+8383+ @staticmethod
8484+ def get_section(config, sourcename):
8585+ """
8686+ This function reads the config section named by sourcename and checks
8787+ whether the reliability variable is set, using an empty string if it is missing.
8888+ The DEFAULT section is returned if the named config section does not exist.
8989+ :param config: a ConfigParser object
9090+ :param sourcename: the name of the section to be read
9191+ :return: a dictionary of the config section named by sourcename
9292+ """
9393+ section = dict()
9494+ if config.has_section(sourcename):
9595+ section = dict(config.items(sourcename))
9696+ elif config.defaults():
9797+ section = config.defaults()
9898+ if 'reliability' not in section:
9999+ print 'WARNING: Reliability not set for %s' % sourcename
100100+ section['reliability'] = ''
101101+ return section
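A minimal usage sketch of the two static helpers above, assuming a sources.cfg built from the sample shown earlier ('SomeNewSource' is a hypothetical section name used only for illustration):

```python
# Minimal sketch: resolving per-source settings with a fallback to [DEFAULT].
from utils.configurator import Configurator

config = Configurator.read_sourceconfiguration()   # reads sources.cfg, copying the sample if it is missing
chemspider = Configurator.get_section(config, 'ChemSpider')
print chemspider['reliability']                     # 'High', taken from the [ChemSpider] section

unknown = Configurator.get_section(config, 'SomeNewSource')
print unknown['reliability']                        # 'Unknown', taken from [DEFAULT]
```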
+64
utils/sourceloader.py
···11+import inspect
22+import os
33+import re
44+55+from FourmiCrawler.sources.source import Source
66+from utils.configurator import Configurator
77+88+99+class SourceLoader:
1010+ sources = []
1111+1212+ def __init__(self, rel_dir="../FourmiCrawler/sources"):
1313+ """
1414+ The initiation of a SourceLoader selects and indexes a directory for usable sources.
1515+ It also loads a configuration file for the sources and passes the arguments in
1616+ the matching section to each source.
1717+ :param rel_dir: A relative path to a directory.
1818+ """
1919+ path = os.path.dirname(os.path.abspath(__file__))
2020+ path += "/" + rel_dir
2121+ known_parser = set()
2222+2323+ config = Configurator.read_sourceconfiguration()
2424+2525+ for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
2626+ mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
2727+ classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
2828+ for cls in classes:
2929+ if issubclass(cls, Source) and cls not in known_parser:
3030+ sourcecfg = Configurator.get_section(config, cls.__name__)
3131+ self.sources.append(cls(sourcecfg))
3232+ known_parser.add(cls)
3333+3434+ def include(self, source_names):
3535+ """
3636+ This function excludes all sources that don't match the given regular expressions.
3737+ :param source_names: A list of regular expression (strings)
3838+ """
3939+ new = set()
4040+ for name in source_names:
4141+ new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
4242+ self.sources = list(new)
4343+4444+ def exclude(self, source_names):
4545+ """
4646+ This function excludes all sources that match the given regular expressions.
4747+ :param source_names: A list of regular expression (strings)
4848+ """
4949+ exclude = []
5050+ for name in source_names:
5151+ exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
5252+ self.sources = [src for src in self.sources if src not in exclude]
5353+5454+ def __str__(self):
5555+ """
5656+ This function returns a string with all sources currently available in the SourceLoader.
5757+ :return: a string with all available sources.
5858+ """
5959+ string = ""
6060+ for src in self.sources:
6161+ string += "Source: " + src.__class__.__name__
6262+ string += " - "
6363+ string += "URI: " + src.website + "\n"
6464+ return string
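A minimal usage sketch of the loader, mirroring how fourmi.py wires up the --include and --exclude options (the regular expression here is only an example):

```python
# Minimal sketch: restrict the loaded sources by regular expression and list the rest.
from utils.sourceloader import SourceLoader

loader = SourceLoader()          # indexes FourmiCrawler/sources and reads sources.cfg
loader.exclude(['Wikipedia.*'])  # drop WikipediaParser, keep the other sources
print str(loader)                # one "Source: <name> - URI: <url>" line per remaining source
```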