···
#Python Specific ignores
*.pyc

+#may contain authentication information
+sources.cfg
+#Another of our config files
+GUI.cfg
+
#THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows
Thumbs.db
+23
.travis.yml
···
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+before_install:
+ - "export DISPLAY=:99.0"
+ - "sh -e /etc/init.d/xvfb start"
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+ - pip install Scrapy docopt
+ - pip install coveralls
+
+# command to run tests, e.g. python setup.py test
+script:
+ - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
+
+notifications:
+ slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+
+after_success:
+ coveralls --verbose
+20
Changelog.md
···
+### v0.6.0
+- Feature: Added a Graphical User Interface
+- Feature: Automatic config file creation from config samples
+- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
+- FIX: A lot of bugfixes for the PubChem plugin, as it wasn't working as it should
+- FIX: Using absolute paths for configuration files
+- DEV: General code cleanup in documentation
+
+### v0.5.3
+- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
+- FIX: Logging is now "actually" disabled if not using the verbose option.
+- FEATURE: Added support for PubChem
+
+### v0.5.2
+- FIX: Signature used to contain untracked and older files, current signature
+  should be correct.
+
+### v0.5.1
+- UPDATED: Logging functionality from command line
+- DEV: Code cleanup and extra tests
+1-3
FourmiCrawler/items.py
···
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field
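
Only the header of items.py changes here; the Result item that every source imports is defined further down in this file. A minimal sketch of that definition, inferred from the five fields used throughout the sources below (the actual body of items.py is not shown in this diff):

```python
from scrapy.item import Item, Field


class Result(Item):
    # Field names inferred from the Result({...}) constructors used by the sources.
    attribute = Field()
    value = Field()
    source = Field()
    reliability = Field()
    conditions = Field()
```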
+13-12
FourmiCrawler/pipelines.py
···
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
+
from scrapy.exceptions import DropItem


-class RemoveNonePipeline(object):

+class RemoveNonePipeline(object):
    def __init__(self):
-        self.known_values = set()
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
        """
        Processing the items so None values are replaced by empty strings
        :param item: The incoming item
···
                item[key] = ""
        return item

-class DuplicatePipeline(object):

+class DuplicatePipeline(object):
    def __init__(self):
        self.known_values = set()

···
        """
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
        else:
            self.known_values.add(value)
        return item

-class AttributeSelectionPipeline(object):

+class AttributeSelectionPipeline(object):
    def __init__(self):
-        pass;
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider,
        items that don't match the selected items are dropped.
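
The removed header comment pointed at the ITEM_PIPELINES setting: these pipelines only run if they are registered there. A hypothetical registration in FourmiCrawler/settings.py (those lines are not part of this diff, and the priority numbers are illustrative; lower numbers run first):

```python
# Sketch only: register the three pipelines defined above with Scrapy.
ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.RemoveNonePipeline': 100,
    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
}
```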
+2-3
FourmiCrawler/settings.py
···
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'FourmiCrawler'
···
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'

-
# Crawl responsibly by identifying yourself (and your website) on the
# user-agent

-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Fourmi'
+131-63
FourmiCrawler/sources/ChemSpider.py
···
-from source import Source
+import re
+
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
+
+from source import Source
from FourmiCrawler.items import Result
-import re
+

# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

-
class ChemSpider(Source):
-    """ChemSpider scraper for synonyms and properties
-
+    """
+    ChemSpider scraper for synonyms and properties
    This parser will manage searching for chemicals through the
    ChemsSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

-    def __init__(self):
-        Source.__init__(self)
+    website = 'http://www\\.chemspider\\.com/.*'

-    website = 'http://www.chemspider.com/*'
-
-    # [TODO] - Save and access token of specific user.
-    search = ('Search.asmx/SimpleSearch?query=%s&token='
-              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

-    ignore_list = []
+    def __init__(self, config=None):
+        """
+        Initialization of ChemSpider scraper
+        :param config: a dictionary of settings for this scraper, must contain
+        'reliability' key
+        """
+        Source.__init__(self, config)
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']

    def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
···

        return requests

-    @staticmethod
-    def parse_properties(sel):
-        """scrape Experimental Data and Predicted ACD/Labs tabs"""
+    def parse_properties(self, sel):
+        """
+        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
        properties = []

-        # Predicted - ACD/Labs tab
+        properties.extend(self.parse_acdlabstab(sel))
+        properties.extend(self.parse_experimentaldatatab(sel))
+
+        return properties
+
+    def parse_acdlabstab(self, sel):
+        """
+        This function scrapes the 'Predicted ACD/Labs tab' under Properties
+        :param sel: a Selector object of the whole page
+        :return: a list of Request objects
+        """
+        properties = []
+
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
···
            # Test for properties without values, with one hardcoded exception
            if (not re.match(r'^\d', prop_value) or
-                    (prop_name == 'Polarizability' and
-                     prop_value == '10-24cm3')):
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                continue

-            # Match for condition in parentheses
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

-            # Match for condition in value seperated by an 'at'
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

-            new_prop = Result({
-                'attribute': prop_name,
-                'value': prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
-                'conditions': prop_conditions
-            })
+            new_prop = self.newresult(
+                attribute=prop_name,
+                value=prop_value,
+                source='ChemSpider Predicted - ACD/Labs Tab',
+                conditions=prop_conditions
+            )
            properties.append(new_prop)
-            log.msg('CS prop: |%s| |%s| |%s|' %
-                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
-                    level=log.DEBUG)

-        # Experimental Data Tab, Physico-chemical properties in particular
+        return properties
+
+    def parse_experimentaldatatab(self, sel):
+        """
+        This function scrapes Experimental Data tab, Physico-chemical
+        properties in particular.
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
+        properties = []
+
        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
···
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
-                properties.append(new_prop)
-                log.msg('CS prop: |%s| |%s| |%s|' %
-                        (new_prop['attribute'], new_prop['value'],
-                         new_prop['source']), level=log.DEBUG)
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
+                properties.append(new_prop)

        return properties

    def parse_synonyms(self, sel):
-        """Scrape list of Names and Identifiers"""
+        """
+        This function scrapes the list of Names and Identifiers
+        :param sel: a Selector object of the whole page
+        :return: a list of Requests
+        """
        requests = []
        synonyms = []

···
        return requests

    def new_synonym(self, sel, name, category):
-        """Scrape for a single synonym at a given HTML tag"""
+        """
+        This function scrapes for a single synonym at a given HTML tag
+        :param sel: a Selector object of the given HTML tag
+        :param name: the name of the synonym in the tag
+        :param category: the name of the category the synonym is labeled as
+        :return: a dictionary containing data on the synonym
+        """
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
···
        }
        return synonym

-    @staticmethod
-    def parse_extendedinfo(response):
-        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
+    def parse_extendedinfo(self, response):
+        """
+        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
+        API, if a token is present in the configuration settings
+        :param response: a Response object to be parsed
+        :return: a list of Result items
+        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
            if result['value']:
                properties.append(result)
        return properties

+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        """
+        This function abstracts from the Result item and provides default
+        values.
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :param source: the name of the source if it is not ChemSpider
+        :return: A Result item
+        """
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
    def parse_searchrequest(self, response):
-        """Parse the initial response of the ChemSpider Search API """
+        """
+        This function parses the initial response of the ChemSpider Search API
+        Requires a valid token to function.
+        :param response: the Response object to be parsed
+        :return: A Request for the information page and a Request for the
+        extendedinfo API call
+        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
···
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
···
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
+        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)
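
The constructor above now takes its API token and reliability through a config dictionary instead of the removed hard-coded token. A minimal sketch (the values are placeholders, and how the dictionary is produced, e.g. from sources.cfg, is outside this diff) of constructing the source directly with the keys the code relies on:

```python
# Sketch only: the two keys ChemSpider.__init__ reads from its config dict.
from FourmiCrawler.sources.ChemSpider import ChemSpider

config = {
    'reliability': 'High',                 # copied into every Result by newresult()
    'token': 'your-chemspider-api-token',  # placeholder; an empty token disables search/MassSpec requests
}
source = ChemSpider(config)
```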
+150-89
FourmiCrawler/sources/NIST.py
···
-from source import Source
+import re
+
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
+
+from source import Source
from FourmiCrawler.items import Result
-import re
+

# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
-# Result item, but should be included eventually.
+# Result item, but should be included eventually.

class NIST(Source):
-    """NIST Scraper plugin
-
+    """
+    NIST Scraper plugin
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\\.nist\\.gov/.*"

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    ignore_list = set()
-
-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config=None):
+        """
+        Initialization of NIST scraper
+        :param config: configuration variables for this scraper, must contain
+        'reliability' key.
+        """
+        Source.__init__(self, config)
+        self.ignore_list = set()

    def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: The Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
···
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

+        requests.extend(self.parse_tables(sel, symbol_table))
+
+        return requests
+
+    def parse_tables(self, sel, symbol_table):
+        """
+        This function identifies and distributes parsing of tables to other
+        functions below.
+        :param sel: A Selector object of the whole page
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Requests
+        """
+        requests = []
+
        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
···
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-                continue  #Assume unsupported
+                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
-        """Parses: synonyms, chemical formula, molecular weight, InChI,
-        InChiKey, CAS number
+        """
+        This function parses: synonyms, chemical formula, molecular weight,
+        InChI, InChiKey, CAS number
+        :param sel: A Selector object of the entire page in the original
+        response
+        :return: a list of Result items
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')

        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        for synonym in raw_synonyms[0].strip().split(';\n'):
···
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
-                               '/tt/text()')
+                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
···

        requests = []
        for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
            requests.append(result)

        return requests

    def parse_aggregate_data(self, table, symbol_table):
-        """Parses the table(s) which contain possible links to individual
-        data points
+        """
+        This function parses the table(s) which contain possible links to
+        individual data points
+        :param table: a Selector object of the table to be parsed
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Request objects
        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
-                '/a/@href').extract()
+                                      '/a/@href').extract()
            if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
-                                  callback=self.parse_individual_datapoints)
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
+                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []
···
                name = m.group(1)
                condition = m.group(2)

-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

-    @staticmethod
-    def parse_transition_data(table, summary):
-        """Parses the table containing properties regarding phase changes"""
+    def parse_transition_data(self, table, summary):
+        """
+        This function parses the table containing properties regarding phase
+        changes
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
        results = []

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
            results.append(result)

-
        return results

-    @staticmethod
-    def parse_generic_data(table, summary):
-        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+    def parse_generic_data(self, table, summary):
+        """
+        Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
        """
        results = []

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
            results.append(result)
        return results

-    @staticmethod
-    def parse_antoine_data(table, summary):
-        """Parse table containing parameters for the Antione equation"""
+    def parse_antoine_data(self, table, summary):
+        """
+        This function parses the table containing parameters for the Antione
+        equation
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
            results.append(result)

        return results

    def parse_individual_datapoints(self, response):
-        """Parses the page linked from aggregate data"""
+        """
+        This function parses the 'individual data points' page linked from
+        the aggregate data table(s)
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items
+        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]

···
            name = m.group(1)
            condition = m.group(2)

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
···
            if m:
                uncertainty = '+- %s ' % m.group(1)
            # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
            results.append(result)

        return results

+    @staticmethod
+    def get_unit(table):
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        return unit
+
+    def newresult(self, attribute, value, conditions=''):
+        """
+        This function abstracts from the Result item and provides default
+        values
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :return: A Result item
+        """
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
+            })
+
    def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                           callback=self.parse)
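
For context on parse_antoine_data: the NIST WebBook lists Antoine parameters A, B and C for the vapour-pressure relation log10(P/bar) = A - B / (T + C), with T in kelvin. A small stand-alone helper (not part of the crawler, shown only to make the scraped 'A=..., B=..., C=...' values concrete):

```python
def antoine_vapour_pressure(a, b, c, temperature_k):
    """Vapour pressure in bar from Antoine parameters, NIST convention."""
    return 10 ** (a - b / (temperature_k + c))
```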
+149
FourmiCrawler/sources/PubChem.py
···
+import re
+
+from scrapy.http import Request
+from scrapy import log
+from scrapy.selector import Selector
+
+from source import Source
+from FourmiCrawler.items import Result
+
+
+class PubChem(Source):
+    """ PubChem scraper for chemical properties
+
+    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
+    including sources of the values of properties.
+    """
+
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
+    search = 'pccompound?term=%s'
+    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
+
+    __spider = None
+    searched_compounds = set()
+
+    def __init__(self, config):
+        Source.__init__(self, config)
+        self.cfg = config
+
+    def parse(self, response):
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return Returns the found properties if response is unique or returns none if it's already known
+        """
+        requests = []
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+
+        sel = Selector(response)
+        compound = sel.xpath('//h1/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+
+        self.searched_compounds.update(compound)
+        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
+        for synonym in raw_synonyms.strip().split(', '):
+            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+            self.searched_compounds.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
+        n = re.search(r'cid=(\d+)', response.url)
+        if n:
+            cid = n.group(1)
+        log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
+        # the seperate html page which contains the properties and their values
+
+        # using this cid to get the right url and scrape it
+        requests.append(
+            Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
+        return requests
+
+    def parse_data(self, response):
+        """
+        Parse data found in 'Chemical and Physical properties' part of a substance page.
+        :param response: The response with the page to parse
+        :return: requests: Returns a list of properties with their values, source, etc.
+        """
+        log.msg('parsing data', level=log.DEBUG)
+        requests = []
+
+        sel = Selector(response)
+        props = sel.xpath('//div')
+
+        for prop in props:
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
+            if prop.xpath('a'):  # parsing for single value in property
+                prop_source = ''.join(prop.xpath('a/@title').extract())
+                prop_value = ''.join(prop.xpath('a/text()').extract())
+                new_prop = Result({
+                    'attribute': prop_name,
+                    'value': prop_value,
+                    'source': prop_source,
+                    'reliability': self.cfg['reliability'],
+                    'conditions': ''
+                })
+                log.msg('PubChem prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
+                requests.append(new_prop)
+            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
+                prop_values = prop.xpath('ul//li')
+                for prop_li in prop_values:
+                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
+                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
+                    new_prop = Result({
+                        'attribute': prop_name,
+                        'value': prop_value,
+                        'source': prop_source,
+                        'reliability': self.cfg['reliability'],
+                        'conditions': ''
+                    })
+                    log.msg('PubChem prop: |%s| |%s| |%s|' %
+                            (new_prop['attribute'], new_prop['value'],
+                             new_prop['source']), level=log.DEBUG)
+                    requests.append(new_prop)
+
+        return requests
+
+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page or what self.parse returns in
+        case the search request forwarded to the compound page
+        """
+
+        # check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+
+        sel = Selector(response)
+
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+
+        return Request(url=url, callback=self.parse)
+
+    def new_compound_request(self, compound):
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)
+97-45
FourmiCrawler/sources/WikipediaParser.py
···
+import re
+
from scrapy.http import Request
from scrapy import log
+from scrapy.selector import Selector
+
from source import Source
-from scrapy.selector import Selector
from FourmiCrawler.items import Result
-import re


class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties

    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
-    It also returns requests with other external sources which contain information on parsed subject.
+    It also returns requests with other external sources which contain information on parsed subject.
    """

-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None
    searched_compounds = []

-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config=None):
+        Source.__init__(self, config)

    def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return: Returns the found properties if response is unique or returns none if it's already known
+        """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
···
        return items

    def parse_infobox(self, sel):
-        """ scrape data from infobox on wikipedia. """
+        """
+        Scrape data from infobox on wikipedia.
+
+        Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and
+        :param sel: The selector with the html-information of the page to parse
+        :return: item_list: Returns a list of properties with their values, source, etc..
+        """
+
        items = []

-        #be sure to get chembox (wikipedia template)
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
-            xpath('normalize-space(string())')
-        prop_names = tr_list[::2]
-        prop_values = tr_list[1::2]
-        for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
-            items.append(item)
-            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        # scrape the chembox (wikipedia template)
+        items = self.parse_chembox(sel, items)

-        #scrape the drugbox (wikipedia template)
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
-        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
-        for tablerow in tr_list2:
-            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
-            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
-                    'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
-                items.append(item)
-                log.msg(
-                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
-                    level=log.DEBUG)
+        # scrape the drugbox (wikipedia template)
+        items = self.parse_drugbox(sel, items)

        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
        item_list = self.clean_items(items)
···

        return item_list

+    def parse_chembox(self, sel, items):
+        """
+        Scrape data from chembox infobox on wikipedia.
+
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list of items where the result have to be stored in
+        :return: items: the list of items with the new found and stored items
+        """
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
+            xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
+            items.append(item)
+            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        return items
+
+    def parse_drugbox(self, sel, items):
+        """
+        Scrape data from drugbox infobox on wikipedia.
+
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list of items where the result have to be stored in
+        :return: items: the list of items with the new found and stored items
+        """
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
+        return items
+
    def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
-        """ clean up properties using regex, makes it possible to split the values from the units """
+
+        """
+        Clean up properties using regex, makes it possible to split the values from the units
+
+        Almost not in use, only cleans J/K/mol values and boiling/melting points.
+
+        :param items: List of properties with their values, source, etc..
+        :return: items: List of now cleaned up items
+        """
        for item in items:
            value = item['value']
            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
···

    @staticmethod
    def get_identifiers(sel):
-        """ find external links, named 'Identifiers' to different sources. """
+        """
+        Find external links, named 'Identifiers' to different sources.
+
+        :param sel: The selector with the html-information of the page to parse
+        :return: links: New links which can be used to expand the crawlers search
+        """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        return links
+        return links
+
+    def newresult(self, attribute, value):
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'Wikipedia',
+                'reliability': self.cfg['reliability'],
+                'conditions': ''
+            })
+25-5
FourmiCrawler/sources/source.py
···


class Source:
-    website = "http://something/*"  # Regex of URI's the source is able to parse
+    website = "http://something/.*"  # Regex of URI's the source is able to parse
    _spider = None

-    def __init__(self):
+    def __init__(self, config=None):
+        """
+        Initiation of a new Source
+        """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
        pass

-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+    def parse(self, response):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param response: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        """
+        This function should return a Scrapy Request for the given compound request.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
+        """
+        A Function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
        self._spider = spider
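
The Source base class above defines the plugin interface the crawler relies on: a `website` regex plus `parse`, `new_compound_request` and `set_spider`. A minimal sketch of a new source written against that interface; the class name and URL are made up and only illustrate the pattern used by ChemSpider, NIST, PubChem and WikipediaParser above:

```python
# Hypothetical source plugin, mirroring the interface of source.py.
from scrapy import log
from scrapy.http import Request

from source import Source


class ExampleSource(Source):
    website = 'http://example\\.org/.*'  # regex of URIs this source can parse

    def __init__(self, config=None):
        Source.__init__(self, config)

    def parse(self, response):
        log.msg('ExampleSource parsing %s' % response.url, level=log.DEBUG)
        return []  # would return Result items and/or new Requests

    def new_compound_request(self, compound):
        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
```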
+61-23
FourmiCrawler/spider.py
···
+import re
+
from scrapy.spider import Spider
from scrapy import log
-import re


class FourmiSpider(Spider):
+    """
+    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
    name = "FourmiSpider"
-    __parsers = []
-    synonyms = []

-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
+        self._sources = []
+        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.synonyms.add(compound)
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes

-    def parse(self, reponse):
-        for parser in self.__parsers:
-            if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(reponse)
+    def parse(self, response):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        source which should be able to handle parsing the data.
+        :param response: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Request to be handled by the scrapy core.
+        """
+        for source in self._sources:
+            if re.match(source.website, response.url):
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
        return None

-    def get_synonym_requests(self, compound):
+    def get_synonym_requests(self, compound, force=False):
+        """
+        A function that generates new Scrapy Request for each source given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
        requests = []
-        for parser in self.__parsers:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if force or compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
        return requests

    def start_requests(self):
+        """
+        The function called by Scrapy for it's first Requests
+        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
+        """
        requests = []
        for synonym in self.synonyms:
-            requests.extend(self.get_synonym_requests(synonym))
+            requests.extend(self.get_synonym_requests(synonym, force=True))
        return requests

-    def add_parsers(self, parsers):
-        for parser in parsers:
-            self.add_parser(parser)
+    def add_sources(self, sources):
+        """
+        A function to add a new Parser objects to the list of available sources.
+        :param sources: A list of Source Objects.
+        """
+        for parser in sources:
+            self.add_source(parser)

-    def add_parser(self, parser):
-        self.__parsers.append(parser)
-        parser.set_spider(self)
+    def add_source(self, source):
+        """
+        A function add a new Parser object to the list of available parsers.
+        :param source: A Source Object
+        """
+        self._sources.append(source)
+        source.set_spider(self)
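
Taken together with the renamed add_source/add_sources helpers, a rough sketch of how a spider gets wired up; building sources by hand like this is only for illustration, since the source loader normally does it:

```python
# Sketch only: wiring a FourmiSpider to one source and generating its first requests.
from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.NIST import NIST

spider = FourmiSpider(compound='Methane', selected_attributes=['.*'])
spider.add_sources([NIST(config={'reliability': 'High'})])  # attaches the source and calls set_spider
requests = spider.start_requests()  # one Request per source for each known synonym
```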
···
+import ConfigParser
+
+
+class ConfigImporter():
+    def __init__(self, filename):
+        """Read the filename into the parser."""
+        self.filename = filename
+        self.parser = ConfigParser.ConfigParser()
+        self.parser.read(self.filename)
+
+    def load_common_attributes(self):
+        """Loads common attributes from the initialized file."""
+        try:
+            return self.parser.get('GUI', 'CommonParameters')
+        except:
+            return 'One, Two, Three'
+
+    def load_output_types(self):
+        """Loads output types from the initialized file."""
+        try:
+            return self.parser.get('GUI', 'OutputTypes')
+        except:
+            return 'csv'
+
+    def load_always_attributes(self):
+        """Loads attributes that are always searched for from the initialized file."""
+        try:
+            return self.parser.get('GUI', 'AlwaysParameters')
+        except:
+            return 'Name, Weight'
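
A short usage sketch of the ConfigImporter added above (its file path is not shown in this diff; GUI/gui.py imports it with `from configImporter import *`). Every loader returns a comma-separated string, with a hard-coded fallback when GUI.cfg lacks the expected [GUI] options:

```python
from configImporter import ConfigImporter

configurator = ConfigImporter('GUI.cfg')
common_attributes = [x.strip() for x in configurator.load_common_attributes().split(',')]
output_types = [x.strip() for x in configurator.load_output_types().split(',')]
always_attributes = ','.join([x.strip() for x in configurator.load_always_attributes().split(',')])
```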
+196
GUI/gui.py
···
+from Tkinter import *
+import os
+import shutil
+from tkFileDialog import asksaveasfilename
+
+from configImporter import *
+
+
+class GUI():
+    def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
+        """Boots the window, configuration."""
+        if not in_source:
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            config_file = current_dir + '../' + config_file
+        if not os.path.isfile(config_file):
+            try:
+                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
+            except IOError:
+                print "GUI configuration couldn't be found and couldn't be created."
+                sys.exit()
+        self.configurator = ConfigImporter(config_file)
+        self.sourceloader = sourceloader
+        self.finish_with_search = False
+        self.values = {}
+        self.required_variables = ['substance']
+        self.search = search
+        self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())
+
+    def load_common_attributes(self):
+        """Calls the configuration parser for common attributes."""
+        return [x.strip() for x in self.configurator.load_common_attributes().split(',')]
+
+    def load_output_types(self):
+        """Calls the configuration parser for output types."""
+        return [x.strip() for x in self.configurator.load_output_types().split(',')]
+
+    def load_always_attributes(self):
+        """Calls the configuration parser for attributes that are always used."""
+        return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])
+
+    def set_output(self):
+        self.variable_output_name.set(asksaveasfilename())
+        self.button_output_name.config(text=self.variable_output_name.get())
+
+    def generate_window(self, common_attributes, output_types):
+        """Creates all widgets and variables in the window."""
+        window = Tk()
+        window.wm_title("Fourmi Crawler")
+
+        variables = {}
+
+        variable_substance = StringVar(window)
+        frame_substance = Frame(window)
+        label_substance = Label(frame_substance, text="Substance: ")
+        input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
+        variables.update({"substance": variable_substance})
+        frame_substance.pack(side=TOP)
+        label_substance.pack()
+        input_substance.pack()
+        input_substance.focus()
+
+        frame_all_attributes = Frame(window)
+        frame_selecting_attributes = Frame(frame_all_attributes)
+        frame_new_attributes = Frame(frame_selecting_attributes)
+        label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
+        input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
+        variables.update({"new_attributes": input_new_attributes})
+        frame_new_attributes.pack(side=LEFT)
+        label_new_attributes.pack()
+        input_new_attributes.pack()
+
+        frame_common_attributes = Frame(frame_selecting_attributes)
+        label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
+        input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
+        scrollbar_common_attributes = Scrollbar(frame_common_attributes)
+        input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
+        scrollbar_common_attributes.config(command=input_common_attributes.yview)
+        if common_attributes and len(common_attributes) > 0:
+            input_common_attributes.insert(END, *common_attributes)
+        variables.update({"common_attributes": input_common_attributes})
+        frame_common_attributes.pack(side=RIGHT)
+        label_common_attributes.pack(side=TOP)
+        input_common_attributes.pack(side=LEFT)
+        scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
+        frame_selecting_attributes.pack()
+
+        frame_last = Frame(window)
+        search_button = Button(frame_last, text="Start search", command=self.prepare_search)
+        cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
+        frame_last.pack(side=BOTTOM)
+        search_button.pack(side=LEFT)
+        cancel_button.pack(side=RIGHT)
+
+        frame_name = Frame(window)
+        frame_output_name = Frame(frame_name)
+        label_output_name = Label(frame_output_name, text='Output file:')
+        self.variable_output_name = StringVar()
+        self.variable_output_name.set('results.csv')
+        variables.update({'output_name': self.variable_output_name})
+        self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
+        frame_output_name.pack(side=LEFT)
+        label_output_name.pack()
+        self.button_output_name.pack()
+        frame_name.pack(side=BOTTOM)
+
+
+        frame_checkboxes = Frame(window)
+        frame_checkbox_attributes = Frame(frame_checkboxes)
+        variable_all_attributes = BooleanVar()
+        variable_all_attributes.set(True)
+        input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
+                                           variable=variable_all_attributes)
+        variables.update({"all_attributes": variable_all_attributes})
+        frame_checkbox_attributes.pack(side=LEFT)
+        input_all_attributes.pack()
+
+        frame_logging = Frame(frame_checkboxes)
+        variable_logging = BooleanVar()
+        variable_logging.set(False)
+        input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
+        variables.update({'logging': variable_logging})
+        frame_logging.pack(side=RIGHT)
+        frame_checkboxes.pack(side=BOTTOM)
+        input_logging.pack()
+        frame_all_attributes.pack()
+
+        return window, variables
+
+    def prepare_search(self):
+        """Saves the values from the window for later retrieval."""
+        variables = self.variables
+        values = {}
+
+        values.update({"Always attributes": self.load_always_attributes()})
+        for name, var in variables.iteritems():
+            if var.__class__ is StringVar:
+                values.update({name: var.get()})
+            elif var.__class__ is BooleanVar:
+                values.update({name: var.get()})
+            elif var.__class__ is Text:
+                values.update({name: str(var.get("1.0", END)).strip()})
+            elif var.__class__ is Listbox:
+                values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
+            else:
+                print "No known class, {}, {}".format(name, var)
+
+        values.update({'output_name': self.variable_output_name.get()})
+        values.update({'output_type': self.check_output_type(values.get('output_name'))})
+
+        self.values = values
+        if all([values.get(i) != '' for i in self.required_variables]):
+            self.finish_with_search = True
+            self.window.destroy()
+        else:
+            self.finish_with_search = False
+            #tkMessageBox.showinfo('Not all required information was entered!')
+
+    def execute_search(self):
+        """Calls the Fourmi crawler with the values from the GUI"""
+        if self.values.get('all_attributes'):
+            attributes = ".*"
+        else:
+            attribute_types = ['attributes', 'Common attributes', 'Always attributes']
+            attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
+        output_file = "file://" + str(self.values.get('output_name'))  # Dealing with absolute paths
+
+        arguments = {'--attributes': attributes,
+                     '--exclude': None,
+                     '--format': self.values.get('output_type'),
+                     '--help': False,
+                     '--include': None,
+                     '--log': 'log.txt',
+                     '--output': output_file,
+                     '-v': 0 if self.values.get('logging') else 3,
+                     '--version': False,
+                     '<compound>': self.values.get('substance'),
+                     'list': False,
+                     'search': True}
+
+        self.search(arguments, self.sourceloader)
+
+    def run(self):
+        """Starts the window and the search."""
+        self.window.mainloop()
+        if self.finish_with_search:
+            self.execute_search()
+
+    def check_output_type(self, filename):
+        parts = str(filename).split('.')
+        output_types = self.load_output_types()
+        extension = parts[-1]
+
+        for type in output_types:
+            if extension == type:
+                return extension
+        return output_types[0]
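
run() starts the Tkinter main loop and, once the form is completed, hands a docopt-style arguments dictionary to the search callable passed into the constructor. A hypothetical way to launch it; the dummy_search stand-in replaces the real CLI entry point, which is not part of this diff, and assumes GUI/ is importable as a package:

```python
from GUI.gui import GUI


def dummy_search(arguments, sourceloader):
    # stand-in for Fourmi's real search entry point (not shown in this diff)
    print arguments


gui = GUI(search=dummy_search, sourceloader=None)
gui.run()
```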
+10
GUI.cfg.sample
···
+[GUI]
+# Personalize options in your User Interface
+
+# Commonly used parameters are listed in the GUI for easy selection
+CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
+
+# Parameters that are always used in the search
+AlwaysParameters = Name
+
+OutputTypes = csv, json, jsonlines, xml
+9-10
README.md
···11# Fourmi
2233+**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
44+55+**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
66+37Fourmi is an web scraper for chemical substances. The program is designed to be
48used as a search engine to search multiple chemical databases for a specific
59substance. The program will produce all available attributes of the substance
···19232024### Installing
21252222-If you're installing Fourmi, please take a look at our [installation guide](...)
2323-on our wiki. When you've installed the application, make sure to check our
2424-[usage guide](...).
2626+If you're installing Fourmi, please take a look at our installation guides
2727+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
2828+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
25292630### Using the Source
27312832To use the Fourmi source code multiple dependencies are required. Take a look at
2929-the [wiki page](...) on using the application source code for a step by step
3333+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code for a step by step
3034installation guide.
31353236When developing for the Fourmi project keep in mind that code readability is a
3337must. To maintain the readability, code should be conform with the
3438[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
3539code. More information about the different structures and principles of the
3636-Fourmi application can be found on our [wiki](...).
4040+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
37413842### To Do
3943···41454246__Main goals:__
43474444-- Improve our documentation and guides. (Assignee: Dekker)
4548- Build a graphical user interface (GUI) as an alternative to the command line
4649interface (CLI). (Assignee: Harmen)
4750- Compile the source into a Windows executable. (Assignee: Bas)
4848-- Create an configuration file to hold logins and API keys.
4949-- Determine reliability of our data point.
5050-- Create an module to gather data from NIST. (Assignee: Rob)
5151-- Create an module to gather data from PubChem. (Assignee: Nout)
52515352__Side goals:__
5453
+108
SIGNED.md
···11+##### Signed by https://keybase.io/jdekker
22+```
33+-----BEGIN PGP SIGNATURE-----
44+Version: GnuPG v1.4.11 (GNU/Linux)
55+66+iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa
77+flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt
88+ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+
99++mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U
1010+IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2
1111+Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb
1212+RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq
1313+WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ
1414+vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH
1515+V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK
1616+TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD
1717+jUY6EwEuOzDCexWP4eM8
1818+=h6TK
1919+-----END PGP SIGNATURE-----
2020+2121+```
2222+2323+<!-- END SIGNATURES -->
2424+2525+### Begin signed statement
2626+2727+#### Expect
2828+2929+```
3030+size exec file contents
3131+ ./
3232+412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614
3333+548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546
3434+846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed
3535+ FourmiCrawler/
3636+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3737+304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
3838+2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
3939+677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94
4040+ sources/
4141+12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce
4242+12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee
4343+6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481
4444+6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870
4545+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
4646+1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc
4747+3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3
4848+ GUI/
4949+11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3
5050+940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271
5151+8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5
5252+299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483
5353+1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
5454+3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac
5555+3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a
5656+261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
5757+416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70
5858+ tests/
5959+107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670
6060+2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d
6161+1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8
6262+1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
6363+1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
6464+2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
6565+ utils/
6666+40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a
6767+4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5
6868+2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
6969+```
7070+7171+#### Ignore
7272+7373+```
7474+/SIGNED.md
7575+```
7676+7777+#### Presets
7878+7979+```
8080+git # ignore .git and anything as described by .gitignore files
8181+dropbox # ignore .dropbox-cache and other Dropbox-related files
8282+kb # ignore anything as described by .kbignore files
8383+```
8484+8585+<!-- summarize version = 0.0.9 -->
8686+8787+### End signed statement
8888+8989+<hr>
9090+9191+#### Notes
9292+9393+With keybase you can sign any directory's contents, whether it's a git repo,
9494+source code distribution, or a personal documents folder. It aims to replace the drudgery of:
9595+9696+ 1. comparing a zipped file to a detached statement
9797+ 2. downloading a public key
9898+ 3. confirming it is in fact the author's by reviewing public statements they've made, using it
9999+100100+All in one simple command:
101101+102102+```bash
103103+keybase dir verify
104104+```
105105+106106+There are lots of options, including assertions for automating your checks.
107107+108108+For more info, check out https://keybase.io/docs/command_line/code_signing
+38-44
fourmi.py
···11#!/usr/bin/env python
22"""
33-Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
33+Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).
4455Usage:
66+ fourmi
67 fourmi search <compound>
78 fourmi [options] search <compound>
88- fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
99+ fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
910 fourmi list
1011 fourmi [--include=<sourcename> | --exclude=<sourcename>] list
1112 fourmi -h | --help
1516 --attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
1617 -h --help Show this screen.
1718 --version Show version.
1818- --verbose Verbose logging output.
1919+ -v Verbose logging output. (Multiple occurrences increase logging level)
1920 --log=<file> Save log to an file.
2020- -o <file> --output=<file> Output file [default: result.*format*]
2121- -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
2121+ -o <file> --output=<file> Output file [default: <compound>.*format*]
2222+ -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
2223 --include=<regex> Include only sources that match these regular expressions split by a comma.
2324 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
2425"""
25262627from twisted.internet import reactor
2728from scrapy.crawler import Crawler
2828-from scrapy import log, signals
2929-from scrapy.utils.project import get_project_settings
2929+from scrapy import signals, log
3030import docopt
31313232from FourmiCrawler.spider import FourmiSpider
3333-from sourceloader import SourceLoader
3333+from utils.configurator import Configurator
3434+from utils.sourceloader import SourceLoader
3535+from GUI import gui
343635373636-def setup_crawler(searchable, settings, source_loader, attributes):
3737- spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
3838- spider.add_parsers(source_loader.sources)
3838+def setup_crawler(compound, settings, source_loader, attributes):
3939+ """
3939+ This function prepares and starts the crawler, which performs the actual search on the internet.
4141+ :param compound: The compound which should be searched
4242+ :param settings: A scrapy settings object
4343+ :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
4444+ :param attributes: A list of regular expressions which the attribute names should match.
4545+ """
4646+ spider = FourmiSpider(compound=compound, selected_attributes=attributes)
4747+ spider.add_sources(source_loader.sources)
3948 crawler = Crawler(settings)
4049 crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
4150 crawler.configure()
···4352 crawler.start()
445345544646-def scrapy_settings_manipulation(docopt_arguments):
4747- settings = get_project_settings()
4848- # [todo] - add at least a warning for files that already exist
4949- if docopt_arguments["--output"] != 'result.*format*':
5050- settings.overrides["FEED_URI"] = docopt_arguments["--output"]
5151- elif docopt_arguments["--format"] == "jsonlines":
5252- settings.overrides["FEED_URI"] = "results.json"
5353- elif docopt_arguments["--format"] is not None:
5454- settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
5555-5656- if docopt_arguments["--format"] is not None:
5757- settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
5858-5959- return settings
6060-6161-6262-def start_log(docopt_arguments):
6363- if docopt_arguments["--log"] is not None:
6464- if docopt_arguments["--verbose"]:
6565- log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
6666- else:
6767- log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
6868- else:
6969- if docopt_arguments["--verbose"]:
7070- log.start(logstdout=False, loglevel=log.DEBUG)
7171- else:
7272- log.start(logstdout=True, loglevel=log.WARNING)
7373-7474-7555def search(docopt_arguments, source_loader):
7676- start_log(docopt_arguments)
7777- settings = scrapy_settings_manipulation(docopt_arguments)
7878- setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
5656+ """
5757+ The function that facilitates the search for a specific compound.
5858+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
5959+ :param source_loader: An initialized SourceLoader object pointed at the directory with the sources.
6060+ """
6161+ conf = Configurator()
6262+ conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
6363+ conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
6464+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
6565+ source_loader, docopt_arguments["--attributes"].split(','))
6666+ if conf.scrapy_settings.getbool("LOG_ENABLED"):
6767+ log.start(conf.scrapy_settings.get("LOG_FILE"),
6868+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
7969 reactor.run()
807081717272+# The entry point for the Fourmi command line interface.
8273if __name__ == '__main__':
8383- arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
7474+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
8475 loader = SourceLoader()
85768677 if arguments["--include"]:
···9384 elif arguments["list"]:
9485 print "-== Available Sources ==-"
9586 print str(loader)
8787+ else:
8888+ gui_window = gui.GUI(search, sourceloader=SourceLoader())
8989+ gui_window.run()
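To make the command line flow above concrete, here is a small sketch of how docopt turns an invocation into the arguments dictionary that `search()` consumes (the compound name and flags are illustrative; running the actual search requires Scrapy and the Fourmi sources to be installed):

```python
# Sketch: parse a hypothetical CLI invocation with docopt, as fourmi.py does above.
import docopt
import fourmi  # importing fourmi pulls in Scrapy, docopt and the GUI module

argv = ['-v', '--format=csv', 'search', 'methyl methanoate']
arguments = docopt.docopt(fourmi.__doc__, argv=argv, version='Fourmi - V0.6.0')
print arguments['<compound>']  # -> methyl methanoate
print arguments['--format']    # -> csv
print arguments['-v']          # -> 1 (verbosity is a count, driving the log level)
# fourmi.search(arguments, SourceLoader()) would then start the actual crawl.
```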
-41
sourceloader.py
···11-import inspect
22-import os
33-import re
44-from FourmiCrawler.sources.source import Source
55-66-77-class SourceLoader:
88- sources = []
99-1010- def __init__(self, rel_dir="FourmiCrawler/sources"):
1111- path = os.path.dirname(os.path.abspath(__file__))
1212- path += "/" + rel_dir
1313- known_parser = set()
1414-1515- for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
1616- mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
1717- classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
1818- for cls in classes:
1919- if issubclass(cls, Source) and cls not in known_parser:
2020- self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers?
2121- known_parser.add(cls)
2222-2323- def include(self, source_names):
2424- new = set()
2525- for name in source_names:
2626- new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
2727- self.sources = list(new)
2828-2929- def exclude(self, source_names):
3030- exclude = []
3131- for name in source_names:
3232- exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
3333- self.sources = [src for src in self.sources if src not in exclude]
3434-3535- def __str__(self):
3636- string = ""
3737- for src in self.sources:
3838- string += "Source: " + src.__class__.__name__
3939- string += " - "
4040- string += "URI: " + src.website + "\n"
4141- return string
+19
sources.cfg.sample
···11+[DEFAULT]
22+reliability = Unknown
33+44+#For each source listed in FourmiCrawler/sources there should be a section
55+#named exactly as the filename in here. If not present, the DEFAULT value is
66+#used for reliability of that source.
77+88+[ChemSpider]
99+reliability = High
1010+#token=Paste ChemSpider API token here and remove the hashtag
1111+1212+[NIST]
1313+reliability = High
1414+1515+[WikipediaParser]
1616+reliability = Medium
1717+1818+[PubChem]
1919+reliability = High
+101
utils/configurator.py
···11+import ConfigParser
22+import os
33+import shutil
44+55+from scrapy.utils.project import get_project_settings
66+77+88+class Configurator:
99+ """
1010+ A helper class used by fourmi.py. This class is used to process the settings as set
1111+ from one of the Fourmi interfaces (CLI or GUI).
1212+ """
1313+1414+ def __init__(self):
1515+ self.scrapy_settings = get_project_settings()
1616+1717+ def set_output(self, filename, fileformat, compound):
1818+ """
1919+ This function manipulates the Scrapy output file settings that normally would be set in the settings file.
2020+ In the Fourmi project these are command line arguments.
2121+ :param filename: The filename of the file where the output will be put.
2222+ :param fileformat: The format in which the output will be.
2323+ """
2424+2525+ if filename != '<compound>.*format*':
2626+ self.scrapy_settings.overrides["FEED_URI"] = filename
2727+ elif fileformat == "jsonlines":
2828+ self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
2929+ elif fileformat is not None:
3030+ self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
3131+3232+ if fileformat is not None:
3333+ self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
3434+3535+ def set_logging(self, logfile=None, verbose=0):
3636+ """
3737+ This function changes the default settings of Scrapy's logging functionality
3838+ using the settings given by the CLI.
3939+ :param logfile: The location where the logfile will be saved.
4040+ :param verbose: An integer value to switch between log levels.
4141+ """
4242+ if verbose != 0:
4343+ self.scrapy_settings.overrides["LOG_ENABLED"] = True
4444+ else:
4545+ self.scrapy_settings.overrides["LOG_ENABLED"] = False
4646+4747+ if verbose == 1:
4848+ self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
4949+ elif verbose == 2:
5050+ self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
5151+ else:
5252+ self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
5353+5454+ if verbose > 1:
5555+ self.scrapy_settings.overrides["LOG_STDOUT"] = False
5656+ else:
5757+ self.scrapy_settings.overrides["LOG_STDOUT"] = True
5858+5959+ if logfile is not None:
6060+ self.scrapy_settings.overrides["LOG_FILE"] = logfile
6161+ else:
6262+ self.scrapy_settings.overrides["LOG_FILE"] = None
6363+6464+ @staticmethod
6565+ def read_sourceconfiguration():
6666+ """
6767+ This function reads sources.cfg in the main folder for configuration
6868+ variables for sources.
6969+ :return: a ConfigParser object of sources.cfg
7070+ """
7171+ current_dir = os.path.dirname(os.path.abspath(__file__))
7272+ config_path = current_dir + '/../sources.cfg'
7373+ # [TODO]: location of sources.cfg should be softcoded eventually
7474+ if not os.path.isfile(config_path):
7575+ try:
7676+ shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
7777+ except IOError:
7878+ print "WARNING: Source configuration couldn't be found and couldn't be created."
7979+ config = ConfigParser.ConfigParser()
8080+ config.read(config_path)
8181+ return config
8282+8383+ @staticmethod
8484+ def get_section(config, sourcename):
8585+ """
8686+ This function reads a config section labeled in variable sourcename and
8787+ tests whether the reliability variable is set else set to empty string.
8888+ Return the default section if the labeled config section does not exist
8989+ :param config: a ConfigParser object
9090+ :param sourcename: the name of the section to be read
9191+ :return a dictionary of the section in the config labeled in sourcename
9292+ """
9393+ section = dict()
9494+ if config.has_section(sourcename):
9595+ section = dict(config.items(sourcename))
9696+ elif config.defaults():
9797+ section = config.defaults()
9898+ if 'reliability' not in section:
9999+ print 'WARNING: Reliability not set for %s' % sourcename
100100+ section['reliability'] = ''
101101+ return section
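For reference, a short sketch of how the Configurator above is driven, mirroring what `search()` in fourmi.py does (the verbosity value, filenames and compound name are illustrative; run from the project root so Scrapy picks up the project settings and sources.cfg):

```python
from utils.configurator import Configurator

conf = Configurator()
conf.set_logging(logfile='log.txt', verbose=2)             # LOG_ENABLED, INFO level, written to log.txt
conf.set_output('<compound>.*format*', 'csv', 'methane')   # default name -> FEED_URI "methane.csv", FEED_FORMAT "csv"

config = Configurator.read_sourceconfiguration()           # reads sources.cfg (copied from the sample if missing)
section = Configurator.get_section(config, 'ChemSpider')
print section['reliability']                               # -> High, with the sample configuration above
```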
+64
utils/sourceloader.py
···11+import inspect
22+import os
33+import re
44+55+from FourmiCrawler.sources.source import Source
66+from utils.configurator import Configurator
77+88+99+class SourceLoader:
1010+ sources = []
1111+1212+ def __init__(self, rel_dir="../FourmiCrawler/sources"):
1313+ """
1414+ The initiation of a SourceLoader, selects and indexes a directory for usable sources.
1515+ Also loads a configuration file for Sources and passes the arguments in
1616+ the named section to the source
1717+ :param rel_dir: A relative path to a directory.
1818+ """
1919+ path = os.path.dirname(os.path.abspath(__file__))
2020+ path += "/" + rel_dir
2121+ known_parser = set()
2222+2323+ config = Configurator.read_sourceconfiguration()
2424+2525+ for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
2626+ mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
2727+ classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
2828+ for cls in classes:
2929+ if issubclass(cls, Source) and cls not in known_parser:
3030+ sourcecfg = Configurator.get_section(config, cls.__name__)
3131+ self.sources.append(cls(sourcecfg))
3232+ known_parser.add(cls)
3333+3434+ def include(self, source_names):
3535+ """
3636+ This function keeps only the sources that match the given regular expressions.
3737+ :param source_names: A list of regular expression (strings)
3838+ """
3939+ new = set()
4040+ for name in source_names:
4141+ new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
4242+ self.sources = list(new)
4343+4444+ def exclude(self, source_names):
4545+ """
4646+ This function excludes all sources that match the given regular expressions.
4747+ :param source_names: A list of regular expression (strings)
4848+ """
4949+ exclude = []
5050+ for name in source_names:
5151+ exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
5252+ self.sources = [src for src in self.sources if src not in exclude]
5353+5454+ def __str__(self):
5555+ """
5656+ This function returns a string with all sources currently available in the SourceLoader.
5757+ :return: a string with all available sources.
5858+ """
5959+ string = ""
6060+ for src in self.sources:
6161+ string += "Source: " + src.__class__.__name__
6262+ string += " - "
6363+ string += "URI: " + src.website + "\n"
6464+ return string
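Finally, a short sketch of the SourceLoader in use, mirroring the include/exclude handling in fourmi.py (the pattern is illustrative; run from the project root so the FourmiCrawler package and sources.cfg are found):

```python
from utils.sourceloader import SourceLoader

loader = SourceLoader()      # discovers every Source subclass in FourmiCrawler/sources
loader.exclude(['PubChem'])  # drop sources whose class name matches this pattern
print str(loader)            # lists the remaining sources and their URIs
```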