 #Python Specific ignores
 *.pyc
+#may contain authentication information
+sources.cfg
+#Another of our config files
+GUI.cfg
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
+23
.travis.yml
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+before_install:
+  - "export DISPLAY=:99.0"
+  - "sh -e /etc/init.d/xvfb start"
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+  - pip install Scrapy docopt
+  - pip install coveralls
+
+# command to run tests, e.g. python setup.py test
+script:
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
+
+notifications:
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+
+after_success:
+  coveralls --verbose
+20
Changelog.md
+### v0.6.0
+- Feature: Added a graphical user interface
+- Feature: Automatic config file creation from the config samples
+- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
+- FIX: A lot of bug fixes for the PubChem plugin, as it wasn't working as it should
+- FIX: Using absolute paths for configuration files
+- DEV: General code cleanup in the documentation
+
+### v0.5.3
+- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
+- FIX: Logging is now actually disabled if not using the verbose option.
+- FEATURE: Added support for PubChem
+
+### v0.5.2
+- FIX: The signature used to contain untracked and older files; the current signature should be correct.
+
+### v0.5.1
+- UPDATED: Logging functionality from the command line
+- DEV: Code cleanup and extra tests
-31
Fourmi.py
-#!/usr/bin/env python
-"""
-Fourmi - An internet webcrawler searching for information on chemical
-compounds. [todo] - Add some more useful text here.
-"""
-
-from twisted.internet import reactor
-from scrapy.crawler import Crawler
-from scrapy import log, signals
-from FourmiCrawler.spiders.Fourmispider import FourmiSpider
-from scrapy.utils.project import get_project_settings
-
-
-def setup_crawler(searchable):
-    # [TODO] - Initiate all parsers for the different websites and get
-    # allowed URLs.
-    spider = FourmiSpider(compound=searchable)
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
-
-
-def start():
-    setup_crawler("Methane")
-    log.start()
-    reactor.run()
-
-start()
+1-3
FourmiCrawler/items.py
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 from scrapy.item import Item, Field
FourmiCrawler/parsers/__init__.py
This is a binary file and will not be displayed.
-9
FourmiCrawler/parsers/parser.py
-from scrapy import log
-
-
-class Parser:
-    website = "http://localhost/*"
-
-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.Warning)
-        pass
+43-7
FourmiCrawler/pipelines.py
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 from scrapy.exceptions import DropItem
 
 
-class FourmiPipeline(object):
 
     def __init__(self):
         self.known_values = set()
 ···
         :param spider: The spider which scraped the spider
         :return: :raise DropItem: Returns the item if unique or drops them if it's already known
         """
-        value = item['attribute'], item['value']
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)
         else:
             self.known_values.add(value)
             return item
···1+# For more information on item pipelines, see the Scrapy documentation in:
2+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
3+import re
4+5from scrapy.exceptions import DropItem
678+class RemoveNonePipeline(object):
9+ def __init__(self):
10+ pass
11+12+ @staticmethod
13+ def process_item(item, spider):
14+ """
15+ Processing the items so None values are replaced by empty strings
16+ :param item: The incoming item
17+ :param spider: The spider which scraped the item
18+ :return: the item, with None values replaced by empty strings
19+ """
20+ for key in item:
21+ if item[key] is None:
22+ item[key] = ""
23+ return item
2425+26+class DuplicatePipeline(object):
27 def __init__(self):
28 self.known_values = set()
29···34 :param spider: The spider which scraped the item
35 :return: :raise DropItem: Returns the item if unique or drops them if it's already known
36 """
37+ value = (item['attribute'], item['value'], item['conditions'])
38 if value in self.known_values:
39+ raise DropItem("Duplicate item found: %s" % item) # [todo] append sources of first item.
40 else:
41 self.known_values.add(value)
42 return item
43+44+45+class AttributeSelectionPipeline(object):
46+ def __init__(self):
47+ pass
48+49+ @staticmethod
50+ def process_item(item, spider):
51+ """
52+ The items are processed using the selected attribute list available in the spider,
53+ items that don't match the selected items are dropped.
54+ :param item: The incoming item
55+ :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
56+ :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
57+ """
58+ if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
59+ return item
60+ else:
61+ raise DropItem("Attribute not selected by used: %s" % item)
+7-3
FourmiCrawler/settings.py
···3# For simplicity, this file contains only the most important settings by
4# default. All the other settings are documented here:
5#
6-# http://doc.scrapy.org/en/latest/topics/settings.html
7#
89BOT_NAME = 'FourmiCrawler'
···11SPIDER_MODULES = ['FourmiCrawler']
12NEWSPIDER_MODULE = 'FourmiCrawler'
13ITEM_PIPELINES = {
14- 'FourmiCrawler.pipelines.FourmiPipeline': 100
0015}
001617# Crawl responsibly by identifying yourself (and your website) on the
18# user-agent
1920-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
···3# For simplicity, this file contains only the most important settings by
4# default. All the other settings are documented here:
5#
6+# http://doc.scrapy.org/en/latest/topics/settings.html
7#
89BOT_NAME = 'FourmiCrawler'
···11SPIDER_MODULES = ['FourmiCrawler']
12NEWSPIDER_MODULE = 'FourmiCrawler'
13ITEM_PIPELINES = {
14+ "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
15+ 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
16+ 'FourmiCrawler.pipelines.DuplicatePipeline': 300,
17}
18+FEED_URI = 'results.json'
19+FEED_FORMAT = 'jsonlines'
2021# Crawl responsibly by identifying yourself (and your website) on the
22# user-agent
2324+USER_AGENT = 'Fourmi'
FourmiCrawler/sources/ChemSpider.py
1+import re
2+3+from scrapy import log
4+from scrapy.http import Request
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
12+13+class ChemSpider(Source):
14+ """
15+ ChemSpider scraper for synonyms and properties
16+ This parser will manage searching for chemicals through the
17+ ChemsSpider API, and parsing the resulting ChemSpider page.
18+ The token required for the API should be in a configuration file
19+ somewhere.
20+ """
21+22+ website = 'http://www\\.chemspider\\.com/.*'
23+24+ search = 'Search.asmx/SimpleSearch?query=%s&token='
25+ structure = 'Chemical-Structure.%s.html'
26+ extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
27+28+ def __init__(self, config=None):
29+ """
30+ Initialization of ChemSpider scraper
31+ :param config: a dictionary of settings for this scraper, must contain
32+ 'reliability' key
33+ """
34+ Source.__init__(self, config)
35+ self.ignore_list = []
36+ if 'token' not in self.cfg or self.cfg['token'] == '':
37+ log.msg('ChemSpider token not set or empty, search/MassSpec API '
38+ 'not available', level=log.WARNING)
39+ self.cfg['token'] = ''
40+ self.search += self.cfg['token']
41+ self.extendedinfo += self.cfg['token']
42+43+ def parse(self, response):
44+ """
45+ This function is called when a Response matching the variable
46+ 'website' is available for parsing the Response object.
47+ :param response: the Scrapy Response object to be parsed
48+ :return: a list of Result items and Request objects
49+ """
50+ sel = Selector(response)
51+ requests = []
52+ requests_synonyms = self.parse_synonyms(sel)
53+ requests.extend(requests_synonyms)
54+ requests_properties = self.parse_properties(sel)
55+ requests.extend(requests_properties)
56+57+ return requests
58+59+ def parse_properties(self, sel):
60+ """
61+ This function scrapes the Experimental Data and Predicted ACD/Labs tabs
62+ :param sel: a Selector object of the whole page
63+ :return: a list of Result items
64+ """
65+ properties = []
66+67+ properties.extend(self.parse_acdlabstab(sel))
68+ properties.extend(self.parse_experimentaldatatab(sel))
69+70+ return properties
71+72+ def parse_acdlabstab(self, sel):
73+ """
74+ This function scrapes the 'Predicted ACD/Labs tab' under Properties
75+ :param sel: a Selector object of the whole page
76+ :return: a list of Result items
77+ """
78+ properties = []
79+80+ td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
81+ 'normalize-space(string())')
82+ prop_names = td_list[::2]
83+ prop_values = td_list[1::2]
84+ for (prop_name, prop_value) in zip(prop_names, prop_values):
85+ # [:-1] is to remove the colon at the end, [TODO] - test for colon
86+ prop_name = prop_name.extract().encode('utf-8')[:-1]
87+ prop_value = prop_value.extract().encode('utf-8')
88+ prop_conditions = ''
89+90+ # Test for properties without values, with one hardcoded exception
91+ if (not re.match(r'^\d', prop_value) or
92+ (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
93+ continue
94+95+ m = re.match(r'(.*) \((.*)\)', prop_name)
96+ if m:
97+ prop_name = m.group(1)
98+ prop_conditions = m.group(2)
99+100+ m = re.match(r'(.*) at (.*)', prop_value)
101+ if m:
102+ prop_value = m.group(1)
103+ prop_conditions = m.group(2)
104+105+ new_prop = self.newresult(
106+ attribute=prop_name,
107+ value=prop_value,
108+ source='ChemSpider Predicted - ACD/Labs Tab',
109+ conditions=prop_conditions
110+ )
111+ properties.append(new_prop)
112+113+ return properties
114+115+ def parse_experimentaldatatab(self, sel):
116+ """
117+ This function scrapes Experimental Data tab, Physico-chemical
118+ properties in particular.
119+ :param sel: a Selector object of the whole page
120+ :return: a list of Result items
121+ """
122+ properties = []
123+124+ scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
125+ 'Properties"]//li/table/tr/td')
126+ if not scraped_list:
127+ return properties
128+ # Format is: property name followed by a list of values
129+ property_name = scraped_list.pop(0).xpath(
130+ 'span/text()').extract()[0].rstrip()
131+ for line in scraped_list:
132+ if line.xpath('span/text()'):
133+ property_name = line.xpath('span/text()').extract()[0].rstrip()
134+ else:
135+ new_prop = self.newresult(
136+ attribute=property_name[:-1],
137+ value=line.xpath('text()').extract()[0].rstrip(),
138+ source=line.xpath('strong/text()').extract()[0].rstrip(),
139+ )
140+ properties.append(new_prop)
141+142+ return properties
143+144+ def parse_synonyms(self, sel):
145+ """
146+ This function scrapes the list of Names and Identifiers
147+ :param sel: a Selector object of the whole page
148+ :return: a list of Requests
149+ """
150+ requests = []
151+ synonyms = []
152+153+ # Exact type for this is unknown, but equivalent to Validated by Expert
154+ for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
155+ name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
156+ synonyms.append(self.new_synonym(syn, name, 'expert'))
157+ # These synonyms are labeled by ChemSpider as "Validated by Experts"
158+ for syn in sel.xpath('//p[@class="syn"][strong]'):
159+ name = syn.xpath('strong/text()').extract()[0]
160+ synonyms.append(self.new_synonym(syn, name, 'expert'))
161+ # These synonyms are labeled by ChemSpider as "Validated by Users"
162+ for syn in sel.xpath(
163+ '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
164+ name = syn.xpath(
165+ 'span[@class="synonym_confirmed"]/text()').extract()[0]
166+ synonyms.append(self.new_synonym(syn, name, 'user'))
167+ # These synonyms are labeled as "Non-validated" and assumed unreliable
168+ for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
169+ name = syn.xpath('span[@class=""]/text()').extract()[0]
170+ synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))
171+172+ # [TODO] - confirm if English User-Validated synonyms are OK too
173+ for syn in synonyms:
174+ if syn['category'] == 'expert' and syn['language'] == 'English':
175+ log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
176+ self._spider.get_synonym_requests(syn['name'])
177+178+ return requests
179+180+ def new_synonym(self, sel, name, category):
181+ """
182+ This function scrapes for a single synonym at a given HTML tag
183+ :param sel: a Selector object of the given HTML tag
184+ :param name: the name of the synonym in the tag
185+ :param category: the name of the category the synonym is labeled as
186+ :return: a dictionary containing data on the synonym
187+ """
188+ self.ignore_list.append(name)
189+ language = sel.xpath('span[@class="synonym_language"]/text()')
190+ if language:
191+ # The [1:-1] is to remove brackets around the language name
192+ language = language.extract()[0][1:-1]
193+ else:
194+ # If language is not given, English is assumed, [TODO] - confirm
195+ language = 'English'
196+ log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
197+ level=log.DEBUG)
198+ references = []
199+ # A synonym can have multiple references, each optionally with link
200+ for ref in sel.xpath('span[@class="synonym_ref"]'):
201+ refname = ref.xpath('normalize-space(string())')
202+ references.append({
203+ 'name': refname.extract()[0][1:-1],
204+ 'URI': ''
205+ })
206+ for ref in sel.xpath('a[@class="synonym_ref"]'):
207+ references.append({
208+ 'name': ref.xpath('@title').extract()[0],
209+ 'URI': ref.xpath('@href').extract()[0]
210+ })
211+ for ref in references:
212+ log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
213+ level=log.DEBUG)
214+ synonym = {
215+ 'name': name,
216+ 'category': category,
217+ 'language': language,
218+ 'references': references
219+ }
220+ return synonym
221+222+ def parse_extendedinfo(self, response):
223+ """
224+ This function scrapes data from the ChemSpider GetExtendedCompoundInfo
225+ API, if a token is present in the configuration settings
226+ :param response: a Response object to be parsed
227+ :return: a list of Result items
228+ """
229+ sel = Selector(response)
230+ properties = []
231+ names = sel.xpath('*').xpath('name()').extract()
232+ values = sel.xpath('*').xpath('text()').extract()
233+ for (name, value) in zip(names, values):
234+ result = self.newresult(
235+ attribute=name,
236+ value=value, # These values have no unit!
237+ source='ChemSpider ExtendedCompoundInfo',
238+ )
239+ if result['value']:
240+ properties.append(result)
241+ return properties
242+243+ def newresult(self, attribute, value, conditions='', source='ChemSpider'):
244+ """
245+ This function abstracts from the Result item and provides default
246+ values.
247+ :param attribute: the name of the attribute
248+ :param value: the value of the attribute
249+ :param conditions: optional conditions regarding the value
250+ :param source: the name of the source if it is not ChemSpider
251+ :return: A Result item
252+ """
253+ return Result({
254+ 'attribute': attribute,
255+ 'value': value,
256+ 'source': source,
257+ 'reliability': self.cfg['reliability'],
258+ 'conditions': conditions
259+ })
260+261+ def parse_searchrequest(self, response):
262+ """
263+ This function parses the initial response of the ChemSpider Search API
264+ Requires a valid token to function.
265+ :param response: the Response object to be parsed
266+ :return: A Request for the information page and a Request for the
267+ extendedinfo API call
268+ """
269+ sel = Selector(response)
270+ log.msg('chemspider parse_searchrequest', level=log.DEBUG)
271+ sel.register_namespace('cs', 'http://www.chemspider.com/')
272+ csids = sel.xpath('.//cs:int/text()').extract()
273+ if len(csids) == 0:
274+ log.msg('ChemSpider found nothing', level=log.ERROR)
275+ return
276+ elif len(csids) > 1:
277+ log.msg('ChemSpider found multiple substances, taking first '
278+ 'element', level=log.DEBUG)
279+ csid = csids[0]
280+ structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
281+ extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
282+ log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
283+ return [Request(url=structure_url,
284+ callback=self.parse),
285+ Request(url=extendedinfo_url,
286+ callback=self.parse_extendedinfo)]
287+288+ def new_compound_request(self, compound):
289+ """
290+ This function is called when a new synonym is returned to the spider
291+ to generate new requests
292+ :param compound: the name of the compound to search for
293+ """
294+ if compound in self.ignore_list or self.cfg['token'] == '':
295+ return None
296+ searchurl = self.website[:-2].replace("\\", "") + self.search % compound
297+ log.msg('chemspider compound', level=log.DEBUG)
298+ return Request(url=searchurl, callback=self.parse_searchrequest)
FourmiCrawler/sources/NIST.py
1+import re
2+3+from scrapy import log
4+from scrapy.http import Request
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+# [TODO]: values can be '128.', perhaps remove the dot in that case?
12+# [TODO]: properties have references and comments which do not exist in the
13+# Result item, but should be included eventually.
14+15+class NIST(Source):
16+ """
17+ NIST Scraper plugin
18+ This plugin manages searching for a chemical on the NIST website
19+ and parsing the resulting page if the chemical exists on NIST.
20+ """
21+ website = "http://webbook\\.nist\\.gov/.*"
22+23+ search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
24+25+ def __init__(self, config=None):
26+ """
27+ Initialization of NIST scraper
28+ :param config: configuration variables for this scraper, must contain
29+ 'reliability' key.
30+ """
31+ Source.__init__(self, config)
32+ self.ignore_list = set()
33+34+ def parse(self, response):
35+ """
36+ This function is called when a Response matching the variable
37+ 'website' is available for parsing the Response object.
38+ :param response: The Scrapy Response object to be parsed
39+ :return: a list of Result items and Request objects
40+ """
41+ sel = Selector(response)
42+43+ title = sel.xpath('head/title/text()').extract()[0]
44+ if title == 'Name Not Found':
45+ log.msg('NIST: Chemical not found!', level=log.ERROR)
46+ return
47+ if title not in self.ignore_list:
48+ self.ignore_list.add(title)
49+ log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
50+ self._spider.get_synonym_requests(title)
51+52+ requests = []
53+54+ requests.extend(self.parse_generic_info(sel))
55+56+ symbol_table = {}
57+ tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
58+ for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
59+ symbol = ''.join(symbol_td.xpath('node()').extract())
60+ name = name_td.xpath('text()').extract()[0]
61+ symbol_table[symbol] = name
62+ log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
63+ level=log.DEBUG)
64+65+ requests.extend(self.parse_tables(sel, symbol_table))
66+67+ return requests
68+69+ def parse_tables(self, sel, symbol_table):
70+ """
71+ This function identifies and distributes parsing of tables to other
72+ functions below.
73+ :param sel: A Selector object of the whole page
74+ :param symbol_table: a dictionary containing translations of raw HTML
75+ tags to human readable names
76+ :return: a list of Result items and Requests
77+ """
78+ requests = []
79+80+ for table in sel.xpath('//table[@class="data"]'):
81+ summary = table.xpath('@summary').extract()[0]
82+ if summary == 'One dimensional data':
83+ log.msg('NIST table: Aggregate data', level=log.DEBUG)
84+ requests.extend(
85+ self.parse_aggregate_data(table, symbol_table))
86+ elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
87+ log.msg('NIST table; Enthalpy/entropy of phase transition',
88+ level=log.DEBUG)
89+ requests.extend(self.parse_transition_data(table, summary))
90+ elif table.xpath('tr[1]/td'):
91+ log.msg('NIST table: Horizontal table', level=log.DEBUG)
92+ elif summary == 'Antoine Equation Parameters':
93+ log.msg('NIST table: Antoine Equation Parameters',
94+ level=log.DEBUG)
95+ requests.extend(self.parse_antoine_data(table, summary))
96+ elif len(table.xpath('tr[1]/th')) == 5:
97+ log.msg('NIST table: generic 5 columns', level=log.DEBUG)
98+ # Symbol (unit) Temperature (K) Method Reference Comment
99+ requests.extend(self.parse_generic_data(table, summary))
100+ elif len(table.xpath('tr[1]/th')) == 4:
101+ log.msg('NIST table: generic 4 columns', level=log.DEBUG)
102+ # Symbol (unit) Temperature (K) Reference Comment
103+ requests.extend(self.parse_generic_data(table, summary))
104+ else:
105+ log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
106+ continue # Assume unsupported
107+ return requests
108+109+ def parse_generic_info(self, sel):
110+ """
111+ This function parses: synonyms, chemical formula, molecular weight,
112+ InChI, InChiKey, CAS number
113+ :param sel: A Selector object of the entire page in the original
114+ response
115+ :return: a list of Result items
116+ """
117+ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
118+119+ raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
120+ for synonym in raw_synonyms[0].strip().split(';\n'):
121+ log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
122+ self.ignore_list.add(synonym)
123+ self._spider.get_synonym_requests(synonym)
124+125+ data = {}
126+127+ raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
128+ data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
129+130+ raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
131+ data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
132+133+ raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
134+ data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
135+136+ raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
137+ '/tt/text()')
138+ data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
139+140+ raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
141+ data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
142+143+ requests = []
144+ for key, value in data.iteritems():
145+ result = self.newresult(
146+ attribute=key,
147+ value=value
148+ )
149+ requests.append(result)
150+151+ return requests
152+153+ def parse_aggregate_data(self, table, symbol_table):
154+ """
155+ This function parses the table(s) which contain possible links to
156+ individual data points
157+ :param table: a Selector object of the table to be parsed
158+ :param symbol_table: a dictionary containing translations of raw HTML
159+ tags to human readable names
160+ :return: a list of Result items and Request objects
161+ """
162+ results = []
163+ for tr in table.xpath('tr[td]'):
164+ extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
165+ '/a/@href').extract()
166+ if extra_data_url:
167+ request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
168+ callback=self.parse_individual_datapoints)
169+ results.append(request)
170+ continue
171+ data = []
172+ for td in tr.xpath('td'):
173+ data.append(''.join(td.xpath('node()').extract()))
174+175+ name = symbol_table[data[0]]
176+ condition = ''
177+178+ m = re.match(r'(.*) at (.*)', name)
179+ if m:
180+ name = m.group(1)
181+ condition = m.group(2)
182+183+ result = self.newresult(
184+ attribute=name,
185+ value=data[1] + ' ' + data[2],
186+ conditions=condition
187+ )
188+ log.msg('NIST: |%s|' % data, level=log.DEBUG)
189+ results.append(result)
190+ return results
191+192+ def parse_transition_data(self, table, summary):
193+ """
194+ This function parses the table containing properties regarding phase
195+ changes
196+ :param table: a Selector object of the table to be parsed
197+ :param summary: the name of the property
198+ :return: a list of Result items
199+ """
200+ results = []
201+202+ unit = self.get_unit(table)
203+204+ for tr in table.xpath('tr[td]'):
205+ tds = tr.xpath('td/text()').extract()
206+ result = self.newresult(
207+ attribute=summary,
208+ value=tds[0] + ' ' + unit,
209+ conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
210+ )
211+ results.append(result)
212+213+ return results
214+215+ def parse_generic_data(self, table, summary):
216+ """
217+ Parses the common tables of 4 and 5 rows. Assumes they are of the
218+ form:
219+ Symbol (unit)|Temperature (K)|Method|Reference|Comment
220+ Symbol (unit)|Temperature (K)|Reference|Comment
221+ :param table: a Selector object of the table to be parsed
222+ :param summary: the name of the property
223+ :return: a list of Result items
224+ """
225+ results = []
226+227+ unit = self.get_unit(table)
228+229+ for tr in table.xpath('tr[td]'):
230+ tds = tr.xpath('td/text()').extract()
231+ result = self.newresult(
232+ attribute=summary,
233+ value=tds[0] + ' ' + unit,
234+ conditions='%s K' % tds[1]
235+ )
236+ results.append(result)
237+ return results
238+239+ def parse_antoine_data(self, table, summary):
240+ """
241+ This function parses the table containing parameters for the Antoine
242+ equation
243+ :param table: a Selector object of the table to be parsed
244+ :param summary: the name of the property
245+ :return: a list of Result items
246+ """
247+ results = []
248+249+ for tr in table.xpath('tr[td]'):
250+ tds = tr.xpath('td/text()').extract()
251+ result = self.newresult(
252+ attribute=summary,
253+ value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
254+ conditions='%s K' % tds[0]
255+ )
256+ results.append(result)
257+258+ return results
259+260+ def parse_individual_datapoints(self, response):
261+ """
262+ This function parses the 'individual data points' page linked from
263+ the aggregate data table(s)
264+ :param response: the Scrapy Response object to be parsed
265+ :return: a list of Result items
266+ """
267+ sel = Selector(response)
268+ table = sel.xpath('//table[@class="data"]')[0]
269+270+ results = []
271+272+ name = table.xpath('@summary').extract()[0]
273+ condition = ''
274+ m = re.match(r'(.*) at (.*)', name)
275+ if m:
276+ name = m.group(1)
277+ condition = m.group(2)
278+279+ unit = self.get_unit(table)
280+281+ for tr in table.xpath('tr[td]'):
282+ tds = tr.xpath('td/text()').extract()
283+ uncertainty = ''
284+ m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
285+ if m:
286+ uncertainty = '+- %s ' % m.group(1)
287+ # [TODO]: get the plusminus sign working in here
288+ result = self.newresult(
289+ attribute=name,
290+ value='%s %s%s' % (tds[0], uncertainty, unit),
291+ conditions=condition
292+ )
293+ results.append(result)
294+295+ return results
296+297+ @staticmethod
298+ def get_unit(table):
299+ tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
300+ m = re.search(r'\((.*)\)', tr_unit)
301+ unit = '!'
302+ if m:
303+ unit = m.group(1)
304+305+ return unit
306+307+ def newresult(self, attribute, value, conditions=''):
308+ """
309+ This function abstracts from the Result item and provides default
310+ values
311+ :param attribute: the name of the attribute
312+ :param value: the value of the attribute
313+ :param conditions: optional conditions regarding the value
314+ :return: A Result item
315+ """
316+ return Result(
317+ {
318+ 'attribute': attribute,
319+ 'value': value,
320+ 'source': 'NIST',
321+ 'reliability': self.cfg['reliability'],
322+ 'conditions': conditions
323+ })
324+325+ def new_compound_request(self, compound):
326+ """
327+ This function is called when a new synonym is returned to the spider
328+ to generate new requests
329+ :param compound: the name of the compound to search for
330+ """
331+ if compound not in self.ignore_list:
332+ self.ignore_list.add(compound)
333+ return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
334+ callback=self.parse)
FourmiCrawler/sources/PubChem.py
1+import re
2+3+from scrapy.http import Request
4+from scrapy import log
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+class PubChem(Source):
12+ """ PubChem scraper for chemical properties
13+14+ This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
15+ including sources of the values of properties.
16+ """
18+ # PubChem has its data on compound names, properties and their values on different HTML pages, so different URLs are used
19+ website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
20+ website_www = 'http://www.ncbi.nlm.nih.gov/*'
21+ website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
22+ search = 'pccompound?term=%s'
23+ data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
24+25+ __spider = None
26+ searched_compounds = set()
27+28+ def __init__(self, config):
29+ Source.__init__(self, config)
30+ self.cfg = config
31+32+ def parse(self, response):
33+ """
34+ Parses the compound page, forwards found synonyms to the spider and requests the page with the property data
35+ :param response: The Response for a PubChem compound page
36+ :return: a list with a Request for the property data page, or None if the compound is already known
37+ """
38+ requests = []
39+ log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
40+41+ sel = Selector(response)
42+ compound = sel.xpath('//h1/text()').extract()[0]
43+ if compound in self.searched_compounds:
44+ return None
46+ self.searched_compounds.add(compound)
47+ raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
48+ for synonym in raw_synonyms.strip().split(', '):
49+ log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
50+ self.searched_compounds.add(synonym)
51+ self._spider.get_synonym_requests(synonym)
52+ log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
53+54+ n = re.search(r'cid=(\d+)', response.url)
55+ if n:
56+ cid = n.group(1)
57+ log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach
58+ # the separate HTML page which contains the properties and their values
59+60+ # using this cid to get the right url and scrape it
61+ requests.append(
62+ Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
63+ return requests
64+65+ def parse_data(self, response):
66+ """
67+ Parse data found in 'Chemical and Physical properties' part of a substance page.
68+ :param response: The response with the page to parse
69+ :return: requests: Returns a list of properties with their values, source, etc.
70+ """
71+ log.msg('parsing data', level=log.DEBUG)
72+ requests = []
73+74+ sel = Selector(response)
75+ props = sel.xpath('//div')
76+77+ for prop in props:
78+ prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
79+ if prop.xpath('a'): # parsing for single value in property
80+ prop_source = ''.join(prop.xpath('a/@title').extract())
81+ prop_value = ''.join(prop.xpath('a/text()').extract())
82+ new_prop = Result({
83+ 'attribute': prop_name,
84+ 'value': prop_value,
85+ 'source': prop_source,
86+ 'reliability': self.cfg['reliability'],
87+ 'conditions': ''
88+ })
89+ log.msg('PubChem prop: |%s| |%s| |%s|' %
90+ (new_prop['attribute'], new_prop['value'],
91+ new_prop['source']), level=log.DEBUG)
92+ requests.append(new_prop)
93+ elif prop.xpath('ul'): # parsing for multiple values (list) in property
94+ prop_values = prop.xpath('ul//li')
95+ for prop_li in prop_values:
96+ prop_value = ''.join(prop_li.xpath('a/text()').extract())
97+ prop_source = ''.join(prop_li.xpath('a/@title').extract())
98+ new_prop = Result({
99+ 'attribute': prop_name,
100+ 'value': prop_value,
101+ 'source': prop_source,
102+ 'reliability': self.cfg['reliability'],
103+ 'conditions': ''
104+ })
105+ log.msg('PubChem prop: |%s| |%s| |%s|' %
106+ (new_prop['attribute'], new_prop['value'],
107+ new_prop['source']), level=log.DEBUG)
108+ requests.append(new_prop)
109+110+ return requests
111+112+ def parse_searchrequest(self, response):
113+ """
114+ This function parses the response to the new_compound_request Request
115+ :param response: the Response object to be parsed
116+ :return: A Request for the compound page or what self.parse returns in
117+ case the search request forwarded to the compound page
118+ """
119+120+ # check if pubchem forwarded straight to compound page
121+ m = re.match(self.website_pubchem, response.url)
122+ if m:
123+ log.msg('PubChem search forwarded to compound page',
124+ level=log.DEBUG)
125+ return self.parse(response)
126+127+ sel = Selector(response)
128+129+ results = sel.xpath('//div[@class="rsltcont"]')
130+ if results:
131+ url = results[0].xpath('div/p/a[1]/@href')
132+ else:
133+ log.msg('PubChem search found nothing or xpath failed',
134+ level=log.DEBUG)
135+ return None
136+137+ if url:
138+ url = 'http:' + ''.join(url[0].extract())
139+ log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
140+ else:
141+ log.msg('PubChem search found results, but no url in first result',
142+ level=log.DEBUG)
143+ return None
144+145+ return Request(url=url, callback=self.parse)
146+147+ def new_compound_request(self, compound):
148+ return Request(url=self.website_www[:-1] + self.search % compound,
149+ callback=self.parse_searchrequest)
FourmiCrawler/sources/WikipediaParser.py
1+import re
2+3+from scrapy.http import Request
4+from scrapy import log
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+class WikipediaParser(Source):
12+ """ Wikipedia scraper for chemical properties
13+14+ This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
15+ It also returns requests with other external sources which contain information on parsed subject.
16+ """
17+18+ website = "http://en\\.wikipedia\\.org/wiki/.*"
19+ __spider = None
20+ searched_compounds = []
21+22+ def __init__(self, config=None):
23+ Source.__init__(self, config)
24+25+ def parse(self, response):
26+ """
27+ Distributes the above described behaviour
28+ :param response: The incoming search request
29+ :return: Returns the found properties if response is unique or returns none if it's already known
30+ """
31+ log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
32+ sel = Selector(response)
33+ compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
34+ if compound in self.searched_compounds:
35+ return None
36+ else:
37+ items = self.parse_infobox(sel)
38+ self.searched_compounds.append(compound)
39+ return items
40+41+ def parse_infobox(self, sel):
42+ """
43+ Scrape data from infobox on wikipedia.
44+45+ Data from two types of infoboxes, class="infobox bordered" and class="infobox", is scraped.
46+ :param sel: The selector with the html-information of the page to parse
47+ :return: item_list: Returns a list of properties with their values, source, etc..
48+ """
49+50+ items = []
51+52+ # scrape the chembox (wikipedia template)
53+ items = self.parse_chembox(sel, items)
54+55+ # scrape the drugbox (wikipedia template)
56+ items = self.parse_drugbox(sel, items)
57+58+ items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
59+ item_list = self.clean_items(items)
60+61+ identifiers = self.get_identifiers(sel)
62+63+ #add extra sources to scrape from as requests
64+ for i, identifier in enumerate(identifiers):
65+ request = None
66+ #discard internal wikipedia links
67+ if re.match('//en\.wikipedia', identifier):
68+ log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
69+ #fix links starting with '//www.'
70+ elif re.match('/{2}', identifier):
71+ identifier = re.sub("/{2}", "http://", identifier)
72+ request = Request(identifier)
73+ else:
74+ request = Request(identifier)
75+ log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
76+ item_list.append(request)
77+78+ return item_list
79+80+ def parse_chembox(self, sel, items):
81+ """
82+ Scrape data from chembox infobox on wikipedia.
83+84+ :param sel: The selector with the html-information of the page to parse
85+ :param items: the list of items where the result have to be stored in
86+ :return: items: the list of items with the new found and stored items
87+ """
88+ tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
89+ xpath('normalize-space(string())')
90+ prop_names = tr_list[::2]
91+ prop_values = tr_list[1::2]
92+ for i, prop_name in enumerate(prop_names):
93+ item = self.newresult(
94+ attribute=prop_name.extract().encode('utf-8'),
95+ value=prop_values[i].extract().encode('utf-8')
96+ )
97+ items.append(item)
98+ log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
99+ return items
100+101+ def parse_drugbox(self, sel, items):
102+ """
103+ Scrape data from drugbox infobox on wikipedia.
104+105+ :param sel: The selector with the html-information of the page to parse
106+ :param items: the list of items where the result have to be stored in
107+ :return: items: the list of items with the new found and stored items
108+ """
109+ tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
110+ log.msg('dit: %s' % tr_list2, level=log.DEBUG)
111+ for tablerow in tr_list2:
112+ log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
113+ if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
114+ 'normalize-space(string())'):
115+ item = self.newresult(
116+ attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
117+ value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
118+ )
119+ items.append(item)
120+ log.msg(
121+ 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
122+ level=log.DEBUG)
123+ return items
124+125+ def new_compound_request(self, compound):
126+ return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
127+128+ @staticmethod
129+ def clean_items(items):
130+131+ """
132+ Clean up properties using regex, makes it possible to split the values from the units
133+134+ Almost not in use, only cleans J/K/mol values and boiling/melting points.
135+136+ :param items: List of properties with their values, source, etc..
137+ :return: items: List of now cleaned up items
138+ """
139+ for item in items:
140+ value = item['value']
141+ m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
142+ if m:
143+ item['value'] = m.group(1) + " K"
144+ m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values
145+ if m:
146+ item['value'] = m.group(1) + " J/K/mol"
147+ return items
148+149+ @staticmethod
150+ def get_identifiers(sel):
151+ """
152+ Find external links, named 'Identifiers' to different sources.
153+154+ :param sel: The selector with the html-information of the page to parse
155+ :return: links: New links which can be used to expand the crawlers search
156+ """
157+ links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
158+ '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
159+ return links
160+161+ def newresult(self, attribute, value):
162+ return Result(
163+ {
164+ 'attribute': attribute,
165+ 'value': value,
166+ 'source': 'Wikipedia',
167+ 'reliability': self.cfg['reliability'],
168+ 'conditions': ''
169+ })
FourmiCrawler/sources/__init__.py
This is a binary file and will not be displayed.
+41
FourmiCrawler/sources/source.py
···1+from scrapy import log
2+# from scrapy.http import Request
3+4+5+class Source:
6+ website = "http://something/.*" # Regex of URI's the source is able to parse
7+ _spider = None
8+9+ def __init__(self, config=None):
10+ """
11+ Initiation of a new Source
12+ """
13+ self.cfg = {}
14+ if config is not None:
15+ self.cfg = config
16+ pass
17+18+ def parse(self, response):
19+ """
20+ This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
21+ :param response: A Scrapy Response object
22+ :return: A list of Result items and new Scrapy Requests
23+ """
24+ log.msg("The parse function of the empty source was used.", level=log.WARNING)
25+ pass
26+27+ def new_compound_request(self, compound):
28+ """
29+ This function should return a Scrapy Request for the given compound request.
30+ :param compound: A compound name.
31+ :return: A new Scrapy Request
32+ """
33+ # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
34+ pass
35+36+ def set_spider(self, spider):
37+ """
38+ A Function to save the associated spider.
39+ :param spider: A FourmiSpider object
40+ """
41+ self._spider = spider
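Source is the extension point for new scrapers: each plugin in FourmiCrawler/sources overrides `website`, `parse` and `new_compound_request`. Below is a minimal sketch of what such a plugin could look like; the class name, URL and attribute value are illustrative assumptions, not part of the project.

```python
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class ExampleSource(Source):
    # Hypothetical plugin for illustration; regex of the URIs this source can parse
    website = "http://example\\.org/.*"

    def parse(self, response):
        # Turn a matching page into Result items; 'reliability' comes from the
        # source's configuration, as in the real plugins above.
        sel = Selector(response)
        return [Result({
            'attribute': 'Page title',  # illustrative attribute
            'value': sel.xpath('//title/text()').extract()[0],
            'source': 'ExampleSource',
            'reliability': self.cfg.get('reliability', ''),
            'conditions': ''
        })]

    def new_compound_request(self, compound):
        # Same URL-building convention as the real sources: strip the regex ".*"
        return Request(url=self.website[:-2].replace("\\", "") + compound,
                       callback=self.parse)
```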
+72-7
FourmiCrawler/spider.py
 from scrapy.spider import Spider
 
 
 class FourmiSpider(Spider):
     name = "FourmiSpider"
 
-    def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
 
-    def parse(self, reponse):
-        # [TODO] - This function should delegate it's functionality to other
-        # parsers.
-        pass
 
-    def add_parser(self, parser):
-        self.parsers.add(parser)
···1+import re
2+3from scrapy.spider import Spider
4+from scrapy import log
567class FourmiSpider(Spider):
8+ """
9+ A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
10+ """
11 name = "FourmiSpider"
1213+ def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
14+ """
15+ Initiation of the Spider
16+ :param compound: compound that will be searched.
17+ :param selected_attributes: A list of regular expressions that the attributes should match.
18+ """
19+ self._sources = []
20+ self.synonyms = set()
21 super(FourmiSpider, self).__init__(*args, **kwargs)
22+ self.synonyms.add(compound)
23+ if selected_attributes is None:
24+ self.selected_attributes = [".*"]
25+ else:
26+ self.selected_attributes = selected_attributes
2728+ def parse(self, response):
29+ """
30+ The function that is called when a response to a request is available. This function distributes this to a
31+ source which should be able to handle parsing the data.
32+ :param response: A Scrapy Response object that should be parsed
33+ :return: A list of Result items and new Request to be handled by the scrapy core.
34+ """
35+ for source in self._sources:
36+ if re.match(source.website, response.url):
37+ log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
38+ return source.parse(response)
39+ log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
40+ return None
41+42+ def get_synonym_requests(self, compound, force=False):
43+ """
44+ A function that generates new Scrapy Request for each source given a new synonym of a compound.
45+ :param compound: A compound name
46+ :return: A list of Scrapy Request objects
47+ """
48+ requests = []
49+ if force or compound not in self.synonyms:
50+ self.synonyms.add(compound)
51+ for parser in self._sources:
52+ parser_requests = parser.new_compound_request(compound)
53+ if parser_requests is not None:
54+ requests.append(parser_requests)
55+ return requests
56+57+ def start_requests(self):
58+ """
59+ The function called by Scrapy for its first Requests
60+ :return: A list of Scrapy Request generated from the known synonyms using the available sources.
61+ """
62+ requests = []
63+ for synonym in self.synonyms:
64+ requests.extend(self.get_synonym_requests(synonym, force=True))
65+ return requests
66+67+ def add_sources(self, sources):
68+ """
69+ A function to add new Source objects to the list of available sources.
70+ :param sources: A list of Source Objects.
71+ """
72+ for parser in sources:
73+ self.add_source(parser)
7475+ def add_source(self, source):
76+ """
77+ A function to add a new Source object to the list of available sources.
78+ :param source: A Source Object
79+ """
80+ self._sources.append(source)
81+ source.set_spider(self)
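Taken together with the Source base class, the spider is wired up by handing it a compound and a list of source plugins. A minimal sketch, assuming the Wikipedia plugin lives in FourmiCrawler/sources/WikipediaParser.py and accepts a 'reliability' setting; the real entry point builds this through the SourceLoader and Configurator utilities:

```python
from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.WikipediaParser import WikipediaParser

# Illustrative wiring only; fourmi.py normally does this via SourceLoader.
spider = FourmiSpider(compound="Methane", selected_attributes=[".*"])
spider.add_sources([WikipediaParser(config={'reliability': 'Medium'})])
requests = spider.start_requests()  # one Request per known synonym per source
```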
···1+from Tkinter import *
2+import os
3+import shutil
import sys  # needed for sys.exit() below
4+from tkFileDialog import asksaveasfilename
5+6+from configImporter import *
7+8+9+class GUI():
10+ def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
11+ """Boots the window, configuration."""
12+ if not in_source:
13+ current_dir = os.path.dirname(os.path.abspath(__file__))
14+ config_file = os.path.join(current_dir, '..', config_file)
15+ if not os.path.isfile(config_file):
16+ try:
17+ shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
18+ except IOError:
19+ print "GUI configuration couldn't be found and couldn't be created."
20+ sys.exit()
21+ self.configurator = ConfigImporter(config_file)
22+ self.sourceloader = sourceloader
23+ self.finish_with_search = False
24+ self.values = {}
25+ self.required_variables = ['substance']
26+ self.search = search
27+ self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())
28+29+ def load_common_attributes(self):
30+ """Calls the configuration parser for common attributes."""
31+ return [x.strip() for x in self.configurator.load_common_attributes().split(',')]
32+33+ def load_output_types(self):
34+ """Calls the configuration parser for output types."""
35+ return [x.strip() for x in self.configurator.load_output_types().split(',')]
36+37+ def load_always_attributes(self):
38+ """Calls the configuration parser for attributes that are always used."""
39+ return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])
40+41+ def set_output(self):
42+ self.variable_output_name.set(asksaveasfilename())
43+ self.button_output_name.config(text=self.variable_output_name.get())
44+45+ def generate_window(self, common_attributes, output_types):
46+ """Creates all widgets and variables in the window."""
47+ window = Tk()
48+ window.wm_title("Fourmi Crawler")
49+50+ variables = {}
51+52+ variable_substance = StringVar(window)
53+ frame_substance = Frame(window)
54+ label_substance = Label(frame_substance, text="Substance: ")
55+ input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
56+ variables.update({"substance": variable_substance})
57+ frame_substance.pack(side=TOP)
58+ label_substance.pack()
59+ input_substance.pack()
60+ input_substance.focus()
61+62+ frame_all_attributes = Frame(window)
63+ frame_selecting_attributes = Frame(frame_all_attributes)
64+ frame_new_attributes = Frame(frame_selecting_attributes)
65+ label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
66+ input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
67+ variables.update({"new_attributes": input_new_attributes})
68+ frame_new_attributes.pack(side=LEFT)
69+ label_new_attributes.pack()
70+ input_new_attributes.pack()
71+72+ frame_common_attributes = Frame(frame_selecting_attributes)
73+ label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
74+ input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
75+ scrollbar_common_attributes = Scrollbar(frame_common_attributes)
76+ input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
77+ scrollbar_common_attributes.config(command=input_common_attributes.yview)
78+ if common_attributes and len(common_attributes) > 0:
79+ input_common_attributes.insert(END, *common_attributes)
80+ variables.update({"common_attributes": input_common_attributes})
81+ frame_common_attributes.pack(side=RIGHT)
82+ label_common_attributes.pack(side=TOP)
83+ input_common_attributes.pack(side=LEFT)
84+ scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
85+ frame_selecting_attributes.pack()
86+87+ frame_last = Frame(window)
88+ search_button = Button(frame_last, text="Start search", command=self.prepare_search)
89+ cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
90+ frame_last.pack(side=BOTTOM)
91+ search_button.pack(side=LEFT)
92+ cancel_button.pack(side=RIGHT)
93+94+ frame_name = Frame(window)
95+ frame_output_name = Frame(frame_name)
96+ label_output_name = Label(frame_output_name, text='Output file:')
97+ self.variable_output_name = StringVar()
98+ self.variable_output_name.set('results.csv')
99+ variables.update({'output_name':self.variable_output_name})
100+ self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
101+ frame_output_name.pack(side=LEFT)
102+ label_output_name.pack()
103+ self.button_output_name.pack()
104+ frame_name.pack(side=BOTTOM)
105+106+107+ frame_checkboxes = Frame(window)
108+ frame_checkbox_attributes = Frame(frame_checkboxes)
109+ variable_all_attributes = BooleanVar()
110+ variable_all_attributes.set(True)
111+ input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
112+ variable=variable_all_attributes)
113+ variables.update({"all_attributes": variable_all_attributes})
114+ frame_checkbox_attributes.pack(side=LEFT)
115+ input_all_attributes.pack()
116+117+ frame_logging = Frame(frame_checkboxes)
118+ variable_logging = BooleanVar()
119+ variable_logging.set(False)
120+ input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
121+ variables.update({'logging':variable_logging})
122+ frame_logging.pack(side=RIGHT)
123+ frame_checkboxes.pack(side=BOTTOM)
124+ input_logging.pack()
125+ frame_all_attributes.pack()
126+127+ return window, variables
128+129+ def prepare_search(self):
130+ """Saves the values from the window for later retrieval."""
131+ variables = self.variables
132+ values = {}
133+134+ values.update({"Always attributes": self.load_always_attributes()})
135+ for name, var in variables.iteritems():
136+ if var.__class__ is StringVar:
137+ values.update({name: var.get()})
138+ elif var.__class__ is BooleanVar:
139+ values.update({name: var.get()})
140+ elif var.__class__ is Text:
141+ values.update({name: str(var.get("1.0", END)).strip()})
142+ elif var.__class__ is Listbox:
143+ values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
144+ else:
145+ print "No known class, {}, {}".format(name, var)
146+147+ values.update({'output_name':self.variable_output_name.get()})
148+ values.update({'output_type':self.check_output_type(values.get('output_name'))})
149+150+ self.values = values
151+ if all([values.get(i) != '' for i in self.required_variables]):
152+ self.finish_with_search = True
153+ self.window.destroy()
154+ else:
155+ self.finish_with_search = False
156+ #tkMessageBox.showinfo('Not all required information was entered!')
157+158+ def execute_search(self):
159+ """Calls the Fourmi crawler with the values from the GUI"""
160+ if self.values.get('all_attributes'):
161+ attributes = ".*"
162+ else:
163+ attribute_types = ['new_attributes', 'common_attributes', 'Always attributes']  # keys used in self.values
164+ attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
165+ output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths
166+167+ arguments = {'--attributes': attributes,
168+ '--exclude': None,
169+ '--format': self.values.get('output_type'),
170+ '--help': False,
171+ '--include': None,
172+ '--log': 'log.txt',
173+ '--output': output_file,
174+ '-v': 0 if self.values.get('logging') else 3,
175+ '--version': False,
176+ '<compound>': self.values.get('substance'),
177+ 'list': False,
178+ 'search': True}
179+180+ self.search(arguments, self.sourceloader)
181+182+ def run(self):
183+ """Starts the window and the search."""
184+ self.window.mainloop()
185+ if self.finish_with_search:
186+ self.execute_search()
187+188+ def check_output_type(self, filename):
189+ parts = str(filename).split('.')
190+ output_types = self.load_output_types()
191+ extension = parts[-1]
192+193+ for type in output_types:
194+ if extension==type:
195+ return extension
196+ return output_types[0]
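A short sketch of how this window is meant to be launched; `search` here stands in for the crawl function that the CLI entry point passes in, which receives the docopt-style arguments dictionary and a source loader:

```python
def search(arguments, sourceloader):
    # Stand-in for the real crawl function supplied by the CLI entry point.
    print arguments

gui = GUI(search)  # reads the GUI.cfg configuration (sourceloader defaults to None)
gui.run()          # blocks in the Tk main loop, then executes the search
```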
+10
GUI.cfg.sample
···1+[GUI]
2+# Personalize options in your User Interface
3+4+# Commonly used parameters are listed in the GUI for easy selection
5+CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
6+7+# Parameters that are always used in the search
8+AlwaysParameters = Name
9+10+OutputTypes = csv, json, jsonlines, xml
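The GUI reads these comma-separated options through its ConfigImporter (load_common_attributes, load_always_attributes and load_output_types, as used above). A minimal sketch of the same read using only the Python 2 standard library, making no assumption about the project's own configImporter:

```python
import ConfigParser  # Python 2 standard library module

config = ConfigParser.ConfigParser()
config.read('GUI.cfg')

# Split the comma-separated option values into clean lists, as the GUI does.
common = [x.strip() for x in config.get('GUI', 'CommonParameters').split(',')]
always = [x.strip() for x in config.get('GUI', 'AlwaysParameters').split(',')]
output_types = [x.strip() for x in config.get('GUI', 'OutputTypes').split(',')]
```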
+21
LICENSE
···1+The MIT License (MIT)
2+3+Copyright (c) 2014 Ivo B. Rietveld
4+5+Permission is hereby granted, free of charge, to any person obtaining a copy
6+of this software and associated documentation files (the "Software"), to deal
7+in the Software without restriction, including without limitation the rights
8+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+copies of the Software, and to permit persons to whom the Software is
10+furnished to do so, subject to the following conditions:
11+12+The above copyright notice and this permission notice shall be included in all
13+copies or substantial portions of the Software.
14+15+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+SOFTWARE.
README.md
1+# Fourmi
2+3+**Master branch**: [Build Status](https://travis-ci.org/jjdekker/Fourmi) [Coverage Status](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
4+5+**Developing branch**: [Build Status](https://travis-ci.org/jjdekker/Fourmi) [Coverage Status](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
6+7+Fourmi is a web scraper for chemical substances. The program is designed to be
8+used as a search engine to search multiple chemical databases for a specific
9+substance. The program will produce all available attributes of the substance
10+and conditions associated with the attributes. Fourmi also attempts to estimate
11+the reliability of each data point to assist the user in deciding which data
12+should be used.
13+14+The Fourmi project is an open source project licensed under the MIT license. Feel
15+free to contribute!
16+17+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
18+web scraping framework for python. Most of the functionality of this project can
19+be traced to this framework. Should the documentation for this application fall
20+short, we suggest you take a close look at the [Scrapy architecture]
21+(http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy
22+documentation](http://doc.scrapy.org/en/latest/index.html).
23+24+### Installing
25+26+If you're installing Fourmi, please take a look at our installation guides
27+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
28+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
29+30+### Using the Source
31+32+To use the Fourmi source code multiple dependencies are required. Take a look at
33+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our step by step
34+installation guide.
35+36+When developing for the Fourmi project keep in mind that code readability is a
37+must. To maintain the readability, code should be conform with the
38+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
39+code. More information about the different structures and principles of the
40+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
41+42+### To Do
43+44+The Fourmi project has the following goals for the nearby future:
45+46+__Main goals:__
47+48+- Build a graphical user interface (GUI) as an alternative to the command line
49+interface (CLI). (Assignee: Harmen)
50+- Compile the source into a Windows executable. (Assignee: Bas)
51+52+__Side goals:__
53+54+- Clean and unify data.
55+- Extensive reliability analysis using statistical tests.
56+- Test data with Descartes 1.
57+58+### Project Origin
59+60+The Fourmi project was started in February of 2014 as part of a software
61+engineering course at the Radboud University for students studying Computer
62+Science, Information Science or Artificial Intelligence. Students participate in
63+a real software development project as part of the
64+[Giphouse](http://www.giphouse.nl/).
65+66+This particular project was started on behalf of Ivo B. Rietveld. As a chemist,
67+he was in need of an application to automatically search for information on chemical
68+substances and create a phase diagram. The so-called "Descartes" project was
69+split into two teams, each creating a different application covering part of the
70+functionality. We are team Descartes 2, and as we were responsible for
71+creating a web crawler, we've named our application Fourmi (English: ant).
72+73+The following people were part of the original team:
74+75+- [Jip J. Dekker](http://jip.dekker.li)
76+- Rob ten Berge
77+- Harmen Prins
78+- Bas van Berkel
79+- Nout van Deijck
80+- Michail Kuznetcov
-16
README.rst
···1-We are the team Descartes 2.
2-----------------------------
3-4-Our team members are:
5-6-+ Rob ten Berge
7-8-+ Bas van Berkel
9-10-+ Nout van Deijck
11-12-+ Jip J. Dekker
13-14-+ Michail Kuznetcov
15-16-+ Harmen Prins
+89
fourmi.py
···1+#!/usr/bin/env python
2+"""
3+Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
4+5+Usage:
6+ fourmi
7+ fourmi search <compound>
8+ fourmi [options] search <compound>
9+ fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
10+ fourmi list
11+ fourmi [--include=<sourcename> | --exclude=<sourcename>] list
12+ fourmi -h | --help
13+ fourmi --version
14+15+Options:
16+ --attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
17+ -h --help Show this screen.
18+ --version Show version.
19+ -v Verbose logging output. (Multiple occurrences increase logging level)
20+ --log=<file> Save the log to a file.
21+ -o <file> --output=<file> Output file [default: <compound>.*format*]
22+ -f <format> --format=<format> Output format (supported: csv, json, jsonlines, xml) [default: csv]
23+ --include=<regex> Include only sources that match these regular expressions split by a comma.
24+ --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
25+"""
26+27+from twisted.internet import reactor
28+from scrapy.crawler import Crawler
29+from scrapy import signals, log
30+import docopt
31+32+from FourmiCrawler.spider import FourmiSpider
33+from utils.configurator import Configurator
34+from utils.sourceloader import SourceLoader
35+from GUI import gui
36+37+38+def setup_crawler(compound, settings, source_loader, attributes):
39+ """
40+ This function prepares and starts the crawler, which performs the actual search on the internet.
41+ :param compound: The compound that should be searched for
42+ :param settings: A Scrapy settings object
43+ :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
44+ :param attributes: A list of regular expressions which the attribute names should match.
45+ """
46+ spider = FourmiSpider(compound=compound, selected_attributes=attributes)
47+ spider.add_sources(source_loader.sources)
48+ crawler = Crawler(settings)
49+ crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
50+ crawler.configure()
51+ crawler.crawl(spider)
52+ crawler.start()
53+54+55+def search(docopt_arguments, source_loader):
56+ """
57+ The function that facilitates the search for a specific compound.
58+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
59+ :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
60+ """
61+ conf = Configurator()
62+ conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
63+ conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
64+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
65+ source_loader, docopt_arguments["--attributes"].split(','))
66+ if conf.scrapy_settings.getbool("LOG_ENABLED"):
67+ log.start(conf.scrapy_settings.get("LOG_FILE"),
68+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
69+ reactor.run()
70+71+72+# The start for the Fourmi Command Line interface.
73+if __name__ == '__main__':
74+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
75+ loader = SourceLoader()
76+77+ if arguments["--include"]:
78+ loader.include(arguments["--include"].split(','))
79+ elif arguments["--exclude"]:
80+ loader.exclude(arguments["--exclude"].split(','))
81+82+ if arguments["search"]:
83+ search(arguments, loader)
84+ elif arguments["list"]:
85+ print "-== Available Sources ==-"
86+ print str(loader)
87+ else:
88+ gui_window = gui.GUI(search, sourceloader=SourceLoader())
89+ gui_window.run()
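For reference, a minimal sketch (not part of the diff) of how docopt parses a typical invocation against the usage string above; it assumes the snippet runs inside fourmi.py so that __doc__ is the module docstring shown, and the expected values follow from the [default: ...] annotations in the Options block.

```python
# Minimal sketch (illustrative): parsing "fourmi search Methane" with the docstring above.
import docopt

args = docopt.docopt(__doc__, argv=["search", "Methane"], version="Fourmi - V0.6.0")
# Expected, given the usage patterns and option defaults above:
#   args["search"]      -> True
#   args["<compound>"]  -> "Methane"
#   args["--format"]    -> "csv"                  (default)
#   args["--output"]    -> "<compound>.*format*"  (default; expanded by Configurator.set_output)
```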
+19
sources.cfg.sample
···0000000000000000000
···1+[DEFAULT]
2+reliability = Unknown
3+4+#For each source listed in FourmiCrawler/sources there should be a section
5+#named exactly as the filename in here. If not present, the DEFAULT value is
6+#used for reliability of that source.
7+8+[ChemSpider]
9+reliability = High
10+#token=Paste your ChemSpider API token here and remove the leading '#'
11+12+[NIST]
13+reliability = High
14+15+[WikipediaParser]
16+reliability = Medium
17+18+[PubChem]
19+reliability = High
+101
utils/configurator.py
···1+import ConfigParser
2+import os
3+import shutil
4+5+from scrapy.utils.project import get_project_settings
6+7+8+class Configurator:
9+ """
10+ A helper class for the Fourmi applications. This class is used to process the settings as set
11+ by one of the Fourmi interfaces (CLI or GUI).
12+ """
13+14+ def __init__(self):
15+ self.scrapy_settings = get_project_settings()
16+17+ def set_output(self, filename, fileformat, compound):
18+ """
19+ This function manipulates the Scrapy output file settings that normally would be set in the settings file.
20+ In the Fourmi project these are command line arguments.
21+ :param filename: The filename of the file where the output will be put.
22+ :param fileformat: The format in which the output will be.
23+ """
24+25+ if filename != '<compound>.*format*':
26+ self.scrapy_settings.overrides["FEED_URI"] = filename
27+ elif fileformat == "jsonlines":
28+ self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
29+ elif fileformat is not None:
30+ self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
31+32+ if fileformat is not None:
33+ self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
34+35+ def set_logging(self, logfile=None, verbose=0):
36+ """
37+ This function changes the default settings of Scrapy's logging functionality
38+ using the settings given by the CLI.
39+ :param logfile: The location where the logfile will be saved.
40+ :param verbose: An integer value to switch between log levels.
41+ """
42+ if verbose != 0:
43+ self.scrapy_settings.overrides["LOG_ENABLED"] = True
44+ else:
45+ self.scrapy_settings.overrides["LOG_ENABLED"] = False
46+47+ if verbose == 1:
48+ self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
49+ elif verbose == 2:
50+ self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
51+ else:
52+ self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
53+54+ if verbose > 1:
55+ self.scrapy_settings.overrides["LOG_STDOUT"] = False
56+ else:
57+ self.scrapy_settings.overrides["LOG_STDOUT"] = True
58+59+ if logfile is not None:
60+ self.scrapy_settings.overrides["LOG_FILE"] = logfile
61+ else:
62+ self.scrapy_settings.overrides["LOG_FILE"] = None
63+64+ @staticmethod
65+ def read_sourceconfiguration():
66+ """
67+ This function reads sources.cfg in the main folder for configuration
68+ variables for the sources.
69+ :return: a ConfigParser object of sources.cfg
70+ """
71+ current_dir = os.path.dirname(os.path.abspath(__file__))
72+ config_path = current_dir + '/../sources.cfg'
73+ # [TODO]: location of sources.cfg should be softcoded eventually
74+ if not os.path.isfile(config_path):
75+ try:
76+ shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
77+ except IOError:
78+ print "WARNING: Source configuration couldn't be found and couldn't be created."
79+ config = ConfigParser.ConfigParser()
80+ config.read(config_path)
81+ return config
82+83+ @staticmethod
84+ def get_section(config, sourcename):
85+ """
86+ This function reads the config section named by sourcename and
87+ tests whether the reliability variable is set; if not, it is set to an empty string.
88+ The DEFAULT section is returned if the named config section does not exist.
89+ :param config: a ConfigParser object
90+ :param sourcename: the name of the section to be read
91+ :return: a dictionary of the section in the config named by sourcename
92+ """
93+ section = dict()
94+ if config.has_section(sourcename):
95+ section = dict(config.items(sourcename))
96+ elif config.defaults():
97+ section = config.defaults()
98+ if 'reliability' not in section:
99+ print 'WARNING: Reliability not set for %s' % sourcename
100+ section['reliability'] = ''
101+ return section
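A minimal usage sketch (not part of the diff) of the two static helpers above; "SomeNewSource" is a hypothetical source name used only to show the fallback, and the reliability values assume a sources.cfg that matches the sample earlier in this diff.

```python
# Minimal sketch (illustrative): how the Configurator helpers resolve source settings.
from utils.configurator import Configurator

config = Configurator.read_sourceconfiguration()

chemspider = Configurator.get_section(config, "ChemSpider")
print chemspider["reliability"]   # "High" with the sample sources.cfg

missing = Configurator.get_section(config, "SomeNewSource")   # hypothetical, unlisted source
print missing["reliability"]      # falls back to the [DEFAULT] value, "Unknown"
```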
+64
utils/sourceloader.py
···1+import inspect
2+import os
3+import re
4+5+from FourmiCrawler.sources.source import Source
6+from utils.configurator import Configurator
7+8+9+class SourceLoader:
10+ sources = []
11+12+ def __init__(self, rel_dir="../FourmiCrawler/sources"):
13+ """
14+ The initiation of a SourceLoader selects and indexes a directory for usable sources.
15+ It also loads a configuration file for the sources and passes the arguments in
16+ the named section to each source.
17+ :param rel_dir: A relative path to a directory.
18+ """
19+ path = os.path.dirname(os.path.abspath(__file__))
20+ path += "/" + rel_dir
21+ known_parser = set()
22+23+ config = Configurator.read_sourceconfiguration()
24+25+ for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
26+ mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
27+ classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
28+ for cls in classes:
29+ if issubclass(cls, Source) and cls not in known_parser:
30+ sourcecfg = Configurator.get_section(config, cls.__name__)
31+ self.sources.append(cls(sourcecfg))
32+ known_parser.add(cls)
33+34+ def include(self, source_names):
35+ """
36+ This function keeps only the sources whose class names match the given regular expressions; all others are excluded.
37+ :param source_names: A list of regular expressions (strings)
38+ """
39+ new = set()
40+ for name in source_names:
41+ new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
42+ self.sources = list(new)
43+44+ def exclude(self, source_names):
45+ """
46+ This function excludes all sources that match the given regular expressions.
47+ :param source_names: A list of regular expressions (strings)
48+ """
49+ exclude = []
50+ for name in source_names:
51+ exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
52+ self.sources = [src for src in self.sources if src not in exclude]
53+54+ def __str__(self):
55+ """
56+ This function returns a string with all sources currently available in the SourceLoader.
57+ :return: a string with all available sources.
58+ """
59+ string = ""
60+ for src in self.sources:
61+ string += "Source: " + src.__class__.__name__
62+ string += " - "
63+ string += "URI: " + src.website + "\n"
64+ return string
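Finally, a minimal sketch (not part of the diff) of the include/exclude filtering, mirroring how fourmi.py drives the loader; the source names follow sources.cfg.sample, and the patterns are matched against source class names with re.match.

```python
# Minimal sketch (illustrative): filtering sources as the CLI's --include option does.
from utils.sourceloader import SourceLoader

loader = SourceLoader()
loader.include(["ChemSpider", "NIST"])   # keep only sources whose class names match these patterns
print str(loader)                         # e.g. "Source: ChemSpider - URI: http://..."
```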