.gitignore (-5)

.travis.yml (+2, -10)
···
 language: python
 python: 2.7

-before_install:
-- "export DISPLAY=:99.0"
-- "sh -e /etc/init.d/xvfb start"
-
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
 - pip install Scrapy docopt
-- pip install coveralls

 # command to run tests, e.g. python setup.py test
 script:
-- nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
+- nosetests tests

 notifications:
-slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
-
-after_success:
-coveralls --verbose
+slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
Changelog.md (-20)

-### v0.6.0
-- Feature: Added a Graphical User interface
-- Feature: Automatic config file createion from config samples
-- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
-- FIX: A lot of bugfixes of the PubChem plugin, as is wasn't working as it should
-- FIX: Using absolute path for configuration files
-- DEV: General Code cleanup in documentation
-
-### v0.5.3
-- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
-- FIX: Logging is now "actually" disabled if not using the verbose option.
-- FEATURE: Added support for PubChem
-
-### v0.5.2
-- FIX: Signatured used to contain untracked and older files, current signature
-  should be correct.
-
-### v0.5.1
-- UPDATED: Logging functionality from command line
-- DEV: Code cleanup and extra tests
FourmiCrawler/settings.py (+2, -1)
FourmiCrawler/sources/ChemSpider.py (+60, -127)
···
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
 
+
 class ChemSpider(Source):
-    """
-    ChemSpider scraper for synonyms and properties
+    """ChemSpider scraper for synonyms and properties
+
     This parser will manage searching for chemicals through the
     ChemsSpider API, and parsing the resulting ChemSpider page.
     The token required for the API should be in a configuration file
     somewhere.
     """
 
-    website = 'http://www\\.chemspider\\.com/.*'
-
-    search = 'Search.asmx/SimpleSearch?query=%s&token='
+    def __init__(self):
+        Source.__init__(self)
+
+    website = 'http://www.chemspider.com/*'
+
+    # [TODO] - Save and access token of specific user.
+    search = ('Search.asmx/SimpleSearch?query=%s&token='
+              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
     structure = 'Chemical-Structure.%s.html'
-    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
 
-    def __init__(self, config=None):
-        """
-        Initialization of ChemSpider scraper
-        :param config: a dictionary of settings for this scraper, must contain
-        'reliability' key
-        """
-        Source.__init__(self, config)
-        self.ignore_list = []
-        if 'token' not in self.cfg or self.cfg['token'] == '':
-            log.msg('ChemSpider token not set or empty, search/MassSpec API '
-                    'not available', level=log.WARNING)
-            self.cfg['token'] = ''
-        self.search += self.cfg['token']
-        self.extendedinfo += self.cfg['token']
+    ignore_list = []
 
     def parse(self, response):
-        """
-        This function is called when a Response matching the variable
-        'website' is available for parsing the Response object.
-        :param response: the Scrapy Response object to be parsed
-        :return: a list of Result items and Request objects
-        """
         sel = Selector(response)
         requests = []
         requests_synonyms = self.parse_synonyms(sel)
···
 
         return requests
 
-    def parse_properties(self, sel):
-        """
-        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
-        :param sel: a Selector object of the whole page
-        :return: a list of Result items
-        """
-        properties = []
-
-        properties.extend(self.parse_acdlabstab(sel))
-        properties.extend(self.parse_experimentaldatatab(sel))
-
-        return properties
-
-    def parse_acdlabstab(self, sel):
-        """
-        This function scrapes the 'Predicted ACD/Labs tab' under Properties
-        :param sel: a Selector object of the whole page
-        :return: a list of Request objects
-        """
+    @staticmethod
+    def parse_properties(sel):
+        """scrape Experimental Data and Predicted ACD/Labs tabs"""
         properties = []
 
+        # Predicted - ACD/Labs tab
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
···
             prop_conditions = ''
 
             # Test for properties without values, with one hardcoded exception
-            if (not re.match(r'^\d', prop_value) or
-                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
+            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                 continue
 
+            # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
                 prop_name = m.group(1)
                 prop_conditions = m.group(2)
 
+            # Match for condition in value seperated by an 'at'
             m = re.match(r'(.*) at (.*)', prop_value)
             if m:
                 prop_value = m.group(1)
                 prop_conditions = m.group(2)
 
-            new_prop = self.newresult(
-                attribute=prop_name,
-                value=prop_value,
-                source='ChemSpider Predicted - ACD/Labs Tab',
-                conditions=prop_conditions
-            )
+            new_prop = Result({
+                'attribute': prop_name,
+                'value': prop_value,
+                'source': 'ChemSpider Predicted - ACD/Labs Tab',
+                'reliability': 'Unknown',
+                'conditions': prop_conditions
+            })
             properties.append(new_prop)
+            log.msg('CS prop: |%s| |%s| |%s|' %
+                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
+                    level=log.DEBUG)
 
-        return properties
-
-    def parse_experimentaldatatab(self, sel):
-        """
-        This function scrapes Experimental Data tab, Physico-chemical
-        properties in particular.
-        :param sel: a Selector object of the whole page
-        :return: a list of Result items
-        """
-        properties = []
-
+        # Experimental Data Tab, Physico-chemical properties in particular
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                  'Properties"]//li/table/tr/td')
         if not scraped_list:
···
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = self.newresult(
-                    attribute=property_name[:-1],
-                    value=line.xpath('text()').extract()[0].rstrip(),
-                    source=line.xpath('strong/text()').extract()[0].rstrip(),
-                )
-                properties.append(new_prop)
+                new_prop = Result({
+                    'attribute': property_name[:-1],
+                    'value': line.xpath('text()').extract()[0].rstrip(),
+                    'source': line.xpath(
+                        'strong/text()').extract()[0].rstrip(),
+                    'reliability': 'Unknown',
+                    'conditions': ''
+                })
+                properties.append(new_prop)
+                log.msg('CS prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
 
         return properties
 
     def parse_synonyms(self, sel):
-        """
-        This function scrapes the list of Names and Identifiers
-        :param sel: a Selector object of the whole page
-        :return: a list of Requests
-        """
+        """Scrape list of Names and Identifiers"""
         requests = []
         synonyms = []
···
         return requests
 
     def new_synonym(self, sel, name, category):
-        """
-        This function scrapes for a single synonym at a given HTML tag
-        :param sel: a Selector object of the given HTML tag
-        :param name: the name of the synonym in the tag
-        :param category: the name of the category the synonym is labeled as
-        :return: a dictionary containing data on the synonym
-        """
+        """Scrape for a single synonym at a given HTML tag"""
         self.ignore_list.append(name)
         language = sel.xpath('span[@class="synonym_language"]/text()')
         if language:
···
         }
         return synonym
 
-    def parse_extendedinfo(self, response):
-        """
-        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
-        API, if a token is present in the configuration settings
-        :param response: a Response object to be parsed
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_extendedinfo(response):
+        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
         values = sel.xpath('*').xpath('text()').extract()
         for (name, value) in zip(names, values):
-            result = self.newresult(
-                attribute=name,
-                value=value,  # These values have no unit!
-                source='ChemSpider ExtendedCompoundInfo',
-            )
+            result = Result({
+                'attribute': name,
+                'value': value,  # These values have no unit!
+                'source': 'ChemSpider ExtendedCompoundInfo',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
             if result['value']:
                 properties.append(result)
         return properties
 
-    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        """
-        This function abstracts from the Result item and provides default
-        values.
-        :param attribute: the name of the attribute
-        :param value: the value of the attribute
-        :param conditions: optional conditions regarding the value
-        :param source: the name of the source if it is not ChemSpider
-        :return: A Result item
-        """
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': source,
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
-        })
-
     def parse_searchrequest(self, response):
-        """
-        This function parses the initial response of the ChemSpider Search API
-        Requires a valid token to function.
-        :param response: the Response object to be parsed
-        :return: A Request for the information page and a Request for the
-        extendedinfo API call
-        """
+        """Parse the initial response of the ChemSpider Search API """
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
···
             log.msg('ChemSpider found multiple substances, taking first '
                     'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
-        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
+        structure_url = self.website[:-1] + self.structure % csid
+        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
···
                         callback=self.parse_extendedinfo)]
 
     def new_compound_request(self, compound):
-        """
-        This function is called when a new synonym is returned to the spider
-        to generate new requests
-        :param compound: the name of the compound to search for
-        """
-        if compound in self.ignore_list or self.cfg['token'] == '':
+        if compound in self.ignore_list:  # [TODO] - add regular expression
             return None
-        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
+        searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
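
For reviewers: after this change the `website` string does double duty — it is fed to `re.match` to claim responses (see `FourmiSpider.parse`), and `website[:-1]` is reused as the base for building request URLs. A standalone sketch of both uses (my example, not part of the patch):

```python
# How the glob-ish pattern behaves: '.' is a regex wildcard and '/*' means
# "zero or more slashes", so re.match still accepts ChemSpider URLs, while
# website[:-1] simply drops the trailing '*' to recover the base URL.
import re

website = 'http://www.chemspider.com/*'
search = 'Search.asmx/SimpleSearch?query=%s&token='

url = 'http://www.chemspider.com/Chemical-Structure.2157.html'
assert re.match(website, url)            # pattern claims this response
print(website[:-1] + search % 'water')   # URL for one search request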
FourmiCrawler/sources/NIST.py (+83, -141)
···
 # Result item, but should be included eventually.
 
 class NIST(Source):
-    """
-    NIST Scraper plugin
+    """NIST Scraper plugin
+
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook\\.nist\\.gov/.*"
+    website = "http://webbook.nist.gov/*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    def __init__(self, config=None):
-        """
-        Initialization of NIST scraper
-        :param config: configuration variables for this scraper, must contain
-        'reliability' key.
-        """
-        Source.__init__(self, config)
-        self.ignore_list = set()
+    ignore_list = set()
+
+    def __init__(self):
+        Source.__init__(self)
 
     def parse(self, response):
-        """
-        This function is called when a Response matching the variable
-        'website' is available for parsing the Response object.
-        :param response: The Scrapy Response object to be parsed
-        :return: a list of Result items and Request objects
-        """
         sel = Selector(response)
 
         title = sel.xpath('head/title/text()').extract()[0]
···
         log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                 level=log.DEBUG)
 
-        requests.extend(self.parse_tables(sel, symbol_table))
-
-        return requests
-
-    def parse_tables(self, sel, symbol_table):
-        """
-        This function identifies and distributes parsing of tables to other
-        functions below.
-        :param sel: A Selector object of the whole page
-        :param symbol_table: a dictionary containing translations of raw HTML
-        tags to human readable names
-        :return: a list of Result items and Requests
-        """
-        requests = []
-
         for table in sel.xpath('//table[@class="data"]'):
             summary = table.xpath('@summary').extract()[0]
             if summary == 'One dimensional data':
···
         return requests
 
     def parse_generic_info(self, sel):
-        """
-        This function parses: synonyms, chemical formula, molecular weight,
-        InChI, InChiKey, CAS number
-        :param sel: A Selector object of the entire page in the original
-        response
-        :return: a list of Result items
+        """Parses: synonyms, chemical formula, molecular weight, InChI,
+        InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
+        li = ul.xpath('li')
 
         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
···
 
         requests = []
         for key, value in data.iteritems():
-            result = self.newresult(
-                attribute=key,
-                value=value
-            )
+            result = Result({
+                'attribute': key,
+                'value': value,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
             requests.append(result)
 
         return requests
 
     def parse_aggregate_data(self, table, symbol_table):
-        """
-        This function parses the table(s) which contain possible links to
-        individual data points
-        :param table: a Selector object of the table to be parsed
-        :param symbol_table: a dictionary containing translations of raw HTML
-        tags to human readable names
-        :return: a list of Result items and Request objects
+        """Parses the table(s) which contain possible links to individual
+        data points
         """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
+                request = Request(url=self.website[:-1] + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
···
                 name = m.group(1)
                 condition = m.group(2)
 
-            result = self.newresult(
-                attribute=name,
-                value=data[1] + ' ' + data[2],
-                conditions=condition
-            )
+            result = Result({
+                'attribute': name,
+                'value': data[1] + ' ' + data[2],
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': condition
+            })
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
         return results
 
-    def parse_transition_data(self, table, summary):
-        """
-        This function parses the table containing properties regarding phase
-        changes
-        :param table: a Selector object of the table to be parsed
-        :param summary: the name of the property
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_transition_data(table, summary):
+        """Parses the table containing properties regarding phase changes"""
         results = []
 
-        unit = self.get_unit(table)
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = self.newresult(
-                attribute=summary,
-                value=tds[0] + ' ' + unit,
-                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            )
+            result = Result({
+                'attribute': summary,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            })
             results.append(result)
 
         return results
 
-    def parse_generic_data(self, table, summary):
-        """
-        Parses the common tables of 4 and 5 rows. Assumes they are of the
+    @staticmethod
+    def parse_generic_data(table, summary):
+        """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
         Symbol (unit)|Temperature (K)|Reference|Comment
-        :param table: a Selector object of the table to be parsed
-        :param summary: the name of the property
-        :return: a list of Result items
         """
         results = []
 
-        unit = self.get_unit(table)
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = self.newresult(
-                attribute=summary,
-                value=tds[0] + ' ' + unit,
-                conditions='%s K' % tds[1]
-            )
+            result = Result({
+                'attribute': summary,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[1]
+            })
             results.append(result)
         return results
 
-    def parse_antoine_data(self, table, summary):
-        """
-        This function parses the table containing parameters for the Antione
-        equation
-        :param table: a Selector object of the table to be parsed
-        :param summary: the name of the property
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_antoine_data(table, summary):
+        """Parse table containing parameters for the Antione equation"""
         results = []
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = self.newresult(
-                attribute=summary,
-                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                conditions='%s K' % tds[0]
-            )
+            result = Result({
+                'attribute': summary,
+                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[0]
+            })
             results.append(result)
 
         return results
 
-    def parse_individual_datapoints(self, response):
-        """
-        This function parses the 'individual data points' page linked from
-        the aggregate data table(s)
-        :param response: the Scrapy Response object to be parsed
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_individual_datapoints(response):
+        """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
···
             name = m.group(1)
             condition = m.group(2)
 
-        unit = self.get_unit(table)
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
···
             if m:
                 uncertainty = '+- %s ' % m.group(1)
             # [TODO]: get the plusminus sign working in here
-            result = self.newresult(
-                attribute=name,
-                value='%s %s%s' % (tds[0], uncertainty, unit),
-                conditions=condition
-            )
+            result = Result({
+                'attribute': name,
+                'value': '%s %s%s' % (tds[0], uncertainty, unit),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': condition
+            })
             results.append(result)
 
         return results
 
-    @staticmethod
-    def get_unit(table):
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
-
-        return unit
-
-    def newresult(self, attribute, value, conditions=''):
-        """
-        This function abstracts from the Result item and provides default
-        values
-        :param attribute: the name of the attribute
-        :param value: the value of the attribute
-        :param conditions: optional conditions regarding the value
-        :return: A Result item
-        """
-        return Result(
-            {
-                'attribute': attribute,
-                'value': value,
-                'source': 'NIST',
-                'reliability': self.cfg['reliability'],
-                'conditions': conditions
-            })
-
     def new_compound_request(self, compound):
-        """
-        This function is called when a new synonym is returned to the spider
-        to generate new requests
-        :param compound: the name of the compound to search for
-        """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
+            return Request(url=self.website[:-1] + self.search % compound,
                            callback=self.parse)
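
The removed `get_unit` helper now appears inlined in each table parser. A small illustration of what that snippet computes, with an assumed header cell (my example, not from the repo):

```python
# Pulls the unit out of the first header cell of a NIST data table,
# e.g. 'Tboil (K)' -> 'K'; '!' is the fallback marker when no unit is found.
import re

tr_unit = 'Tboil (K)'   # assumed text of tr[1]/th[1]
m = re.search(r'\((.*)\)', tr_unit)
unit = '!'
if m:
    unit = m.group(1)
print(unit)   # 'K'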
FourmiCrawler/sources/PubChem.py (-149)

-import re
-
-from scrapy.http import Request
-from scrapy import log
-from scrapy.selector import Selector
-
-from source import Source
-from FourmiCrawler.items import Result
-
-
-class PubChem(Source):
-    """ PubChem scraper for chemical properties
-
-    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
-    including sources of the values of properties.
-    """
-
-    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
-    website_www = 'http://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
-    search = 'pccompound?term=%s'
-    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
-
-    __spider = None
-    searched_compounds = set()
-
-    def __init__(self, config):
-        Source.__init__(self, config)
-        self.cfg = config
-
-    def parse(self, response):
-        """
-        Distributes the above described behaviour
-        :param response: The incoming search request
-        :return Returns the found properties if response is unique or returns none if it's already known
-        """
-        requests = []
-        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
-
-        sel = Selector(response)
-        compound = sel.xpath('//h1/text()').extract()[0]
-        if compound in self.searched_compounds:
-            return None
-
-        self.searched_compounds.update(compound)
-        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
-        for synonym in raw_synonyms.strip().split(', '):
-            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
-            self.searched_compounds.update(synonym)
-            self._spider.get_synonym_requests(synonym)
-        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
-
-        n = re.search(r'cid=(\d+)', response.url)
-        if n:
-            cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
-            # the seperate html page which contains the properties and their values
-
-            # using this cid to get the right url and scrape it
-            requests.append(
-                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
-        return requests
-
-    def parse_data(self, response):
-        """
-        Parse data found in 'Chemical and Physical properties' part of a substance page.
-        :param response: The response with the page to parse
-        :return: requests: Returns a list of properties with their values, source, etc.
-        """
-        log.msg('parsing data', level=log.DEBUG)
-        requests = []
-
-        sel = Selector(response)
-        props = sel.xpath('//div')
-
-        for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
-            if prop.xpath('a'):  # parsing for single value in property
-                prop_source = ''.join(prop.xpath('a/@title').extract())
-                prop_value = ''.join(prop.xpath('a/text()').extract())
-                new_prop = Result({
-                    'attribute': prop_name,
-                    'value': prop_value,
-                    'source': prop_source,
-                    'reliability': self.cfg['reliability'],
-                    'conditions': ''
-                })
-                log.msg('PubChem prop: |%s| |%s| |%s|' %
-                        (new_prop['attribute'], new_prop['value'],
-                         new_prop['source']), level=log.DEBUG)
-                requests.append(new_prop)
-            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
-                prop_values = prop.xpath('ul//li')
-                for prop_li in prop_values:
-                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
-                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
-                    new_prop = Result({
-                        'attribute': prop_name,
-                        'value': prop_value,
-                        'source': prop_source,
-                        'reliability': self.cfg['reliability'],
-                        'conditions': ''
-                    })
-                    log.msg('PubChem prop: |%s| |%s| |%s|' %
-                            (new_prop['attribute'], new_prop['value'],
-                             new_prop['source']), level=log.DEBUG)
-                    requests.append(new_prop)
-
-        return requests
-
-    def parse_searchrequest(self, response):
-        """
-        This function parses the response to the new_compound_request Request
-        :param response: the Response object to be parsed
-        :return: A Request for the compound page or what self.parse returns in
-        case the search request forwarded to the compound page
-        """
-
-        # check if pubchem forwarded straight to compound page
-        m = re.match(self.website_pubchem, response.url)
-        if m:
-            log.msg('PubChem search forwarded to compound page',
-                    level=log.DEBUG)
-            return self.parse(response)
-
-        sel = Selector(response)
-
-        results = sel.xpath('//div[@class="rsltcont"]')
-        if results:
-            url = results[0].xpath('div/p/a[1]/@href')
-        else:
-            log.msg('PubChem search found nothing or xpath failed',
-                    level=log.DEBUG)
-            return None
-
-        if url:
-            url = 'http:' + ''.join(url[0].extract())
-            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
-        else:
-            log.msg('PubChem search found results, but no url in first result',
-                    level=log.DEBUG)
-            return None
-
-        return Request(url=url, callback=self.parse)
-
-    def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound,
-                       callback=self.parse_searchrequest)
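
Since the whole module is deleted, for the record: its search flow hinged on pulling the PubChem compound id (cid) out of the response URL and building the properties page from it. A runnable sketch of that step (my example; the URL shape is assumed):

```python
# Extracts the cid query parameter and rebuilds the properties-page URL the
# way the deleted source did (website_pubchem base + data_url % cid).
import re

url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=2244'  # assumed
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

n = re.search(r'cid=(\d+)', url)
if n:
    cid = n.group(1)   # '2244'
    print('http://pubchem.ncbi.nlm.nih.gov/' + data_url % cid)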
···
+43
-93
FourmiCrawler/sources/WikipediaParser.py
+43
-93
FourmiCrawler/sources/WikipediaParser.py
···
12
""" Wikipedia scraper for chemical properties
13
14
This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
15
-
It also returns requests with other external sources which contain information on parsed subject.
16
"""
17
18
-
website = "http://en\\.wikipedia\\.org/wiki/.*"
19
__spider = None
20
searched_compounds = []
21
22
-
def __init__(self, config=None):
23
-
Source.__init__(self, config)
24
25
def parse(self, response):
26
-
"""
27
-
Distributes the above described behaviour
28
-
:param response: The incoming search request
29
-
:return: Returns the found properties if response is unique or returns none if it's already known
30
-
"""
31
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
32
sel = Selector(response)
33
compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
···
39
return items
40
41
def parse_infobox(self, sel):
42
-
"""
43
-
Scrape data from infobox on wikipedia.
44
-
45
-
Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and
46
-
:param sel: The selector with the html-information of the page to parse
47
-
:return: item_list: Returns a list of properties with their values, source, etc..
48
-
"""
49
-
50
items = []
51
52
-
# scrape the chembox (wikipedia template)
53
-
items = self.parse_chembox(sel, items)
54
55
-
# scrape the drugbox (wikipedia template)
56
-
items = self.parse_drugbox(sel, items)
57
58
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
59
item_list = self.clean_items(items)
···
77
78
return item_list
79
80
-
def parse_chembox(self, sel, items):
81
-
"""
82
-
Scrape data from chembox infobox on wikipedia.
83
-
84
-
:param sel: The selector with the html-information of the page to parse
85
-
:param items: the list of items where the result have to be stored in
86
-
:return: items: the list of items with the new found and stored items
87
-
"""
88
-
tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
89
-
xpath('normalize-space(string())')
90
-
prop_names = tr_list[::2]
91
-
prop_values = tr_list[1::2]
92
-
for i, prop_name in enumerate(prop_names):
93
-
item = self.newresult(
94
-
attribute=prop_name.extract().encode('utf-8'),
95
-
value=prop_values[i].extract().encode('utf-8')
96
-
)
97
-
items.append(item)
98
-
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
99
-
return items
100
-
101
-
def parse_drugbox(self, sel, items):
102
-
"""
103
-
Scrape data from drugbox infobox on wikipedia.
104
-
105
-
:param sel: The selector with the html-information of the page to parse
106
-
:param items: the list of items where the result have to be stored in
107
-
:return: items: the list of items with the new found and stored items
108
-
"""
109
-
tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
110
-
log.msg('dit: %s' % tr_list2, level=log.DEBUG)
111
-
for tablerow in tr_list2:
112
-
log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
113
-
if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
114
-
'normalize-space(string())'):
115
-
item = self.newresult(
116
-
attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
117
-
value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
118
-
)
119
-
items.append(item)
120
-
log.msg(
121
-
'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
122
-
level=log.DEBUG)
123
-
return items
124
-
125
def new_compound_request(self, compound):
126
-
return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
127
128
@staticmethod
129
def clean_items(items):
130
-
131
-
"""
132
-
Clean up properties using regex, makes it possible to split the values from the units
133
-
134
-
Almost not in use, only cleans J/K/mol values and boiling/melting points.
135
-
136
-
:param items: List of properties with their values, source, etc..
137
-
:return: items: List of now cleaned up items
138
-
"""
139
for item in items:
140
value = item['value']
141
m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
···
148
149
@staticmethod
150
def get_identifiers(sel):
151
-
"""
152
-
Find external links, named 'Identifiers' to different sources.
153
-
154
-
:param sel: The selector with the html-information of the page to parse
155
-
:return: links: New links which can be used to expand the crawlers search
156
-
"""
157
links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
158
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
159
-
return links
160
-
161
-
def newresult(self, attribute, value):
162
-
return Result(
163
-
{
164
-
'attribute': attribute,
165
-
'value': value,
166
-
'source': 'Wikipedia',
167
-
'reliability': self.cfg['reliability'],
168
-
'conditions': ''
169
-
})
···
12
""" Wikipedia scraper for chemical properties
13
14
This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
15
+
It also returns requests with other external sources which contain information on parsed subject.
16
"""
17
18
+
website = "http://en.wikipedia.org/wiki/*"
19
__spider = None
20
searched_compounds = []
21
22
+
def __init__(self):
23
+
Source.__init__(self)
24
25
def parse(self, response):
26
+
""" Distributes the above described behaviour """
27
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
28
sel = Selector(response)
29
compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
···
35
return items
36
37
def parse_infobox(self, sel):
38
+
""" scrape data from infobox on wikipedia. """
39
items = []
40
41
+
# be sure to get chembox (wikipedia template)
42
+
tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
43
+
xpath('normalize-space(string())')
44
+
prop_names = tr_list[::2]
45
+
prop_values = tr_list[1::2]
46
+
for i, prop_name in enumerate(prop_names):
47
+
item = Result({
48
+
'attribute': prop_name.extract().encode('utf-8'),
49
+
'value': prop_values[i].extract().encode('utf-8'),
50
+
'source': "Wikipedia",
51
+
'reliability': "Unknown",
52
+
'conditions': ""
53
+
})
54
+
items.append(item)
55
+
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
56
57
+
#scrape the drugbox (wikipedia template)
58
+
tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
59
+
log.msg('dit: %s' % tr_list2, level=log.DEBUG)
60
+
for tablerow in tr_list2:
61
+
log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
62
+
if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
63
+
'normalize-space(string())'):
64
+
item = Result({
65
+
'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
66
+
'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
67
+
'source': "Wikipedia",
68
+
'reliability': "Unknown",
69
+
'conditions': ""
70
+
})
71
+
items.append(item)
72
+
log.msg(
73
+
'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
74
+
level=log.DEBUG)
75
76
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
77
item_list = self.clean_items(items)
···
95
96
return item_list
97
98
def new_compound_request(self, compound):
99
+
return Request(url=self.website[:-1] + compound, callback=self.parse)
100
101
@staticmethod
102
def clean_items(items):
103
+
""" clean up properties using regex, makes it possible to split the values from the units """
104
for item in items:
105
value = item['value']
106
m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
···
113
114
@staticmethod
115
def get_identifiers(sel):
116
+
""" find external links, named 'Identifiers' to different sources. """
117
links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
118
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
119
+
return links
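
For reference, the Kelvin clean-up in `clean_items` keeps the number that follows the Fahrenheit reading in infobox text. With an assumed infobox string (my example, not from the repo):

```python
# The pattern grabs the digits (with optional decimal) right after 'F;',
# i.e. the Kelvin value in strings like '78.37 C (173.07 F; 351.52 K)'.
import re

value = '78.37 C (173.07 F; 351.52 K)'   # assumed infobox formatting
m = re.search('F;\s(\d+[\.,]?\d*)', value)
if m:
    print(m.group(1) + ' K')             # '351.52 K'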
FourmiCrawler/sources/source.py (+3, -6)
···
 
 
 class Source:
-    website = "http://something/.*"  # Regex of URI's the source is able to parse
+    website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self, config=None):
+    def __init__(self):
         """
         Initiation of a new Source
         """
-        self.cfg = {}
-        if config is not None:
-            self.cfg = config
         pass
 
     def parse(self, response):
···
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):
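
A minimal sketch (mine, not in the repo) of what a concrete source implements against this base class, echoing the commented-out request builder above; `MySource` and example.org are placeholders:

```python
from scrapy.http import Request

from source import Source


class MySource(Source):
    website = "http://example.org/*"   # pattern matched against response URLs

    def parse(self, response):
        return []                      # would return Result items / Requests

    def new_compound_request(self, compound):
        # website[:-1] drops the trailing '*' to recover the base URL
        return Request(url=self.website[:-1] + compound, callback=self.parse)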
FourmiCrawler/spider.py (+8, -12)
···
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
+    _sources = []
+    synonyms = set()
 
-    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
-        self._sources = []
-        self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        if selected_attributes is None:
-            self.selected_attributes = [".*"]
-        else:
-            self.selected_attributes = selected_attributes
+        self.selected_attributes = selected_attributes
 
     def parse(self, response):
         """
···
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
-        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None
 
-    def get_synonym_requests(self, compound, force=False):
+    def get_synonym_requests(self, compound):
         """
         A function that generates new Scrapy Request for each source given a new synonym of a compound.
         :param compound: A compound name
         :return: A list of Scrapy Request objects
         """
         requests = []
-        if force or compound not in self.synonyms:
+        if compound not in self.synonyms:
             self.synonyms.add(compound)
             for parser in self._sources:
                 parser_requests = parser.new_compound_request(compound)
···
         """
         requests = []
         for synonym in self.synonyms:
-            requests.extend(self.get_synonym_requests(synonym, force=True))
+            requests.extend(self.get_synonym_requests(synonym))
         return requests
 
     def add_sources(self, sources):
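
The dispatch in `parse` simply re.match-es each source's `website` pattern against the response URL and hands the response to the first match. A self-contained illustration (mine, not part of the patch):

```python
import re

class DummySource:
    website = "http://webbook.nist.gov/*"   # same style as the sources above

sources = [DummySource()]
url = "http://webbook.nist.gov/cgi/cbook.cgi?Name=water"
for source in sources:
    if re.match(source.website, url):
        # first matching source claims the response
        print(source.__class__.__name__ + " handles " + url)
```

Note that after this change `_sources` and `synonyms` are class attributes, so every `FourmiSpider` instance shares the same list and set.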
GUI/__init__.py (-1)

-import gui
GUI/configImporter.py (-30)

-import ConfigParser
-
-
-class ConfigImporter():
-    def __init__(self, filename):
-        """Read the filename into the parser."""
-        self.filename = filename
-        self.parser = ConfigParser.ConfigParser()
-        self.parser.read(self.filename)
-
-    def load_common_attributes(self):
-        """Loads common attributes from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'CommonParameters')
-        except:
-            return 'One, Two, Three'
-
-    def load_output_types(self):
-        """Loads output types from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'OutputTypes')
-        except:
-            return 'csv'
-
-    def load_always_attributes(self):
-        """Loads attributes that are always searched for from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'AlwaysParameters')
-        except:
-            return 'Name, Weight'
GUI/gui.py (-196)

-from Tkinter import *
-import os
-import shutil
-from tkFileDialog import asksaveasfilename
-
-from configImporter import *
-
-
-class GUI():
-    def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
-        """Boots the window, configuration."""
-        if not in_source:
-            current_dir = os.path.dirname(os.path.abspath(__file__))
-            config_file = current_dir + '../' + config_file
-        if not os.path.isfile(config_file):
-            try:
-                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
-            except IOError:
-                print "GUI configuration couldn't be found and couldn't be created."
-                sys.exit()
-        self.configurator = ConfigImporter(config_file)
-        self.sourceloader = sourceloader
-        self.finish_with_search = False
-        self.values = {}
-        self.required_variables = ['substance']
-        self.search = search
-        self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())
-
-    def load_common_attributes(self):
-        """Calls the configuration parser for common attributes."""
-        return [x.strip() for x in self.configurator.load_common_attributes().split(',')]
-
-    def load_output_types(self):
-        """Calls the configuration parser for output types."""
-        return [x.strip() for x in self.configurator.load_output_types().split(',')]
-
-    def load_always_attributes(self):
-        """Calls the configuration parser for attributes that are always used."""
-        return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])
-
-    def set_output(self):
-        self.variable_output_name.set(asksaveasfilename())
-        self.button_output_name.config(text=self.variable_output_name.get())
-
-    def generate_window(self, common_attributes, output_types):
-        """Creates all widgets and variables in the window."""
-        window = Tk()
-        window.wm_title("Fourmi Crawler")
-
-        variables = {}
-
-        variable_substance = StringVar(window)
-        frame_substance = Frame(window)
-        label_substance = Label(frame_substance, text="Substance: ")
-        input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
-        variables.update({"substance": variable_substance})
-        frame_substance.pack(side=TOP)
-        label_substance.pack()
-        input_substance.pack()
-        input_substance.focus()
-
-        frame_all_attributes = Frame(window)
-        frame_selecting_attributes = Frame(frame_all_attributes)
-        frame_new_attributes = Frame(frame_selecting_attributes)
-        label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
-        input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
-        variables.update({"new_attributes": input_new_attributes})
-        frame_new_attributes.pack(side=LEFT)
-        label_new_attributes.pack()
-        input_new_attributes.pack()
-
-        frame_common_attributes = Frame(frame_selecting_attributes)
-        label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
-        input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
-        scrollbar_common_attributes = Scrollbar(frame_common_attributes)
-        input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
-        scrollbar_common_attributes.config(command=input_common_attributes.yview)
-        if common_attributes and len(common_attributes) > 0:
-            input_common_attributes.insert(END, *common_attributes)
-        variables.update({"common_attributes": input_common_attributes})
-        frame_common_attributes.pack(side=RIGHT)
-        label_common_attributes.pack(side=TOP)
-        input_common_attributes.pack(side=LEFT)
-        scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
-        frame_selecting_attributes.pack()
-
-        frame_last = Frame(window)
-        search_button = Button(frame_last, text="Start search", command=self.prepare_search)
-        cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
-        frame_last.pack(side=BOTTOM)
-        search_button.pack(side=LEFT)
-        cancel_button.pack(side=RIGHT)
-
-        frame_name = Frame(window)
-        frame_output_name = Frame(frame_name)
-        label_output_name = Label(frame_output_name, text='Output file:')
-        self.variable_output_name = StringVar()
-        self.variable_output_name.set('results.csv')
-        variables.update({'output_name':self.variable_output_name})
-        self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
-        frame_output_name.pack(side=LEFT)
-        label_output_name.pack()
-        self.button_output_name.pack()
-        frame_name.pack(side=BOTTOM)
-
-
-        frame_checkboxes = Frame(window)
-        frame_checkbox_attributes = Frame(frame_checkboxes)
-        variable_all_attributes = BooleanVar()
-        variable_all_attributes.set(True)
-        input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
-                                           variable=variable_all_attributes)
-        variables.update({"all_attributes": variable_all_attributes})
-        frame_checkbox_attributes.pack(side=LEFT)
-        input_all_attributes.pack()
-
-        frame_logging = Frame(frame_checkboxes)
-        variable_logging = BooleanVar()
-        variable_logging.set(False)
-        input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
-        variables.update({'logging':variable_logging})
-        frame_logging.pack(side=RIGHT)
-        frame_checkboxes.pack(side=BOTTOM)
-        input_logging.pack()
-        frame_all_attributes.pack()
-
-        return window, variables
-
-    def prepare_search(self):
-        """Saves the values from the window for later retrieval."""
-        variables = self.variables
-        values = {}
-
-        values.update({"Always attributes": self.load_always_attributes()})
-        for name, var in variables.iteritems():
-            if var.__class__ is StringVar:
-                values.update({name: var.get()})
-            elif var.__class__ is BooleanVar:
-                values.update({name: var.get()})
-            elif var.__class__ is Text:
-                values.update({name: str(var.get("1.0", END)).strip()})
-            elif var.__class__ is Listbox:
-                values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
-            else:
-                print "No known class, {}, {}".format(name, var)
-
-        values.update({'output_name':self.variable_output_name.get()})
-        values.update({'output_type':self.check_output_type(values.get('output_name'))})
-
-        self.values = values
-        if all([values.get(i) != '' for i in self.required_variables]):
-            self.finish_with_search = True
-            self.window.destroy()
-        else:
-            self.finish_with_search = False
-            #tkMessageBox.showinfo('Not all required information was entered!')
-
-    def execute_search(self):
-        """Calls the Fourmi crawler with the values from the GUI"""
-        if self.values.get('all_attributes'):
-            attributes = ".*"
-        else:
-            attribute_types = ['attributes', 'Common attributes', 'Always attributes']
-            attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
-        output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths
-
-        arguments = {'--attributes': attributes,
-                     '--exclude': None,
-                     '--format': self.values.get('output_type'),
-                     '--help': False,
-                     '--include': None,
-                     '--log': 'log.txt',
-                     '--output': output_file,
-                     '-v': 0 if self.values.get('logging') else 3,
-                     '--version': False,
-                     '<compound>': self.values.get('substance'),
-                     'list': False,
-                     'search': True}
-
-        self.search(arguments, self.sourceloader)
-
-    def run(self):
-        """Starts the window and the search."""
-        self.window.mainloop()
-        if self.finish_with_search:
-            self.execute_search()
-
-    def check_output_type(self, filename):
-        parts = str(filename).split('.')
-        output_types = self.load_output_types()
-        extension = parts[-1]
-
-        for type in output_types:
-            if extension==type:
-                return extension
-        return output_types[0]
GUI.cfg.sample (-10)

-[GUI]
-# Personalize options in your User Interface
-
-# Commonly used parameters are listed in the GUI for easy selection
-CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
-
-# Parameters that are always used in the search
-AlwaysParameters = Name
-
-OutputTypes = csv, json, jsonlines, xml
+12
-7
README.md
+12
-7
README.md
···
1
# Fourmi
2
3
-
**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
4
5
-
**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
6
7
Fourmi is an web scraper for chemical substances. The program is designed to be
8
used as a search engine to search multiple chemical databases for a specific
···
23
24
### Installing
25
26
-
If you're installing Fourmi, please take a look at our installation guides
27
-
on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
28
-
usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
29
30
### Using the Source
31
32
To use the Fourmi source code multiple dependencies are required. Take a look at
33
-
our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step
34
installation guide.
35
36
When developing for the Fourmi project keep in mind that code readability is a
37
must. To maintain readability, code should conform to the
38
[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
39
code. More information about the different structures and principles of the
40
-
Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
41
42
### To Do
43
···
45
46
__Main goals:__
47
48
- Build a graphical user interface (GUI) as an alternative to the command line
49
interface (CLI). (Assignee: Harmen)
50
- Compile the source into a Windows executable. (Assignee: Bas)
51
52
__Side goals:__
53
···
1
# Fourmi
2
3
+
**Master branch**: [Build Status](https://travis-ci.org/Recondor/Fourmi)
4
5
+
**Developing branch**: [Build Status](https://travis-ci.org/Recondor/Fourmi)
6
7
Fourmi is a web scraper for chemical substances. The program is designed to be
8
used as a search engine to search multiple chemical databases for a specific
···
23
24
### Installing
25
26
+
If you're installing Fourmi, please take a look at our [installation guide](...)
27
+
on our wiki. When you've installed the application, make sure to check our
28
+
[usage guide](...).
29
30
### Using the Source
31
32
To use the Fourmi source code multiple dependencies are required. Take a look at
33
+
the [wiki page](...) on using the application source code for a step-by-step
34
installation guide.
35
36
When developing for the Fourmi project keep in mind that code readability is a
37
must. To maintain readability, code should conform to the
38
[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
39
code. More information about the different structures and principles of the
40
+
Fourmi application can be found on our [wiki](...).
41
42
### To Do
43
···
45
46
__Main goals:__
47
48
+
- Improve our documentation and guides. (Assignee: Dekker)
49
- Build a graphical user interface (GUI) as an alternative to the command line
50
interface (CLI). (Assignee: Harmen)
51
- Compile the source into a Windows executable. (Assignee: Bas)
52
+
- Create a configuration file to hold logins and API keys.
53
+
- Determine the reliability of our data points.
54
+
- Create a module to gather data from NIST. (Assignee: Rob)
55
+
- Create a module to gather data from PubChem. (Assignee: Nout)
56
57
__Side goals:__
58
-108
SIGNED.md
-108
SIGNED.md
···
1
-
##### Signed by https://keybase.io/jdekker
2
-
```
3
-
-----BEGIN PGP SIGNATURE-----
4
-
Version: GnuPG v1.4.11 (GNU/Linux)
5
-
6
-
iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa
7
-
flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt
8
-
ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+
9
-
+mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U
10
-
IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2
11
-
Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb
12
-
RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq
13
-
WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ
14
-
vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH
15
-
V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK
16
-
TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD
17
-
jUY6EwEuOzDCexWP4eM8
18
-
=h6TK
19
-
-----END PGP SIGNATURE-----
20
-
21
-
```
22
-
23
-
<!-- END SIGNATURES -->
24
-
25
-
### Begin signed statement
26
-
27
-
#### Expect
28
-
29
-
```
30
-
size exec file contents
31
-
./
32
-
412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614
33
-
548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546
34
-
846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed
35
-
FourmiCrawler/
36
-
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
37
-
304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
38
-
2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
39
-
677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94
40
-
sources/
41
-
12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce
42
-
12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee
43
-
6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481
44
-
6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870
45
-
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
46
-
1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc
47
-
3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3
48
-
GUI/
49
-
11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3
50
-
940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271
51
-
8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5
52
-
299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483
53
-
1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
54
-
3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac
55
-
3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a
56
-
261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
57
-
416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70
58
-
tests/
59
-
107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670
60
-
2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d
61
-
1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8
62
-
1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
63
-
1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
64
-
2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
65
-
utils/
66
-
40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a
67
-
4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5
68
-
2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
69
-
```
70
-
71
-
#### Ignore
72
-
73
-
```
74
-
/SIGNED.md
75
-
```
76
-
77
-
#### Presets
78
-
79
-
```
80
-
git # ignore .git and anything as described by .gitignore files
81
-
dropbox # ignore .dropbox-cache and other Dropbox-related files
82
-
kb # ignore anything as described by .kbignore files
83
-
```
84
-
85
-
<!-- summarize version = 0.0.9 -->
86
-
87
-
### End signed statement
88
-
89
-
<hr>
90
-
91
-
#### Notes
92
-
93
-
With keybase you can sign any directory's contents, whether it's a git repo,
94
-
source code distribution, or a personal documents folder. It aims to replace the drudgery of:
95
-
96
-
1. comparing a zipped file to a detached statement
97
-
2. downloading a public key
98
-
3. confirming it is in fact the author's by reviewing public statements they've made, using it
99
-
100
-
All in one simple command:
101
-
102
-
```bash
103
-
keybase dir verify
104
-
```
105
-
106
-
There are lots of options, including assertions for automating your checks.
107
-
108
-
For more info, check out https://keybase.io/docs/command_line/code_signing
···
+51
-23
fourmi.py
+51
-23
fourmi.py
···
1
-
#!/usr/bin/env python
2
"""
3
-
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
4
5
Usage:
6
-
fourmi
7
fourmi search <compound>
8
fourmi [options] search <compound>
9
-
fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
10
fourmi list
11
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
12
fourmi -h | --help
···
16
--attributes=<regex>      Include only attributes that match these regular expressions, split by a comma. [default: .*]
17
-h --help Show this screen.
18
--version Show version.
19
-
-v Verbose logging output. (Multiple occurrences increase logging level)
20
--log=<file>              Save log to a file.
21
-
-o <file> --output=<file> Output file [default: <compound>.*format*]
22
-
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
23
--include=<regex> Include only sources that match these regular expressions split by a comma.
24
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
25
"""
26
27
from twisted.internet import reactor
28
from scrapy.crawler import Crawler
29
-
from scrapy import signals, log
30
import docopt
31
32
from FourmiCrawler.spider import FourmiSpider
33
-
from utils.configurator import Configurator
34
-
from utils.sourceloader import SourceLoader
35
-
from GUI import gui
36
37
38
def setup_crawler(compound, settings, source_loader, attributes):
···
52
crawler.start()
53
54
55
def search(docopt_arguments, source_loader):
56
"""
57
The function that facilitates the search for a specific compound.
58
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
59
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
60
"""
61
-
conf = Configurator()
62
-
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
63
-
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
64
-
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
65
-
source_loader, docopt_arguments["--attributes"].split(','))
66
-
if conf.scrapy_settings.getbool("LOG_ENABLED"):
67
-
log.start(conf.scrapy_settings.get("LOG_FILE"),
68
-
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
69
reactor.run()
70
71
72
# The start for the Fourmi Command Line interface.
73
if __name__ == '__main__':
74
-
arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
75
loader = SourceLoader()
76
77
if arguments["--include"]:
···
84
elif arguments["list"]:
85
print "-== Available Sources ==-"
86
print str(loader)
87
-
else:
88
-
gui_window = gui.GUI(search, sourceloader=SourceLoader())
89
-
gui_window.run()
···
1
+
#!/usr/bin/env python
2
"""
3
+
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
4
5
Usage:
6
fourmi search <compound>
7
fourmi [options] search <compound>
8
+
fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
9
fourmi list
10
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
11
fourmi -h | --help
···
15
--attributes=<regex>      Include only attributes that match these regular expressions, split by a comma. [default: .*]
16
-h --help Show this screen.
17
--version Show version.
18
+
--verbose Verbose logging output.
19
--log=<file>              Save log to a file.
20
+
-o <file> --output=<file> Output file [default: result.*format*]
21
+
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
22
--include=<regex> Include only sources that match these regular expressions split by a comma.
23
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
24
"""
25
26
from twisted.internet import reactor
27
from scrapy.crawler import Crawler
28
+
from scrapy import log, signals
29
+
from scrapy.utils.project import get_project_settings
30
import docopt
31
32
from FourmiCrawler.spider import FourmiSpider
33
+
from sourceloader import SourceLoader
34
35
36
def setup_crawler(compound, settings, source_loader, attributes):
···
50
crawler.start()
51
52
53
+
def scrapy_settings_manipulation(docopt_arguments):
54
+
"""
55
+
This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
56
+
project these are command line arguments.
57
+
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
58
+
"""
59
+
settings = get_project_settings()
60
+
61
+
if docopt_arguments["--output"] != 'result.*format*':
62
+
settings.overrides["FEED_URI"] = docopt_arguments["--output"]
63
+
elif docopt_arguments["--format"] == "jsonlines":
64
+
settings.overrides["FEED_URI"] = "results.json"
65
+
elif docopt_arguments["--format"] is not None:
66
+
settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
67
+
68
+
if docopt_arguments["--format"] is not None:
69
+
settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
70
+
71
+
return settings
72
+
73
+
74
+
def start_log(docopt_arguments):
75
+
"""
76
+
This function starts the logging functionality of Scrapy using the settings given by the CLI.
77
+
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
78
+
"""
79
+
if docopt_arguments["--log"] is not None:
80
+
if docopt_arguments["--verbose"]:
81
+
log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
82
+
else:
83
+
log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
84
+
else:
85
+
if docopt_arguments["--verbose"]:
86
+
log.start(logstdout=False, loglevel=log.DEBUG)
87
+
else:
88
+
log.start(logstdout=True, loglevel=log.WARNING)
89
+
90
+
91
def search(docopt_arguments, source_loader):
92
"""
93
The function that facilitates the search for a specific compound.
94
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
95
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
96
"""
97
+
start_log(docopt_arguments)
98
+
settings = scrapy_settings_manipulation(docopt_arguments)
99
+
setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
100
reactor.run()
101
102
103
# The start for the Fourmi Command Line interface.
104
if __name__ == '__main__':
105
+
arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
106
loader = SourceLoader()
107
108
if arguments["--include"]:
···
115
elif arguments["list"]:
116
print "-== Available Sources ==-"
117
print str(loader)
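
To illustrate the CLI defined by the usage string above, here is a hypothetical invocation and a few of the entries in the dictionary docopt builds from it (run inside fourmi.py, where `__doc__` is the usage string):

```python
import docopt

# Hypothetical command line: fourmi --verbose -f csv search methane
arguments = docopt.docopt(__doc__, argv=['--verbose', '-f', 'csv', 'search', 'methane'])

print arguments['search']      # True
print arguments['<compound>']  # 'methane'
print arguments['--format']    # 'csv'
print arguments['--verbose']   # True
```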
+18
setup.py
+18
setup.py
···
···
1
+
import sys
2
+
from cx_Freeze import setup, Executable
3
+
4
+
# After running the setup file (python setup.py build), the scrapy/VERSION file has to be manually put into
5
+
# library.zip, and the FourmiCrawler folder has to be copied to both the library and the exe.win32-2.7 folder. After
6
+
# putting the files in the library, it has to be zipped again to replace the old library.
7
+
# Dependencies are automatically detected, but it might need fine tuning.
8
+
build_exe_options = {"packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"], "excludes": []}
9
+
10
+
# GUI applications require a different base on Windows (the default is for a
11
+
# console application).
12
+
base = None
13
+
14
+
setup( name = "Scrapy",
15
+
version = "0.1",
16
+
description = "My GUI application!",
17
+
options = {"build_exe": build_exe_options},
18
+
executables = [Executable("fourmi.py", base=base)])
+60
sourceloader.py
+60
sourceloader.py
···
···
1
+
import inspect
2
+
import sys
3
+
import os
4
+
import re
5
+
6
+
from FourmiCrawler.sources.source import Source
7
+
8
+
9
+
class SourceLoader:
10
+
sources = []
11
+
12
+
def __init__(self, rel_dir="FourmiCrawler/sources"):
13
+
14
+
if hasattr(sys, 'frozen'):
15
+
path = os.path.dirname(sys.executable)
16
+
else:
17
+
path = os.path.dirname(os.path.abspath(__file__))
18
+
19
+
path += "/" + rel_dir
20
+
known_parser = set()
21
+
22
+
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
23
+
mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py])
24
+
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
25
+
for cls in classes:
26
+
if issubclass(cls, Source) and cls not in known_parser:
27
+
self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
28
+
known_parser.add(cls)  # record the class so each source is only instantiated once
29
+
30
+
def include(self, source_names):
31
+
"""
32
+
This function excludes all sources that don't match the given regular expressions.
33
+
:param source_names: A list of regular expressions (strings)
34
+
"""
35
+
new = set()
36
+
for name in source_names:
37
+
new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
38
+
self.sources = list(new)
39
+
40
+
def exclude(self, source_names):
41
+
"""
42
+
This function excludes all sources that match the given regular expressions.
43
+
:param source_names: A list of regular expressions (strings)
44
+
"""
45
+
exclude = []
46
+
for name in source_names:
47
+
exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
48
+
self.sources = [src for src in self.sources if src not in exclude]
49
+
50
+
def __str__(self):
51
+
"""
52
+
This function returns a string with all sources currently available in the SourceLoader.
53
+
:return: a string with all available sources.
54
+
"""
55
+
string = ""
56
+
for src in self.sources:
57
+
string += "Source: " + src.__class__.__name__
58
+
string += " - "
59
+
string += "URI: " + src.website + "\n"
60
+
return string
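
A short usage sketch of the loader defined above; the source name is an example, and `include`/`exclude` match their regular expressions against the source class names:

```python
from sourceloader import SourceLoader

loader = SourceLoader()
loader.include(['ChemSpider'])  # keep only sources whose class name matches
print str(loader)               # e.g. "Source: ChemSpider - URI: http://www\.chemspider\.com/.*"
```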
-19
sources.cfg.sample
-19
sources.cfg.sample
···
1
-
[DEFAULT]
2
-
reliability = Unknown
3
-
4
-
#For each source listed in FourmiCrawler/sources there should be a section
5
-
#named exactly as the filename in here. If not present, the DEFAULT value is
6
-
#used for reliability of that source.
7
-
8
-
[ChemSpider]
9
-
reliability = High
10
-
#token=Paste ChemSpider API token here and remove the hashtag
11
-
12
-
[NIST]
13
-
reliability = High
14
-
15
-
[WikipediaParser]
16
-
reliability = Medium
17
-
18
-
[PubChem]
19
-
reliability = High
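
The DEFAULT fallback described in the comments above can be mirrored in a few lines; `reliability_for` is a hypothetical helper with the same behavior as Configurator.get_section:

```python
import ConfigParser

config = ConfigParser.ConfigParser()
config.read('sources.cfg')  # assuming a sources.cfg created from this sample

def reliability_for(source_name):
    # A listed section uses its own value; anything else falls back to [DEFAULT].
    if config.has_section(source_name):
        return config.get(source_name, 'reliability')
    return config.defaults().get('reliability', '')

print reliability_for('ChemSpider')     # High
print reliability_for('SomeNewSource')  # Unknown (from [DEFAULT])
```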
···
-5
tests/__init__.py
-5
tests/__init__.py
-68
tests/test_configurator.py
-68
tests/test_configurator.py
···
1
-
import unittest
2
-
import ConfigParser
3
-
4
-
from utils.configurator import Configurator
5
-
6
-
7
-
class TestConfigurator(unittest.TestCase):
8
-
9
-
def setUp(self):
10
-
self.conf = Configurator()
11
-
12
-
def test_set_output(self):
13
-
self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
14
-
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
15
-
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
16
-
17
-
self.conf.set_output("<compound>.*format*", "jsonlines", "test")
18
-
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
19
-
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
20
-
21
-
self.conf.set_output("<compound>.*format*", "csv", "test")
22
-
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
23
-
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
24
-
25
-
def test_start_log(self):
26
-
for i in range(0, 4):
27
-
self.conf.set_logging("TEST", i)
28
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
29
-
if i > 0:
30
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
31
-
if i > 1:
32
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
33
-
else:
34
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
35
-
else:
36
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
37
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
38
-
if i == 1:
39
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
40
-
elif i == 2:
41
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
42
-
elif i == 3:
43
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
44
-
45
-
self.conf.set_logging(verbose=i)
46
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
47
-
48
-
def test_read_sourceconfiguration(self):
49
-
config = self.conf.read_sourceconfiguration()
50
-
self.assertIsInstance(config, ConfigParser.ConfigParser)
51
-
52
-
def test_get_section(self):
53
-
config = ConfigParser.ConfigParser()
54
-
section = self.conf.get_section(config, 'test')
55
-
self.assertIn('reliability', section)
56
-
self.assertEquals(section['reliability'], '')
57
-
58
-
config.set('DEFAULT', 'reliability', 'Low')
59
-
60
-
section = self.conf.get_section(config, 'test')
61
-
self.assertEquals(section['reliability'], 'Low')
62
-
63
-
config.add_section('test')
64
-
config.set('test', 'var', 'Maybe')
65
-
66
-
section = self.conf.get_section(config, 'test')
67
-
self.assertEquals(section['reliability'], 'Low')
68
-
self.assertEqual(section['var'], 'Maybe')
···
-32
tests/test_gui.py
-32
tests/test_gui.py
···
1
-
import unittest
2
-
3
-
from GUI import gui
4
-
5
-
class TestGUI(unittest.TestCase):
6
-
def setUp(self):
7
-
pass
8
-
9
-
def test_empty_attributes(self):
10
-
self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample", in_source=True)
11
-
self.test_gui.window.after(9, self.test_gui.prepare_search)
12
-
self.test_gui.window.after(11, self.test_gui.window.destroy)
13
-
self.test_gui.run()
14
-
15
-
output_type = self.test_gui.configurator.load_output_types().split(',')[0]
16
-
17
-
self.assertEqual(self.test_gui.values.get('substance'), '')
18
-
self.assertEqual(self.test_gui.values.get('output_type'), output_type)
19
-
self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
20
-
21
-
22
-
def test_no_configurations(self):
23
-
self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample")
24
-
self.test_gui.configurator = gui.ConfigImporter('')
25
-
self.test_gui.finish_with_search = True
26
-
self.test_gui.window.after(9, self.test_gui.prepare_search)
27
-
self.test_gui.window.after(11, self.test_gui.window.destroy)
28
-
self.test_gui.run()
29
-
30
-
self.assertEqual(self.test_gui.values.get('substance'), '')
31
-
self.assertEqual(self.test_gui.values.get('output_type'), 'csv')
32
-
self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
···
-1
tests/test_pipeline.py
-1
tests/test_pipeline.py
+1
-1
tests/test_sourceloader.py
+1
-1
tests/test_sourceloader.py
+5
-7
tests/test_spider.py
+5
-7
tests/test_spider.py
···
3
from scrapy.http import Request
4
5
from FourmiCrawler import spider
6
-
from FourmiCrawler.sources.NIST import NIST
7
from FourmiCrawler.sources.source import Source
8
9
···
41
self.spi.add_source(src)
42
self.assertEqual(self.spi.start_requests(), [])
43
44
-
src2 = NIST()
45
self.spi.add_source(src2)
46
-
requests = self.spi.start_requests()
47
-
self.assertGreater(len(requests), 0)
48
-
self.assertIsInstance(requests[0], Request)
49
50
def test_synonym_requests(self):
51
# A test for the synonym request function
···
56
self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
57
self.assertIn("new_compound", self.spi.synonyms)
58
59
-
src2 = NIST()
60
self.spi.add_source(src2)
61
self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
62
self.assertIn("other_compound", self.spi.synonyms)
63
-
self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
···
3
from scrapy.http import Request
4
5
from FourmiCrawler import spider
6
+
from FourmiCrawler.sources.ChemSpider import ChemSpider
7
from FourmiCrawler.sources.source import Source
8
9
···
41
self.spi.add_source(src)
42
self.assertEqual(self.spi.start_requests(), [])
43
44
+
src2 = ChemSpider()
45
self.spi.add_source(src2)
46
+
self.assertIsNotNone(self.spi.start_requests())
47
48
def test_synonym_requests(self):
49
# A test for the synonym request function
···
54
self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
55
self.assertIn("new_compound", self.spi.synonyms)
56
57
+
src2 = ChemSpider()
58
self.spi.add_source(src2)
59
self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
60
self.assertIn("other_compound", self.spi.synonyms)
61
+
self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
-101
utils/configurator.py
-101
utils/configurator.py
···
1
-
import ConfigParser
2
-
import os
3
-
import shutil
4
-
5
-
from scrapy.utils.project import get_project_settings
6
-
7
-
8
-
class Configurator:
9
-
"""
10
-
A helper class in the fourmi class. This class is used to process the settings as set
11
-
from one of the Fourmi applications.
12
-
"""
13
-
14
-
def __init__(self):
15
-
self.scrapy_settings = get_project_settings()
16
-
17
-
def set_output(self, filename, fileformat, compound):
18
-
"""
19
-
This function manipulates the Scrapy output file settings that normally would be set in the settings file.
20
-
In the Fourmi project these are command line arguments.
21
-
:param filename: The name of the file the output will be written to.
22
-
:param fileformat: The format in which the output will be.
23
-
"""
24
-
25
-
if filename != '<compound>.*format*':
26
-
self.scrapy_settings.overrides["FEED_URI"] = filename
27
-
elif fileformat == "jsonlines":
28
-
self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
29
-
elif fileformat is not None:
30
-
self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
31
-
32
-
if fileformat is not None:
33
-
self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
34
-
35
-
def set_logging(self, logfile=None, verbose=0):
36
-
"""
37
-
This function changes the default settings of Scrapy's logging functionality
38
-
using the settings given by the CLI.
39
-
:param logfile: The location where the logfile will be saved.
40
-
:param verbose: An integer value to switch between log levels.
41
-
"""
42
-
if verbose != 0:
43
-
self.scrapy_settings.overrides["LOG_ENABLED"] = True
44
-
else:
45
-
self.scrapy_settings.overrides["LOG_ENABLED"] = False
46
-
47
-
if verbose == 1:
48
-
self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
49
-
elif verbose == 2:
50
-
self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
51
-
else:
52
-
self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
53
-
54
-
if verbose > 1:
55
-
self.scrapy_settings.overrides["LOG_STDOUT"] = False
56
-
else:
57
-
self.scrapy_settings.overrides["LOG_STDOUT"] = True
58
-
59
-
if logfile is not None:
60
-
self.scrapy_settings.overrides["LOG_FILE"] = logfile
61
-
else:
62
-
self.scrapy_settings.overrides["LOG_FILE"] = None
63
-
64
-
@staticmethod
65
-
def read_sourceconfiguration():
66
-
"""
67
-
This function reads sources.cfg in the main folder for configuration
68
-
variables for sources
69
-
:return: a ConfigParser object of sources.cfg
70
-
"""
71
-
current_dir = os.path.dirname(os.path.abspath(__file__))
72
-
config_path = current_dir + '/../sources.cfg'
73
-
# [TODO]: location of sources.cfg should be softcoded eventually
74
-
if not os.path.isfile(config_path):
75
-
try:
76
-
shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
77
-
except IOError:
78
-
print "WARNING: Source configuration couldn't be found and couldn't be created."
79
-
config = ConfigParser.ConfigParser()
80
-
config.read(config_path)
81
-
return config
82
-
83
-
@staticmethod
84
-
def get_section(config, sourcename):
85
-
"""
86
-
This function reads a config section labeled in variable sourcename and
87
-
tests whether the reliability variable is set; if not, it is set to an empty string.
88
-
Returns the default section if the labeled config section does not exist.
89
-
:param config: a ConfigParser object
90
-
:param sourcename: the name of the section to be read
91
-
:return: a dictionary of the section in the config labeled in sourcename
92
-
"""
93
-
section = dict()
94
-
if config.has_section(sourcename):
95
-
section = dict(config.items(sourcename))
96
-
elif config.defaults():
97
-
section = config.defaults()
98
-
if 'reliability' not in section:
99
-
print 'WARNING: Reliability not set for %s' % sourcename
100
-
section['reliability'] = ''
101
-
return section
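
For context, this is roughly how the CLI path in fourmi.py drove the Configurator above; the argument values are examples:

```python
from utils.configurator import Configurator

conf = Configurator()
conf.set_logging(logfile='log.txt', verbose=2)  # -> LOG_LEVEL INFO, no stdout echo
conf.set_output('<compound>.*format*', 'csv', 'methane')  # -> FEED_URI 'methane.csv'

print conf.scrapy_settings.get('LOG_LEVEL')  # INFO
print conf.scrapy_settings.get('FEED_URI')   # methane.csv
```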
···
-64
utils/sourceloader.py
-64
utils/sourceloader.py
···
1
-
import inspect
2
-
import os
3
-
import re
4
-
5
-
from FourmiCrawler.sources.source import Source
6
-
from utils.configurator import Configurator
7
-
8
-
9
-
class SourceLoader:
10
-
sources = []
11
-
12
-
def __init__(self, rel_dir="../FourmiCrawler/sources"):
13
-
"""
14
-
The initiation of a SourceLoader selects and indexes a directory for usable sources.
15
-
Also loads a configuration file for Sources and passes the arguments in
16
-
the named section to the source
17
-
:param rel_dir: A relative path to a directory.
18
-
"""
19
-
path = os.path.dirname(os.path.abspath(__file__))
20
-
path += "/" + rel_dir
21
-
known_parser = set()
22
-
23
-
config = Configurator.read_sourceconfiguration()
24
-
25
-
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
26
-
mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
27
-
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
28
-
for cls in classes:
29
-
if issubclass(cls, Source) and cls not in known_parser:
30
-
sourcecfg = Configurator.get_section(config, cls.__name__)
31
-
self.sources.append(cls(sourcecfg))
32
-
known_parser.add(cls)
33
-
34
-
def include(self, source_names):
35
-
"""
36
-
This function excludes all sources that don't match the given regular expressions.
37
-
:param source_names: A list of regular expressions (strings)
38
-
"""
39
-
new = set()
40
-
for name in source_names:
41
-
new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
42
-
self.sources = list(new)
43
-
44
-
def exclude(self, source_names):
45
-
"""
46
-
This function excludes all sources that match the given regular expressions.
47
-
:param source_names: A list of regular expressions (strings)
48
-
"""
49
-
exclude = []
50
-
for name in source_names:
51
-
exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
52
-
self.sources = [src for src in self.sources if src not in exclude]
53
-
54
-
def __str__(self):
55
-
"""
56
-
This function returns a string with all sources currently available in the SourceLoader.
57
-
:return: a string with all available sources.
58
-
"""
59
-
string = ""
60
-
for src in self.sources:
61
-
string += "Source: " + src.__class__.__name__
62
-
string += " - "
63
-
string += "URI: " + src.website + "\n"
64
-
return string
···