.gitignore  (-5)

.travis.yml  (+2, -10)
···
 language: python
 python: 2.7

-before_install:
-  - "export DISPLAY=:99.0"
-  - "sh -e /etc/init.d/xvfb start"
-
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
   - pip install Scrapy docopt
-  - pip install coveralls

 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
+  - nosetests tests

 notifications:
-  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
-
-after_success:
-  coveralls --verbose
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
Changelog.md  (-20)
···
-### v0.6.0
-- Feature: Added a Graphical User Interface
-- Feature: Automatic config file creation from config samples
-- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
-- FIX: A lot of bugfixes of the PubChem plugin, as it wasn't working as it should
-- FIX: Using absolute path for configuration files
-- DEV: General code cleanup in documentation
-
-### v0.5.3
-- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
-- FIX: Logging is now "actually" disabled if not using the verbose option.
-- FEATURE: Added support for PubChem
-
-### v0.5.2
-- FIX: Signature used to contain untracked and older files, current signature
-  should be correct.
-
-### v0.5.1
-- UPDATED: Logging functionality from command line
-- DEV: Code cleanup and extra tests
FourmiCrawler/settings.py  (+2, -1)
FourmiCrawler/sources/ChemSpider.py  (+60, -127)
···

 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

+
 class ChemSpider(Source):
-    """
-    ChemSpider scraper for synonyms and properties
+    """ChemSpider scraper for synonyms and properties
+
     This parser will manage searching for chemicals through the
     ChemsSpider API, and parsing the resulting ChemSpider page.
     The token required for the API should be in a configuration file
     somewhere.
     """

-    website = 'http://www\\.chemspider\\.com/.*'
+    def __init__(self):
+        Source.__init__(self)

-    search = 'Search.asmx/SimpleSearch?query=%s&token='
+    website = 'http://www.chemspider.com/*'
+
+    # [TODO] - Save and access token of specific user.
+    search = ('Search.asmx/SimpleSearch?query=%s&token='
+              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
     structure = 'Chemical-Structure.%s.html'
-    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')

-    def __init__(self, config=None):
-        """
-        Initialization of ChemSpider scraper
-        :param config: a dictionary of settings for this scraper, must contain
-        'reliability' key
-        """
-        Source.__init__(self, config)
-        self.ignore_list = []
-        if 'token' not in self.cfg or self.cfg['token'] == '':
-            log.msg('ChemSpider token not set or empty, search/MassSpec API '
-                    'not available', level=log.WARNING)
-            self.cfg['token'] = ''
-        self.search += self.cfg['token']
-        self.extendedinfo += self.cfg['token']
+    ignore_list = []

     def parse(self, response):
-        """
-        This function is called when a Response matching the variable
-        'website' is available for parsing the Response object.
-        :param response: the Scrapy Response object to be parsed
-        :return: a list of Result items and Request objects
-        """
         sel = Selector(response)
         requests = []
         requests_synonyms = self.parse_synonyms(sel)
···

         return requests

-    def parse_properties(self, sel):
-        """
-        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
-        :param sel: a Selector object of the whole page
-        :return: a list of Result items
-        """
-        properties = []
-
-        properties.extend(self.parse_acdlabstab(sel))
-        properties.extend(self.parse_experimentaldatatab(sel))
-
-        return properties
-
-    def parse_acdlabstab(self, sel):
-        """
-        This function scrapes the 'Predicted ACD/Labs tab' under Properties
-        :param sel: a Selector object of the whole page
-        :return: a list of Request objects
-        """
+    @staticmethod
+    def parse_properties(sel):
+        """scrape Experimental Data and Predicted ACD/Labs tabs"""
         properties = []

+        # Predicted - ACD/Labs tab
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
···
             prop_conditions = ''

             # Test for properties without values, with one hardcoded exception
-            if (not re.match(r'^\d', prop_value) or
-                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
+            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                 continue

+            # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
                 prop_name = m.group(1)
                 prop_conditions = m.group(2)

+            # Match for condition in value seperated by an 'at'
             m = re.match(r'(.*) at (.*)', prop_value)
             if m:
                 prop_value = m.group(1)
                 prop_conditions = m.group(2)

-            new_prop = self.newresult(
-                attribute=prop_name,
-                value=prop_value,
-                source='ChemSpider Predicted - ACD/Labs Tab',
-                conditions=prop_conditions
-            )
+            new_prop = Result({
+                'attribute': prop_name,
+                'value': prop_value,
+                'source': 'ChemSpider Predicted - ACD/Labs Tab',
+                'reliability': 'Unknown',
+                'conditions': prop_conditions
+            })
             properties.append(new_prop)
+            log.msg('CS prop: |%s| |%s| |%s|' %
+                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
+                    level=log.DEBUG)

-        return properties
-
-    def parse_experimentaldatatab(self, sel):
-        """
-        This function scrapes Experimental Data tab, Physico-chemical
-        properties in particular.
-        :param sel: a Selector object of the whole page
-        :return: a list of Result items
-        """
-        properties = []
-
+        # Experimental Data Tab, Physico-chemical properties in particular
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                  'Properties"]//li/table/tr/td')
         if not scraped_list:
···
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = self.newresult(
-                    attribute=property_name[:-1],
-                    value=line.xpath('text()').extract()[0].rstrip(),
-                    source=line.xpath('strong/text()').extract()[0].rstrip(),
-                )
-                properties.append(new_prop)
+                new_prop = Result({
+                    'attribute': property_name[:-1],
+                    'value': line.xpath('text()').extract()[0].rstrip(),
+                    'source': line.xpath(
+                        'strong/text()').extract()[0].rstrip(),
+                    'reliability': 'Unknown',
+                    'conditions': ''
+                })
+                properties.append(new_prop)
+                log.msg('CS prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)

         return properties

     def parse_synonyms(self, sel):
-        """
-        This function scrapes the list of Names and Identifiers
-        :param sel: a Selector object of the whole page
-        :return: a list of Requests
-        """
+        """Scrape list of Names and Identifiers"""
         requests = []
         synonyms = []

···
         return requests

     def new_synonym(self, sel, name, category):
-        """
-        This function scrapes for a single synonym at a given HTML tag
-        :param sel: a Selector object of the given HTML tag
-        :param name: the name of the synonym in the tag
-        :param category: the name of the category the synonym is labeled as
-        :return: a dictionary containing data on the synonym
-        """
+        """Scrape for a single synonym at a given HTML tag"""
         self.ignore_list.append(name)
         language = sel.xpath('span[@class="synonym_language"]/text()')
         if language:
···
         }
         return synonym

-    def parse_extendedinfo(self, response):
-        """
-        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
-        API, if a token is present in the configuration settings
-        :param response: a Response object to be parsed
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_extendedinfo(response):
+        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
         values = sel.xpath('*').xpath('text()').extract()
         for (name, value) in zip(names, values):
-            result = self.newresult(
-                attribute=name,
-                value=value,  # These values have no unit!
-                source='ChemSpider ExtendedCompoundInfo',
-            )
+            result = Result({
+                'attribute': name,
+                'value': value,  # These values have no unit!
+                'source': 'ChemSpider ExtendedCompoundInfo',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
             if result['value']:
                 properties.append(result)
         return properties

-    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        """
-        This function abstracts from the Result item and provides default
-        values.
-        :param attribute: the name of the attribute
-        :param value: the value of the attribute
-        :param conditions: optional conditions regarding the value
-        :param source: the name of the source if it is not ChemSpider
-        :return: A Result item
-        """
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': source,
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
-        })
-
     def parse_searchrequest(self, response):
-        """
-        This function parses the initial response of the ChemSpider Search API
-        Requires a valid token to function.
-        :param response: the Response object to be parsed
-        :return: A Request for the information page and a Request for the
-        extendedinfo API call
-        """
+        """Parse the initial response of the ChemSpider Search API """
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
···
             log.msg('ChemSpider found multiple substances, taking first '
                     'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
-        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
+        structure_url = self.website[:-1] + self.structure % csid
+        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
···
                         callback=self.parse_extendedinfo)]

     def new_compound_request(self, compound):
-        """
-        This function is called when a new synonym is returned to the spider
-        to generate new requests
-        :param compound: the name of the compound to search for
-        """
-        if compound in self.ignore_list or self.cfg['token'] == '':
+        if compound in self.ignore_list:  # [TODO] - add regular expression
             return None
-        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
+        searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
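Every source on this side of the diff builds its scraped properties as inline Result({...}) items with the same five keys (attribute, value, source, reliability, conditions) instead of going through a newresult() helper. A minimal sketch of that construction, using a plain dict as a stand-in for the Scrapy Result item from FourmiCrawler/items.py (the example attribute and value are made up):

    def build_result(attribute, value, source='ChemSpider', conditions=''):
        # Stand-in for Result({...}); on this side of the diff the
        # reliability field is simply hard-coded to 'Unknown'.
        return {
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': 'Unknown',
            'conditions': conditions,
        }

    # Items with an empty value are dropped, as in parse_extendedinfo() above.
    prop = build_result('Molecular Weight', '18.015 g/mol')
    if prop['value']:
        print('%s = %s' % (prop['attribute'], prop['value']))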
FourmiCrawler/sources/NIST.py  (+83, -141)
···
 # Result item, but should be included eventually.

 class NIST(Source):
-    """
-    NIST Scraper plugin
+    """NIST Scraper plugin
+
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook\\.nist\\.gov/.*"
+    website = "http://webbook.nist.gov/*"

     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    def __init__(self, config=None):
-        """
-        Initialization of NIST scraper
-        :param config: configuration variables for this scraper, must contain
-        'reliability' key.
-        """
-        Source.__init__(self, config)
-        self.ignore_list = set()
+    ignore_list = set()
+
+    def __init__(self):
+        Source.__init__(self)

     def parse(self, response):
-        """
-        This function is called when a Response matching the variable
-        'website' is available for parsing the Response object.
-        :param response: The Scrapy Response object to be parsed
-        :return: a list of Result items and Request objects
-        """
         sel = Selector(response)

         title = sel.xpath('head/title/text()').extract()[0]
···
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)

-        requests.extend(self.parse_tables(sel, symbol_table))
-
-        return requests
-
-    def parse_tables(self, sel, symbol_table):
-        """
-        This function identifies and distributes parsing of tables to other
-        functions below.
-        :param sel: A Selector object of the whole page
-        :param symbol_table: a dictionary containing translations of raw HTML
-        tags to human readable names
-        :return: a list of Result items and Requests
-        """
-        requests = []
-
         for table in sel.xpath('//table[@class="data"]'):
             summary = table.xpath('@summary').extract()[0]
             if summary == 'One dimensional data':
···
         return requests

     def parse_generic_info(self, sel):
-        """
-        This function parses: synonyms, chemical formula, molecular weight,
-        InChI, InChiKey, CAS number
-        :param sel: A Selector object of the entire page in the original
-        response
-        :return: a list of Result items
+        """Parses: synonyms, chemical formula, molecular weight, InChI,
+        InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
+        li = ul.xpath('li')

         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
···

         requests = []
         for key, value in data.iteritems():
-            result = self.newresult(
-                attribute=key,
-                value=value
-            )
+            result = Result({
+                'attribute': key,
+                'value': value,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
             requests.append(result)

         return requests

     def parse_aggregate_data(self, table, symbol_table):
-        """
-        This function parses the table(s) which contain possible links to
-        individual data points
-        :param table: a Selector object of the table to be parsed
-        :param symbol_table: a dictionary containing translations of raw HTML
-        tags to human readable names
-        :return: a list of Result items and Request objects
+        """Parses the table(s) which contain possible links to individual
+        data points
         """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
+                request = Request(url=self.website[:-1] + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
···
                 name = m.group(1)
                 condition = m.group(2)

-            result = self.newresult(
-                attribute=name,
-                value=data[1] + ' ' + data[2],
-                conditions=condition
-            )
+            result = Result({
+                'attribute': name,
+                'value': data[1] + ' ' + data[2],
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': condition
+            })
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
         return results

-    def parse_transition_data(self, table, summary):
-        """
-        This function parses the table containing properties regarding phase
-        changes
-        :param table: a Selector object of the table to be parsed
-        :param summary: the name of the property
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_transition_data(table, summary):
+        """Parses the table containing properties regarding phase changes"""
         results = []

-        unit = self.get_unit(table)
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = self.newresult(
-                attribute=summary,
-                value=tds[0] + ' ' + unit,
-                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            )
+            result = Result({
+                'attribute': summary,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            })
             results.append(result)

         return results

-    def parse_generic_data(self, table, summary):
-        """
-        Parses the common tables of 4 and 5 rows. Assumes they are of the
+    @staticmethod
+    def parse_generic_data(table, summary):
+        """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
         Symbol (unit)|Temperature (K)|Reference|Comment
-        :param table: a Selector object of the table to be parsed
-        :param summary: the name of the property
-        :return: a list of Result items
         """
         results = []

-        unit = self.get_unit(table)
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = self.newresult(
-                attribute=summary,
-                value=tds[0] + ' ' + unit,
-                conditions='%s K' % tds[1]
-            )
+            result = Result({
+                'attribute': summary,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[1]
+            })
             results.append(result)
         return results

-    def parse_antoine_data(self, table, summary):
-        """
-        This function parses the table containing parameters for the Antione
-        equation
-        :param table: a Selector object of the table to be parsed
-        :param summary: the name of the property
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_antoine_data(table, summary):
+        """Parse table containing parameters for the Antione equation"""
         results = []

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = self.newresult(
-                attribute=summary,
-                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                conditions='%s K' % tds[0]
-            )
+            result = Result({
+                'attribute': summary,
+                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[0]
+            })
             results.append(result)

         return results

-    def parse_individual_datapoints(self, response):
-        """
-        This function parses the 'individual data points' page linked from
-        the aggregate data table(s)
-        :param response: the Scrapy Response object to be parsed
-        :return: a list of Result items
-        """
+    @staticmethod
+    def parse_individual_datapoints(response):
+        """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]

···
             name = m.group(1)
             condition = m.group(2)

-        unit = self.get_unit(table)
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
···
             if m:
                 uncertainty = '+- %s ' % m.group(1)
             # [TODO]: get the plusminus sign working in here
-            result = self.newresult(
-                attribute=name,
-                value='%s %s%s' % (tds[0], uncertainty, unit),
-                conditions=condition
-            )
+            result = Result({
+                'attribute': name,
+                'value': '%s %s%s' % (tds[0], uncertainty, unit),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': condition
+            })
             results.append(result)

         return results

-    @staticmethod
-    def get_unit(table):
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
-
-        return unit
-
-    def newresult(self, attribute, value, conditions=''):
-        """
-        This function abstracts from the Result item and provides default
-        values
-        :param attribute: the name of the attribute
-        :param value: the value of the attribute
-        :param conditions: optional conditions regarding the value
-        :return: A Result item
-        """
-        return Result(
-            {
-                'attribute': attribute,
-                'value': value,
-                'source': 'NIST',
-                'reliability': self.cfg['reliability'],
-                'conditions': conditions
-            })
-
     def new_compound_request(self, compound):
-        """
-        This function is called when a new synonym is returned to the spider
-        to generate new requests
-        :param compound: the name of the compound to search for
-        """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
+            return Request(url=self.website[:-1] + self.search % compound,
                           callback=self.parse)
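The five-line unit lookup that this diff inlines in parse_transition_data, parse_generic_data and parse_individual_datapoints (and whose get_unit helper is removed at the bottom of the file) pulls the unit out of a column header such as "Tboil (K)". The same logic in isolation, as a small self-contained sketch:

    import re

    def extract_unit(header_text):
        # Take the text between parentheses in the first table header,
        # falling back to '!' when no parentheses are found.
        m = re.search(r'\((.*)\)', header_text)
        return m.group(1) if m else '!'

    print(extract_unit('Tboil (K)'))   # prints: K
    print(extract_unit('Quantity'))    # prints: !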
FourmiCrawler/sources/PubChem.py  (-149)
···
-import re
-
-from scrapy.http import Request
-from scrapy import log
-from scrapy.selector import Selector
-
-from source import Source
-from FourmiCrawler.items import Result
-
-
-class PubChem(Source):
-    """ PubChem scraper for chemical properties
-
-    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
-    including sources of the values of properties.
-    """
-
-    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
-    website_www = 'http://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
-    search = 'pccompound?term=%s'
-    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
-
-    __spider = None
-    searched_compounds = set()
-
-    def __init__(self, config):
-        Source.__init__(self, config)
-        self.cfg = config
-
-    def parse(self, response):
-        """
-        Distributes the above described behaviour
-        :param response: The incoming search request
-        :return Returns the found properties if response is unique or returns none if it's already known
-        """
-        requests = []
-        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
-
-        sel = Selector(response)
-        compound = sel.xpath('//h1/text()').extract()[0]
-        if compound in self.searched_compounds:
-            return None
-
-        self.searched_compounds.update(compound)
-        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
-        for synonym in raw_synonyms.strip().split(', '):
-            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
-            self.searched_compounds.update(synonym)
-            self._spider.get_synonym_requests(synonym)
-        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
-
-        n = re.search(r'cid=(\d+)', response.url)
-        if n:
-            cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
-            # the seperate html page which contains the properties and their values
-
-            # using this cid to get the right url and scrape it
-            requests.append(
-                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
-        return requests
-
-    def parse_data(self, response):
-        """
-        Parse data found in 'Chemical and Physical properties' part of a substance page.
-        :param response: The response with the page to parse
-        :return: requests: Returns a list of properties with their values, source, etc.
-        """
-        log.msg('parsing data', level=log.DEBUG)
-        requests = []
-
-        sel = Selector(response)
-        props = sel.xpath('//div')
-
-        for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
-            if prop.xpath('a'):  # parsing for single value in property
-                prop_source = ''.join(prop.xpath('a/@title').extract())
-                prop_value = ''.join(prop.xpath('a/text()').extract())
-                new_prop = Result({
-                    'attribute': prop_name,
-                    'value': prop_value,
-                    'source': prop_source,
-                    'reliability': self.cfg['reliability'],
-                    'conditions': ''
-                })
-                log.msg('PubChem prop: |%s| |%s| |%s|' %
-                        (new_prop['attribute'], new_prop['value'],
-                         new_prop['source']), level=log.DEBUG)
-                requests.append(new_prop)
-            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
-                prop_values = prop.xpath('ul//li')
-                for prop_li in prop_values:
-                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
-                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
-                    new_prop = Result({
-                        'attribute': prop_name,
-                        'value': prop_value,
-                        'source': prop_source,
-                        'reliability': self.cfg['reliability'],
-                        'conditions': ''
-                    })
-                    log.msg('PubChem prop: |%s| |%s| |%s|' %
-                            (new_prop['attribute'], new_prop['value'],
-                             new_prop['source']), level=log.DEBUG)
-                    requests.append(new_prop)
-
-        return requests
-
-    def parse_searchrequest(self, response):
-        """
-        This function parses the response to the new_compound_request Request
-        :param response: the Response object to be parsed
-        :return: A Request for the compound page or what self.parse returns in
-        case the search request forwarded to the compound page
-        """
-
-        # check if pubchem forwarded straight to compound page
-        m = re.match(self.website_pubchem, response.url)
-        if m:
-            log.msg('PubChem search forwarded to compound page',
-                    level=log.DEBUG)
-            return self.parse(response)
-
-        sel = Selector(response)
-
-        results = sel.xpath('//div[@class="rsltcont"]')
-        if results:
-            url = results[0].xpath('div/p/a[1]/@href')
-        else:
-            log.msg('PubChem search found nothing or xpath failed',
-                    level=log.DEBUG)
-            return None
-
-        if url:
-            url = 'http:' + ''.join(url[0].extract())
-            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
-        else:
-            log.msg('PubChem search found results, but no url in first result',
-                    level=log.DEBUG)
-            return None
-
-        return Request(url=url, callback=self.parse)
-
-    def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound,
-                       callback=self.parse_searchrequest)
FourmiCrawler/sources/WikipediaParser.py  (+43, -93)
···
     """ Wikipedia scraper for chemical properties

     This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
-    It also returns requests with other external sources which contain information on parsed subject.
+    It also returns requests with other external sources which contain information on parsed subject.
     """

-    website = "http://en\\.wikipedia\\.org/wiki/.*"
+    website = "http://en.wikipedia.org/wiki/*"
     __spider = None
     searched_compounds = []

-    def __init__(self, config=None):
-        Source.__init__(self, config)
+    def __init__(self):
+        Source.__init__(self)

     def parse(self, response):
-        """
-        Distributes the above described behaviour
-        :param response: The incoming search request
-        :return: Returns the found properties if response is unique or returns none if it's already known
-        """
+        """ Distributes the above described behaviour """
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
···
         return items

     def parse_infobox(self, sel):
-        """
-        Scrape data from infobox on wikipedia.
-
-        Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and
-        :param sel: The selector with the html-information of the page to parse
-        :return: item_list: Returns a list of properties with their values, source, etc..
-        """
-
+        """ scrape data from infobox on wikipedia. """
         items = []

-        # scrape the chembox (wikipedia template)
-        items = self.parse_chembox(sel, items)
+        # be sure to get chembox (wikipedia template)
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
+            xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': "Wikipedia",
+                'reliability': "Unknown",
+                'conditions': ""
+            })
+            items.append(item)
+            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)

-        # scrape the drugbox (wikipedia template)
-        items = self.parse_drugbox(sel, items)
+        #scrape the drugbox (wikipedia template)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = Result({
+                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'source': "Wikipedia",
+                    'reliability': "Unknown",
+                    'conditions': ""
+                })
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)

         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)
···

         return item_list

-    def parse_chembox(self, sel, items):
-        """
-        Scrape data from chembox infobox on wikipedia.
-
-        :param sel: The selector with the html-information of the page to parse
-        :param items: the list of items where the result have to be stored in
-        :return: items: the list of items with the new found and stored items
-        """
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
-            xpath('normalize-space(string())')
-        prop_names = tr_list[::2]
-        prop_values = tr_list[1::2]
-        for i, prop_name in enumerate(prop_names):
-            item = self.newresult(
-                attribute=prop_name.extract().encode('utf-8'),
-                value=prop_values[i].extract().encode('utf-8')
-            )
-            items.append(item)
-            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
-        return items
-
-    def parse_drugbox(self, sel, items):
-        """
-        Scrape data from drugbox infobox on wikipedia.
-
-        :param sel: The selector with the html-information of the page to parse
-        :param items: the list of items where the result have to be stored in
-        :return: items: the list of items with the new found and stored items
-        """
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
-        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
-        for tablerow in tr_list2:
-            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
-            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
-                    'normalize-space(string())'):
-                item = self.newresult(
-                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                )
-                items.append(item)
-                log.msg(
-                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
-                    level=log.DEBUG)
-        return items
-
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
+        return Request(url=self.website[:-1] + compound, callback=self.parse)

     @staticmethod
     def clean_items(items):
-
-        """
-        Clean up properties using regex, makes it possible to split the values from the units
-
-        Almost not in use, only cleans J/K/mol values and boiling/melting points.
-
-        :param items: List of properties with their values, source, etc..
-        :return: items: List of now cleaned up items
-        """
+        """ clean up properties using regex, makes it possible to split the values from the units """
         for item in items:
             value = item['value']
             m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
···

     @staticmethod
     def get_identifiers(sel):
-        """
-        Find external links, named 'Identifiers' to different sources.
-
-        :param sel: The selector with the html-information of the page to parse
-        :return: links: New links which can be used to expand the crawlers search
-        """
+        """ find external links, named 'Identifiers' to different sources. """
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        return links
-
-    def newresult(self, attribute, value):
-        return Result(
-            {
-                'attribute': attribute,
-                'value': value,
-                'source': 'Wikipedia',
-                'reliability': self.cfg['reliability'],
-                'conditions': ''
-            })
+        return links
FourmiCrawler/sources/source.py  (+3, -6)
···


 class Source:
-    website = "http://something/.*"  # Regex of URI's the source is able to parse
+    website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None

-    def __init__(self, config=None):
+    def __init__(self):
         """
         Initiation of a new Source
         """
-        self.cfg = {}
-        if config is not None:
-            self.cfg = config
         pass

     def parse(self, response):
···
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass

     def set_spider(self, spider):
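The Source base class above defines the contract every scraper in this diff follows: a website pattern used for URL matching, parse() for responses, and new_compound_request() for new searches. A minimal, hypothetical subclass in the style of the ChemSpider and NIST sources on this side of the diff (ExampleSource and its URLs are illustrative only):

    from scrapy.http import Request

    from source import Source


    class ExampleSource(Source):
        website = 'http://example.org/*'  # trailing '*' is cut off with [:-1] below
        search = 'search?q=%s'

        def parse(self, response):
            return []  # would return Result items and follow-up Requests

        def new_compound_request(self, compound):
            return Request(url=self.website[:-1] + self.search % compound,
                           callback=self.parse)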
FourmiCrawler/spider.py  (+8, -12)
···
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
+    _sources = []
+    synonyms = set()

-    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
-        self._sources = []
-        self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        if selected_attributes is None:
-            self.selected_attributes = [".*"]
-        else:
-            self.selected_attributes = selected_attributes
+        self.selected_attributes = selected_attributes

     def parse(self, response):
         """
···
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
-        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None

-    def get_synonym_requests(self, compound, force=False):
+    def get_synonym_requests(self, compound):
         """
         A function that generates new Scrapy Request for each source given a new synonym of a compound.
         :param compound: A compound name
         :return: A list of Scrapy Request objects
         """
         requests = []
-        if force or compound not in self.synonyms:
+        if compound not in self.synonyms:
             self.synonyms.add(compound)
             for parser in self._sources:
                 parser_requests = parser.new_compound_request(compound)
···
         """
         requests = []
         for synonym in self.synonyms:
-            requests.extend(self.get_synonym_requests(synonym, force=True))
+            requests.extend(self.get_synonym_requests(synonym))
         return requests

     def add_sources(self, sources):
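FourmiSpider.parse() above dispatches each response to the first registered source whose website pattern matches the response URL via re.match. A tiny sketch of that matching step on its own (the patterns and URL are illustrative):

    import re

    patterns = ['http://www.chemspider.com/*', 'http://webbook.nist.gov/*']
    url = 'http://webbook.nist.gov/cgi/cbook.cgi?Name=water'

    for pattern in patterns:
        if re.match(pattern, url):
            print('handled by %s' % pattern)
            break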
GUI/__init__.py  (-1)
···
-import gui
GUI/configImporter.py  (-30)
···
-import ConfigParser
-
-
-class ConfigImporter():
-    def __init__(self, filename):
-        """Read the filename into the parser."""
-        self.filename = filename
-        self.parser = ConfigParser.ConfigParser()
-        self.parser.read(self.filename)
-
-    def load_common_attributes(self):
-        """Loads common attributes from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'CommonParameters')
-        except:
-            return 'One, Two, Three'
-
-    def load_output_types(self):
-        """Loads output types from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'OutputTypes')
-        except:
-            return 'csv'
-
-    def load_always_attributes(self):
-        """Loads attributes that are always searched for from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'AlwaysParameters')
-        except:
-            return 'Name, Weight'
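For context, the GUI (removed in the next file) consumed this ConfigImporter roughly as follows; a sketch based on the calls visible in gui.py below, with 'GUI.cfg' as an illustrative path:

    from configImporter import ConfigImporter

    importer = ConfigImporter('GUI.cfg')
    common = [x.strip() for x in importer.load_common_attributes().split(',')]
    outputs = [x.strip() for x in importer.load_output_types().split(',')]
    print('%s %s' % (common, outputs))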
GUI/gui.py  (-196)
···
-from Tkinter import *
-import os
-import shutil
-from tkFileDialog import asksaveasfilename
-
-from configImporter import *
-
-
-class GUI():
-    def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
-        """Boots the window, configuration."""
-        if not in_source:
-            current_dir = os.path.dirname(os.path.abspath(__file__))
-            config_file = current_dir + '../' + config_file
-        if not os.path.isfile(config_file):
-            try:
-                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
-            except IOError:
-                print "GUI configuration couldn't be found and couldn't be created."
-                sys.exit()
-        self.configurator = ConfigImporter(config_file)
-        self.sourceloader = sourceloader
-        self.finish_with_search = False
-        self.values = {}
-        self.required_variables = ['substance']
-        self.search = search
-        self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())
-
-    def load_common_attributes(self):
-        """Calls the configuration parser for common attributes."""
-        return [x.strip() for x in self.configurator.load_common_attributes().split(',')]
-
-    def load_output_types(self):
-        """Calls the configuration parser for output types."""
-        return [x.strip() for x in self.configurator.load_output_types().split(',')]
-
-    def load_always_attributes(self):
-        """Calls the configuration parser for attributes that are always used."""
-        return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])
-
-    def set_output(self):
-        self.variable_output_name.set(asksaveasfilename())
-        self.button_output_name.config(text=self.variable_output_name.get())
-
-    def generate_window(self, common_attributes, output_types):
-        """Creates all widgets and variables in the window."""
-        window = Tk()
-        window.wm_title("Fourmi Crawler")
-
-        variables = {}
-
-        variable_substance = StringVar(window)
-        frame_substance = Frame(window)
-        label_substance = Label(frame_substance, text="Substance: ")
-        input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
-        variables.update({"substance": variable_substance})
-        frame_substance.pack(side=TOP)
-        label_substance.pack()
-        input_substance.pack()
-        input_substance.focus()
-
-        frame_all_attributes = Frame(window)
-        frame_selecting_attributes = Frame(frame_all_attributes)
-        frame_new_attributes = Frame(frame_selecting_attributes)
-        label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
-        input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
-        variables.update({"new_attributes": input_new_attributes})
-        frame_new_attributes.pack(side=LEFT)
-        label_new_attributes.pack()
-        input_new_attributes.pack()
-
-        frame_common_attributes = Frame(frame_selecting_attributes)
-        label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
-        input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
-        scrollbar_common_attributes = Scrollbar(frame_common_attributes)
-        input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
-        scrollbar_common_attributes.config(command=input_common_attributes.yview)
-        if common_attributes and len(common_attributes) > 0:
-            input_common_attributes.insert(END, *common_attributes)
-        variables.update({"common_attributes": input_common_attributes})
-        frame_common_attributes.pack(side=RIGHT)
-        label_common_attributes.pack(side=TOP)
-        input_common_attributes.pack(side=LEFT)
-        scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
-        frame_selecting_attributes.pack()
-
-        frame_last = Frame(window)
-        search_button = Button(frame_last, text="Start search", command=self.prepare_search)
-        cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
-        frame_last.pack(side=BOTTOM)
-        search_button.pack(side=LEFT)
-        cancel_button.pack(side=RIGHT)
-
-        frame_name = Frame(window)
-        frame_output_name = Frame(frame_name)
-        label_output_name = Label(frame_output_name, text='Output file:')
-        self.variable_output_name = StringVar()
-        self.variable_output_name.set('results.csv')
-        variables.update({'output_name':self.variable_output_name})
-        self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
-        frame_output_name.pack(side=LEFT)
-        label_output_name.pack()
-        self.button_output_name.pack()
-        frame_name.pack(side=BOTTOM)
-
-
-        frame_checkboxes = Frame(window)
-        frame_checkbox_attributes = Frame(frame_checkboxes)
-        variable_all_attributes = BooleanVar()
-        variable_all_attributes.set(True)
-        input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
-                                           variable=variable_all_attributes)
-        variables.update({"all_attributes": variable_all_attributes})
-        frame_checkbox_attributes.pack(side=LEFT)
-        input_all_attributes.pack()
-
-        frame_logging = Frame(frame_checkboxes)
-        variable_logging = BooleanVar()
-        variable_logging.set(False)
-        input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
-        variables.update({'logging':variable_logging})
-        frame_logging.pack(side=RIGHT)
-        frame_checkboxes.pack(side=BOTTOM)
-        input_logging.pack()
-        frame_all_attributes.pack()
-
-        return window, variables
-
-    def prepare_search(self):
-        """Saves the values from the window for later retrieval."""
-        variables = self.variables
-        values = {}
-
-        values.update({"Always attributes": self.load_always_attributes()})
-        for name, var in variables.iteritems():
-            if var.__class__ is StringVar:
-                values.update({name: var.get()})
-            elif var.__class__ is BooleanVar:
-                values.update({name: var.get()})
-            elif var.__class__ is Text:
-                values.update({name: str(var.get("1.0", END)).strip()})
-            elif var.__class__ is Listbox:
-                values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
-            else:
-                print "No known class, {}, {}".format(name, var)
-
-        values.update({'output_name':self.variable_output_name.get()})
-        values.update({'output_type':self.check_output_type(values.get('output_name'))})
-
-        self.values = values
-        if all([values.get(i) != '' for i in self.required_variables]):
-            self.finish_with_search = True
-            self.window.destroy()
-        else:
-            self.finish_with_search = False
-            #tkMessageBox.showinfo('Not all required information was entered!')
-
-    def execute_search(self):
-        """Calls the Fourmi crawler with the values from the GUI"""
-        if self.values.get('all_attributes'):
-            attributes = ".*"
-        else:
-            attribute_types = ['attributes', 'Common attributes', 'Always attributes']
-            attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
-        output_file = "file://" + str(self.values.get('output_name'))  # Dealing with absolute paths
-
-        arguments = {'--attributes': attributes,
-                     '--exclude': None,
-                     '--format': self.values.get('output_type'),
-                     '--help': False,
-                     '--include': None,
-                     '--log': 'log.txt',
-                     '--output': output_file,
-                     '-v': 0 if self.values.get('logging') else 3,
-                     '--version': False,
-                     '<compound>': self.values.get('substance'),
-                     'list': False,
-                     'search': True}
-
-        self.search(arguments, self.sourceloader)
-
-    def run(self):
-        """Starts the window and the search."""
-        self.window.mainloop()
-        if self.finish_with_search:
-            self.execute_search()
-
-    def check_output_type(self, filename):
-        parts = str(filename).split('.')
-        output_types = self.load_output_types()
-        extension = parts[-1]
-
-        for type in output_types:
-            if extension==type:
-                return extension
-        return output_types[0]
GUI.cfg.sample  (-10)
···
-[GUI]
-# Personalize options in your User Interface
-
-# Commonly used parameters are listed in the GUI for easy selection
-CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
-
-# Parameters that are always used in the search
-AlwaysParameters = Name
-
-OutputTypes = csv, json, jsonlines, xml
README.md  (+12, -7)
···
 # Fourmi

-**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
+**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)

-**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
+**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)

 Fourmi is a web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
···

 ### Installing

-If you're installing Fourmi, please take a look at our installation guides
-on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
-usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
+If you're installing Fourmi, please take a look at our [installation guide](...)
+on our wiki. When you've installed the application, make sure to check our
+[usage guide](...).

 ### Using the Source

 To use the Fourmi source code multiple dependencies are required. Take a look at
-our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step
+the [wiki page](...) on using the application source code for a step by step
 installation guide.

 When developing for the Fourmi project keep in mind that code readability is a
 must. To maintain the readability, code should conform to the
 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
 code. More information about the different structures and principles of the
-Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
+Fourmi application can be found on our [wiki](...).

 ### To Do
···

 __Main goals:__

+- Improve our documentation and guides. (Assignee: Dekker)
 - Build a graphical user interface (GUI) as an alternative for the command line
   interface (CLI). (Assignee: Harmen)
 - Compiling the source into a Windows executable. (Assignee: Bas)
+- Create a configuration file to hold logins and API keys.
+- Determine the reliability of our data points.
+- Create a module to gather data from NIST. (Assignee: Rob)
+- Create a module to gather data from PubChem. (Assignee: Nout)

 __Side goals:__


SIGNED.md  (-108)
···
-##### Signed by https://keybase.io/jdekker
-```
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v1.4.11 (GNU/Linux)
-
-iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa
-flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt
-ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+
-+mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U
-IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2
-Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb
-RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq
-WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ
-vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH
-V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK
-TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD
-jUY6EwEuOzDCexWP4eM8
-=h6TK
------END PGP SIGNATURE-----
-
-```
-
-<!-- END SIGNATURES -->
-
-### Begin signed statement
-
-#### Expect
-
-```
-size exec file contents
-./
-412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614
-548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546
-846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed
-FourmiCrawler/
-0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
-304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
-2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
-677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94
-sources/
-12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce
-12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee
-6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481
-6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870
-0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
-1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc
-3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3
-GUI/
-11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3
-940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271
-8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5
-299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483
-1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
-3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac
-3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a
-261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
-416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70
-tests/
-107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670
-2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d
-1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8
-1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
-1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
-2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
-utils/
-40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a
-4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5
-2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
-```
-
-#### Ignore
-
-```
-/SIGNED.md
-```
-
-#### Presets
-
-```
-git      # ignore .git and anything as described by .gitignore files
-dropbox  # ignore .dropbox-cache and other Dropbox-related files
-kb       # ignore anything as described by .kbignore files
-```
-
-<!-- summarize version = 0.0.9 -->
-
-### End signed statement
-
-<hr>
-
-#### Notes
-
-With keybase you can sign any directory's contents, whether it's a git repo,
-source code distribution, or a personal documents folder. It aims to replace the drudgery of:
-
-1. comparing a zipped file to a detached statement
-2. downloading a public key
-3. confirming it is in fact the author's by reviewing public statements they've made, using it
-
-All in one simple command:
-
-```bash
-keybase dir verify
-```
-
-There are lots of options, including assertions for automating your checks.
-
-For more info, check out https://keybase.io/docs/command_line/code_signing
+51
-23
fourmi.py
+51
-23
fourmi.py
···
1
-
#!/usr/bin/env python
1
+
# !/usr/bin/env python
2
2
"""
3
-
Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms).
3
+
Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
4
4
5
5
Usage:
6
-
fourmi
7
6
fourmi search <compound>
8
7
fourmi [options] search <compound>
9
-
fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
8
+
fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
10
9
fourmi list
11
10
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
12
11
fourmi -h | --help
···
16
15
--attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
17
16
-h --help Show this screen.
18
17
--version Show version.
19
-
-v Verbose logging output. (Multiple occurrences increase logging level)
18
+
--verbose Verbose logging output.
20
19
--log=<file> Save log to a file.
21
-
-o <file> --output=<file> Output file [default: <compound>.*format*]
22
-
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
20
+
-o <file> --output=<file> Output file [default: result.*format*]
21
+
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
23
22
--include=<regex> Include only sources that match these regular expressions split by a comma.
24
23
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
25
24
"""
26
25
27
26
from twisted.internet import reactor
28
27
from scrapy.crawler import Crawler
29
-
from scrapy import signals, log
28
+
from scrapy import log, signals
29
+
from scrapy.utils.project import get_project_settings
30
30
import docopt
31
31
32
32
from FourmiCrawler.spider import FourmiSpider
33
-
from utils.configurator import Configurator
34
-
from utils.sourceloader import SourceLoader
35
-
from GUI import gui
33
+
from sourceloader import SourceLoader
36
34
37
35
38
36
def setup_crawler(compound, settings, source_loader, attributes):
···
52
50
crawler.start()
53
51
54
52
53
+
def scrapy_settings_manipulation(docopt_arguments):
54
+
"""
55
+
This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
56
+
project these are command line arguments.
57
+
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
58
+
"""
59
+
settings = get_project_settings()
60
+
61
+
if docopt_arguments["--output"] != 'result.*format*':
62
+
settings.overrides["FEED_URI"] = docopt_arguments["--output"]
63
+
elif docopt_arguments["--format"] == "jsonlines":
64
+
settings.overrides["FEED_URI"] = "results.json"
65
+
elif docopt_arguments["--format"] is not None:
66
+
settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
67
+
68
+
if docopt_arguments["--format"] is not None:
69
+
settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
70
+
71
+
return settings
72
+
73
+
74
+
def start_log(docopt_arguments):
75
+
"""
76
+
This function starts the logging functionality of Scrapy using the settings given by the CLI.
77
+
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
78
+
"""
79
+
if docopt_arguments["--log"] is not None:
80
+
if docopt_arguments["--verbose"]:
81
+
log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
82
+
else:
83
+
log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
84
+
else:
85
+
if docopt_arguments["--verbose"]:
86
+
log.start(logstdout=False, loglevel=log.DEBUG)
87
+
else:
88
+
log.start(logstdout=True, loglevel=log.WARNING)
89
+
90
+
55
91
def search(docopt_arguments, source_loader):
56
92
"""
57
93
The function that facilitates the search for a specific compound.
58
94
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
59
95
:param source_loader: An initialized SourceLoader object pointed at the directory with the sources.
60
96
"""
61
-
conf = Configurator()
62
-
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
63
-
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
64
-
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
65
-
source_loader, docopt_arguments["--attributes"].split(','))
66
-
if conf.scrapy_settings.getbool("LOG_ENABLED"):
67
-
log.start(conf.scrapy_settings.get("LOG_FILE"),
68
-
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
97
+
start_log(docopt_arguments)
98
+
settings = scrapy_settings_manipulation(docopt_arguments)
99
+
setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
69
100
reactor.run()
70
101
71
102
72
103
# The start for the Fourmi Command Line interface.
73
104
if __name__ == '__main__':
74
-
arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
105
+
arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
75
106
loader = SourceLoader()
76
107
77
108
if arguments["--include"]:
···
84
115
elif arguments["list"]:
85
116
print "-== Available Sources ==-"
86
117
print str(loader)
87
-
else:
88
-
gui_window = gui.GUI(search, sourceloader=SourceLoader())
89
-
gui_window.run()
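
The scrapy_settings_manipulation function added above takes over the old Configurator.set_output logic: an explicit --output always wins, otherwise the feed URI is derived from --format, with the jsonlines format still written to results.json. A standalone restatement of that mapping (hypothetical helper name, plain dict instead of Scrapy's settings object) so it can be checked without Scrapy installed:

```python
def resolve_feed_overrides(arguments):
    """Mirror the --output/--format handling from scrapy_settings_manipulation above."""
    overrides = {}
    if arguments["--output"] != 'result.*format*':      # an explicit output file wins
        overrides["FEED_URI"] = arguments["--output"]
    elif arguments["--format"] == "jsonlines":          # default name for jsonlines output
        overrides["FEED_URI"] = "results.json"
    elif arguments["--format"] is not None:             # otherwise derive the name from the format
        overrides["FEED_URI"] = "results." + arguments["--format"]
    if arguments["--format"] is not None:
        overrides["FEED_FORMAT"] = arguments["--format"]
    return overrides


# e.g. `fourmi -f csv search methane` leaves --output at its default:
print resolve_feed_overrides({"--output": 'result.*format*', "--format": "csv"})
# -> FEED_URI 'results.csv', FEED_FORMAT 'csv'
```
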
+18
setup.py
+18
setup.py
···
1
+
import sys
2
+
from cx_Freeze import setup, Executable
3
+
4
+
# After running the setup file (python setup.py build), the scrapy/VERSION file has to be put manually into
5
+
# library.zip, and the FourmiCrawler folder has to be copied into both the library and the exe.win32-2.7 folder. After
6
+
# putting the files in the library, the library has to be re-zipped to replace the old one.
7
+
# Dependencies are automatically detected, but it might need fine tuning.
8
+
build_exe_options = {"packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"], "excludes": []}
9
+
10
+
# GUI applications require a different base on Windows (the default is for a
11
+
# console application).
12
+
base = None
13
+
14
+
setup( name = "Scrapy",
15
+
version = "0.1",
16
+
description = "My GUI application!",
17
+
options = {"build_exe": build_exe_options},
18
+
executables = [Executable("fourmi.py", base=base)])
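
The manual steps in the comment are presumably needed because cx_Freeze bundles the code next to the generated executable, which is also why the new sourceloader.py below has to tell a frozen build apart from a source checkout via sys.frozen. A small illustrative sketch of that check:

```python
import os
import sys

# cx_Freeze sets sys.frozen on the bundled executable; it is absent when
# running straight from the source tree.
if hasattr(sys, 'frozen'):
    base_path = os.path.dirname(sys.executable)             # directory of the frozen .exe
else:
    base_path = os.path.dirname(os.path.abspath(__file__))  # directory of this source file

sources_path = os.path.join(base_path, "FourmiCrawler", "sources")
print sources_path
```
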
+60
sourceloader.py
+60
sourceloader.py
···
1
+
import inspect
2
+
import sys
3
+
import os
4
+
import re
5
+
6
+
from FourmiCrawler.sources.source import Source
7
+
8
+
9
+
class SourceLoader:
10
+
sources = []
11
+
12
+
def __init__(self, rel_dir="FourmiCrawler/sources"):
13
+
14
+
if hasattr(sys, 'frozen'):
15
+
path = os.path.dirname(sys.executable)
16
+
else:
17
+
path = os.path.dirname(os.path.abspath(__file__))
18
+
19
+
path += "/" + rel_dir
20
+
known_parser = set()
21
+
22
+
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
23
+
mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py])
24
+
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
25
+
for cls in classes:
26
+
if issubclass(cls, Source) and cls not in known_parser:
27
+
self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers?
28
+
# known_parser.add(cls)
29
+
30
+
def include(self, source_names):
31
+
"""
32
+
This function excludes all sources that don't match the given regular expressions.
33
+
:param source_names: A list of regular expressions (strings)
34
+
"""
35
+
new = set()
36
+
for name in source_names:
37
+
new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
38
+
self.sources = list(new)
39
+
40
+
def exclude(self, source_names):
41
+
"""
42
+
This function excludes all sources that match the given regular expressions.
43
+
:param source_names: A list of regular expressions (strings)
44
+
"""
45
+
exclude = []
46
+
for name in source_names:
47
+
exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
48
+
self.sources = [src for src in self.sources if src not in exclude]
49
+
50
+
def __str__(self):
51
+
"""
52
+
This function returns a string with all sources currently available in the SourceLoader.
53
+
:return: a string with all available sources.
54
+
"""
55
+
string = ""
56
+
for src in self.sources:
57
+
string += "Source: " + src.__class__.__name__
58
+
string += " - "
59
+
string += "URI: " + src.website + "\n"
60
+
return string
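
Because include and exclude use re.match against the source class names, the patterns are anchored at the start of the name. A hypothetical usage sketch (it assumes the project's dependencies are installed and the source classes from FourmiCrawler/sources, such as ChemSpider and WikipediaParser, import cleanly):

```python
from sourceloader import SourceLoader

loader = SourceLoader()
print str(loader)              # lists every Source subclass found in FourmiCrawler/sources

# Patterns are matched with re.match, so they are anchored at the start of the class name.
loader.include(['Chem.*'])     # keep only sources whose class name starts with "Chem"
print str(loader)              # e.g. only ChemSpider remains

# loader.exclude(['Wikipedia.*']) would instead drop WikipediaParser and keep the rest.
```
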
-19
sources.cfg.sample
-19
sources.cfg.sample
···
1
-
[DEFAULT]
2
-
reliability = Unknown
3
-
4
-
#For each source listed in FourmiCrawler/sources there should be a section
5
-
#named exactly as the filename in here. If not present, the DEFAULT value is
6
-
#used for reliability of that source.
7
-
8
-
[ChemSpider]
9
-
reliability = High
10
-
#token=Paste ChemSpider API token here and remove the hashtag
11
-
12
-
[NIST]
13
-
reliability = High
14
-
15
-
[WikipediaParser]
16
-
reliability = Medium
17
-
18
-
[PubChem]
19
-
reliability = High
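
The removed sample illustrates the lookup order that utils/configurator.py (also removed below) relied on: a source's own section overrides [DEFAULT], and [DEFAULT] supplies reliability for sources without a section of their own. A small illustration of how ConfigParser resolves that, using an in-memory copy of two of the sections above:

```python
import ConfigParser
import StringIO

sample = (
    "[DEFAULT]\n"
    "reliability = Unknown\n"
    "\n"
    "[ChemSpider]\n"
    "reliability = High\n"
)

config = ConfigParser.ConfigParser()
config.readfp(StringIO.StringIO(sample))

print dict(config.items('ChemSpider'))  # section value wins: {'reliability': 'High'}
print config.defaults()                 # fallback used for sources without a section:
                                        # {'reliability': 'Unknown'}
```
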
-5
tests/__init__.py
-5
tests/__init__.py
-68
tests/test_configurator.py
-68
tests/test_configurator.py
···
1
-
import unittest
2
-
import ConfigParser
3
-
4
-
from utils.configurator import Configurator
5
-
6
-
7
-
class TestConfigurator(unittest.TestCase):
8
-
9
-
def setUp(self):
10
-
self.conf = Configurator()
11
-
12
-
def test_set_output(self):
13
-
self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
14
-
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
15
-
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
16
-
17
-
self.conf.set_output("<compound>.*format*", "jsonlines", "test")
18
-
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
19
-
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
20
-
21
-
self.conf.set_output("<compound>.*format*", "csv", "test")
22
-
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
23
-
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
24
-
25
-
def test_start_log(self):
26
-
for i in range(0, 3):
27
-
self.conf.set_logging("TEST", i)
28
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
29
-
if i > 0:
30
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
31
-
if i > 1:
32
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
33
-
else:
34
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
35
-
else:
36
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
37
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
38
-
if i == 1:
39
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
40
-
elif i == 2:
41
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
42
-
elif i == 3:
43
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
44
-
45
-
self.conf.set_logging(verbose=i)
46
-
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
47
-
48
-
def test_read_sourceconfiguration(self):
49
-
config = self.conf.read_sourceconfiguration()
50
-
self.assertIsInstance(config, ConfigParser.ConfigParser)
51
-
52
-
def test_get_section(self):
53
-
config = ConfigParser.ConfigParser()
54
-
section = self.conf.get_section(config, 'test')
55
-
self.assertIn('reliability', section)
56
-
self.assertEquals(section['reliability'], '')
57
-
58
-
config.set('DEFAULT', 'reliability', 'Low')
59
-
60
-
section = self.conf.get_section(config, 'test')
61
-
self.assertEquals(section['reliability'], 'Low')
62
-
63
-
config.add_section('test')
64
-
config.set('test', 'var', 'Maybe')
65
-
66
-
section = self.conf.get_section(config, 'test')
67
-
self.assertEquals(section['reliability'], 'Low')
68
-
self.assertEqual(section['var'], 'Maybe')
-32
tests/test_gui.py
-32
tests/test_gui.py
···
1
-
import unittest
2
-
3
-
from GUI import gui
4
-
5
-
class TestGUI(unittest.TestCase):
6
-
def setUp(self):
7
-
pass
8
-
9
-
def test_empty_attributes(self):
10
-
self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample", in_source=True)
11
-
self.test_gui.window.after(9, self.test_gui.prepare_search)
12
-
self.test_gui.window.after(11, self.test_gui.window.destroy)
13
-
self.test_gui.run()
14
-
15
-
output_type = self.test_gui.configurator.load_output_types().split(',')[0]
16
-
17
-
self.assertEqual(self.test_gui.values.get('substance'), '')
18
-
self.assertEqual(self.test_gui.values.get('output_type'), output_type)
19
-
self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
20
-
21
-
22
-
def test_no_configurations(self):
23
-
self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample")
24
-
self.test_gui.configurator = gui.ConfigImporter('')
25
-
self.test_gui.finish_with_search = True
26
-
self.test_gui.window.after(9, self.test_gui.prepare_search)
27
-
self.test_gui.window.after(11, self.test_gui.window.destroy)
28
-
self.test_gui.run()
29
-
30
-
self.assertEqual(self.test_gui.values.get('substance'), '')
31
-
self.assertEqual(self.test_gui.values.get('output_type'), 'csv')
32
-
self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
-1
tests/test_pipeline.py
-1
tests/test_pipeline.py
···
13
13
def test_none_pipeline(self):
14
14
# Testing the pipeline that replaces the None values in items.
15
15
self.testItem["value"] = "abc"
16
-
self.testItem["source"] = None
17
16
pipe = pipelines.RemoveNonePipeline()
18
17
processed = pipe.process_item(self.testItem, spider.FourmiSpider())
19
18
+1
-1
tests/test_sourceloader.py
+1
-1
tests/test_sourceloader.py
+5
-7
tests/test_spider.py
+5
-7
tests/test_spider.py
···
3
3
from scrapy.http import Request
4
4
5
5
from FourmiCrawler import spider
6
-
from FourmiCrawler.sources.NIST import NIST
6
+
from FourmiCrawler.sources.ChemSpider import ChemSpider
7
7
from FourmiCrawler.sources.source import Source
8
8
9
9
···
41
41
self.spi.add_source(src)
42
42
self.assertEqual(self.spi.start_requests(), [])
43
43
44
-
src2 = NIST()
44
+
src2 = ChemSpider()
45
45
self.spi.add_source(src2)
46
-
requests = self.spi.start_requests()
47
-
self.assertGreater(len(requests), 0)
48
-
self.assertIsInstance(requests[0], Request)
46
+
self.assertIsNotNone(self.spi.start_requests())
49
47
50
48
def test_synonym_requests(self):
51
49
# A test for the synonym request function
···
56
54
self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
57
55
self.assertIn("new_compound", self.spi.synonyms)
58
56
59
-
src2 = NIST()
57
+
src2 = ChemSpider()
60
58
self.spi.add_source(src2)
61
59
self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
62
60
self.assertIn("other_compound", self.spi.synonyms)
63
-
self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
61
+
self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
-101
utils/configurator.py
-101
utils/configurator.py
···
1
-
import ConfigParser
2
-
import os
3
-
import shutil
4
-
5
-
from scrapy.utils.project import get_project_settings
6
-
7
-
8
-
class Configurator:
9
-
"""
10
-
A helper class for the fourmi module. This class is used to process the settings as set
11
-
from one of the Fourmi applications.
12
-
"""
13
-
14
-
def __init__(self):
15
-
self.scrapy_settings = get_project_settings()
16
-
17
-
def set_output(self, filename, fileformat, compound):
18
-
"""
19
-
This function manipulates the Scrapy output file settings that normally would be set in the settings file.
20
-
In the Fourmi project these are command line arguments.
21
-
:param filename: The filename of the file where the output will be put.
22
-
:param fileformat: The format in which the output will be written.
23
-
"""
24
-
25
-
if filename != '<compound>.*format*':
26
-
self.scrapy_settings.overrides["FEED_URI"] = filename
27
-
elif fileformat == "jsonlines":
28
-
self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
29
-
elif fileformat is not None:
30
-
self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
31
-
32
-
if fileformat is not None:
33
-
self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
34
-
35
-
def set_logging(self, logfile=None, verbose=0):
36
-
"""
37
-
This function changes the default settings of Scrapy's logging functionality
38
-
using the settings given by the CLI.
39
-
:param logfile: The location where the logfile will be saved.
40
-
:param verbose: An integer value to switch between log levels.
41
-
"""
42
-
if verbose != 0:
43
-
self.scrapy_settings.overrides["LOG_ENABLED"] = True
44
-
else:
45
-
self.scrapy_settings.overrides["LOG_ENABLED"] = False
46
-
47
-
if verbose == 1:
48
-
self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
49
-
elif verbose == 2:
50
-
self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
51
-
else:
52
-
self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
53
-
54
-
if verbose > 1:
55
-
self.scrapy_settings.overrides["LOG_STDOUT"] = False
56
-
else:
57
-
self.scrapy_settings.overrides["LOG_STDOUT"] = True
58
-
59
-
if logfile is not None:
60
-
self.scrapy_settings.overrides["LOG_FILE"] = logfile
61
-
else:
62
-
self.scrapy_settings.overrides["LOG_FILE"] = None
63
-
64
-
@staticmethod
65
-
def read_sourceconfiguration():
66
-
"""
67
-
This function reads sources.cfg in the main folder for configuration
68
-
variables for sources
69
-
:return a ConfigParser object of sources.cfg
70
-
"""
71
-
current_dir = os.path.dirname(os.path.abspath(__file__))
72
-
config_path = current_dir + '/../sources.cfg'
73
-
# [TODO]: location of sources.cfg should be softcoded eventually
74
-
if not os.path.isfile(config_path):
75
-
try:
76
-
shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
77
-
except IOError:
78
-
print "WARNING: Source configuration couldn't be found and couldn't be created."
79
-
config = ConfigParser.ConfigParser()
80
-
config.read(config_path)
81
-
return config
82
-
83
-
@staticmethod
84
-
def get_section(config, sourcename):
85
-
"""
86
-
This function reads the config section named by sourcename and
87
-
checks whether the reliability variable is set; if not, it is set to an empty string.
88
-
The default section is returned if the named config section does not exist.
89
-
:param config: a ConfigParser object
90
-
:param sourcename: the name of the section to be read
91
-
:return a dictionary of the section in the config labeled in sourcename
92
-
"""
93
-
section = dict()
94
-
if config.has_section(sourcename):
95
-
section = dict(config.items(sourcename))
96
-
elif config.defaults():
97
-
section = config.defaults()
98
-
if 'reliability' not in section:
99
-
print 'WARNING: Reliability not set for %s' % sourcename
100
-
section['reliability'] = ''
101
-
return section
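
For reference, set_logging above collapses the counted -v flag into four Scrapy settings: logging is disabled at verbosity 0, -v maps to WARNING, -vv to INFO, -vvv (and above) to DEBUG, and stdout echoing is switched off from -vv upward. The same mapping as a small standalone function, illustrative only, with a plain dict instead of Scrapy's settings object:

```python
def logging_overrides(verbose=0, logfile=None):
    """Restate Configurator.set_logging above as a plain dictionary."""
    return {
        "LOG_ENABLED": verbose != 0,
        # 1 -> WARNING, 2 -> INFO, 3 or more -> DEBUG; the value chosen for 0 is
        # irrelevant because logging is disabled in that case.
        "LOG_LEVEL": {1: "WARNING", 2: "INFO"}.get(verbose, "DEBUG"),
        "LOG_STDOUT": verbose <= 1,   # -vv and above stop echoing the log to stdout
        "LOG_FILE": logfile,
    }


# e.g. `fourmi -vv --log=fourmi.log search <compound>`:
overrides = logging_overrides(verbose=2, logfile="fourmi.log")
print overrides["LOG_LEVEL"], overrides["LOG_STDOUT"]   # INFO False
```
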
-64
utils/sourceloader.py
-64
utils/sourceloader.py
···
1
-
import inspect
2
-
import os
3
-
import re
4
-
5
-
from FourmiCrawler.sources.source import Source
6
-
from utils.configurator import Configurator
7
-
8
-
9
-
class SourceLoader:
10
-
sources = []
11
-
12
-
def __init__(self, rel_dir="../FourmiCrawler/sources"):
13
-
"""
14
-
The initialization of a SourceLoader: selects and indexes a directory for usable sources.
15
-
Also loads a configuration file for Sources and passes the arguments in
16
-
the named section to the source
17
-
:param rel_dir: A relative path to a directory.
18
-
"""
19
-
path = os.path.dirname(os.path.abspath(__file__))
20
-
path += "/" + rel_dir
21
-
known_parser = set()
22
-
23
-
config = Configurator.read_sourceconfiguration()
24
-
25
-
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
26
-
mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
27
-
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
28
-
for cls in classes:
29
-
if issubclass(cls, Source) and cls not in known_parser:
30
-
sourcecfg = Configurator.get_section(config, cls.__name__)
31
-
self.sources.append(cls(sourcecfg))
32
-
known_parser.add(cls)
33
-
34
-
def include(self, source_names):
35
-
"""
36
-
This function excludes all sources that don't match the given regular expressions.
37
-
:param source_names: A list of regular expressions (strings)
38
-
"""
39
-
new = set()
40
-
for name in source_names:
41
-
new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
42
-
self.sources = list(new)
43
-
44
-
def exclude(self, source_names):
45
-
"""
46
-
This function excludes all sources that match the given regular expressions.
47
-
:param source_names: A list of regular expressions (strings)
48
-
"""
49
-
exclude = []
50
-
for name in source_names:
51
-
exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
52
-
self.sources = [src for src in self.sources if src not in exclude]
53
-
54
-
def __str__(self):
55
-
"""
56
-
This function returns a string with all sources currently available in the SourceLoader.
57
-
:return: a string with all available sources.
58
-
"""
59
-
string = ""
60
-
for src in self.sources:
61
-
string += "Source: " + src.__class__.__name__
62
-
string += " - "
63
-
string += "URI: " + src.website + "\n"
64
-
return string