A web scraper built to search for specific information about a given compound (and its pseudonyms)


-5
.gitignore
···
 #Python Specific ignores
 *.pyc

-#may contain authentication information
-sources.cfg
-#Another of our config files
-GUI.cfg
-
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
+2 -10
.travis.yml
···
 language: python
 python: 2.7

-before_install:
-  - "export DISPLAY=:99.0"
-  - "sh -e /etc/init.d/xvfb start"
-
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
   - pip install Scrapy docopt
-  - pip install coveralls

 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
+  - nosetests tests

 notifications:
-  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
-
-after_success:
-  coveralls --verbose
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
-20
Changelog.md
···
-### v0.6.0
-- Feature: Added a Graphical User interface
-- Feature: Automatic config file createion from config samples
-- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
-- FIX: A lot of bugfixes of the PubChem plugin, as is wasn't working as it should
-- FIX: Using absolute path for configuration files
-- DEV: General Code cleanup in documentation
-
-### v0.5.3
-- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
-- FIX: Logging is now "actually" disabled if not using the verbose option.
-- FEATURE: Added support for PubChem
-
-### v0.5.2
-- FIX: Signatured used to contain untracked and older files, current signature
-should be correct.
-
-### v0.5.1
-- UPDATED: Logging functionality from command line
-- DEV: Code cleanup and extra tests
+2 -1
FourmiCrawler/settings.py
···
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'

+
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

-USER_AGENT = 'Fourmi'
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+60 -127
FourmiCrawler/sources/ChemSpider.py
··· 10 10 11 11 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. 12 12 13 + 13 14 class ChemSpider(Source): 14 - """ 15 - ChemSpider scraper for synonyms and properties 15 + """ChemSpider scraper for synonyms and properties 16 + 16 17 This parser will manage searching for chemicals through the 17 18 ChemsSpider API, and parsing the resulting ChemSpider page. 18 19 The token required for the API should be in a configuration file 19 20 somewhere. 20 21 """ 21 22 22 - website = 'http://www\\.chemspider\\.com/.*' 23 + def __init__(self): 24 + Source.__init__(self) 23 25 24 - search = 'Search.asmx/SimpleSearch?query=%s&token=' 26 + website = 'http://www.chemspider.com/*' 27 + 28 + # [TODO] - Save and access token of specific user. 29 + search = ('Search.asmx/SimpleSearch?query=%s&token=' 30 + '052bfd06-5ce4-43d6-bf12-89eabefd2338') 25 31 structure = 'Chemical-Structure.%s.html' 26 - extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 32 + extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 33 + '052bfd06-5ce4-43d6-bf12-89eabefd2338') 27 34 28 - def __init__(self, config=None): 29 - """ 30 - Initialization of ChemSpider scraper 31 - :param config: a dictionary of settings for this scraper, must contain 32 - 'reliability' key 33 - """ 34 - Source.__init__(self, config) 35 - self.ignore_list = [] 36 - if 'token' not in self.cfg or self.cfg['token'] == '': 37 - log.msg('ChemSpider token not set or empty, search/MassSpec API ' 38 - 'not available', level=log.WARNING) 39 - self.cfg['token'] = '' 40 - self.search += self.cfg['token'] 41 - self.extendedinfo += self.cfg['token'] 35 + ignore_list = [] 42 36 43 37 def parse(self, response): 44 - """ 45 - This function is called when a Response matching the variable 46 - 'website' is available for parsing the Response object. 
47 - :param response: the Scrapy Response object to be parsed 48 - :return: a list of Result items and Request objects 49 - """ 50 38 sel = Selector(response) 51 39 requests = [] 52 40 requests_synonyms = self.parse_synonyms(sel) ··· 56 44 57 45 return requests 58 46 59 - def parse_properties(self, sel): 60 - """ 61 - This function scrapes the Experimental Data and Predicted ACD/Labs tabs 62 - :param sel: a Selector object of the whole page 63 - :return: a list of Result items 64 - """ 65 - properties = [] 66 - 67 - properties.extend(self.parse_acdlabstab(sel)) 68 - properties.extend(self.parse_experimentaldatatab(sel)) 69 - 70 - return properties 71 - 72 - def parse_acdlabstab(self, sel): 73 - """ 74 - This function scrapes the 'Predicted ACD/Labs tab' under Properties 75 - :param sel: a Selector object of the whole page 76 - :return: a list of Request objects 77 - """ 47 + @staticmethod 48 + def parse_properties(sel): 49 + """scrape Experimental Data and Predicted ACD/Labs tabs""" 78 50 properties = [] 79 51 52 + # Predicted - ACD/Labs tab 80 53 td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 81 54 'normalize-space(string())') 82 55 prop_names = td_list[::2] ··· 88 61 prop_conditions = '' 89 62 90 63 # Test for properties without values, with one hardcoded exception 91 - if (not re.match(r'^\d', prop_value) or 92 - (prop_name == 'Polarizability' and prop_value == '10-24cm3')): 64 + if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'): 93 65 continue 94 66 67 + # Match for condition in parentheses 95 68 m = re.match(r'(.*) \((.*)\)', prop_name) 96 69 if m: 97 70 prop_name = m.group(1) 98 71 prop_conditions = m.group(2) 99 72 73 + # Match for condition in value seperated by an 'at' 100 74 m = re.match(r'(.*) at (.*)', prop_value) 101 75 if m: 102 76 prop_value = m.group(1) 103 77 prop_conditions = m.group(2) 104 78 105 - new_prop = self.newresult( 106 - attribute=prop_name, 107 - value=prop_value, 108 - source='ChemSpider Predicted - ACD/Labs Tab', 109 - conditions=prop_conditions 110 - ) 79 + new_prop = Result({ 80 + 'attribute': prop_name, 81 + 'value': prop_value, 82 + 'source': 'ChemSpider Predicted - ACD/Labs Tab', 83 + 'reliability': 'Unknown', 84 + 'conditions': prop_conditions 85 + }) 111 86 properties.append(new_prop) 87 + log.msg('CS prop: |%s| |%s| |%s|' % 88 + (new_prop['attribute'], new_prop['value'], new_prop['source']), 89 + level=log.DEBUG) 112 90 113 - return properties 114 - 115 - def parse_experimentaldatatab(self, sel): 116 - """ 117 - This function scrapes Experimental Data tab, Physico-chemical 118 - properties in particular. 
119 - :param sel: a Selector object of the whole page 120 - :return: a list of Result items 121 - """ 122 - properties = [] 123 - 91 + # Experimental Data Tab, Physico-chemical properties in particular 124 92 scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' 125 93 'Properties"]//li/table/tr/td') 126 94 if not scraped_list: ··· 132 100 if line.xpath('span/text()'): 133 101 property_name = line.xpath('span/text()').extract()[0].rstrip() 134 102 else: 135 - new_prop = self.newresult( 136 - attribute=property_name[:-1], 137 - value=line.xpath('text()').extract()[0].rstrip(), 138 - source=line.xpath('strong/text()').extract()[0].rstrip(), 139 - ) 140 - properties.append(new_prop) 103 + new_prop = Result({ 104 + 'attribute': property_name[:-1], 105 + 'value': line.xpath('text()').extract()[0].rstrip(), 106 + 'source': line.xpath( 107 + 'strong/text()').extract()[0].rstrip(), 108 + 'reliability': 'Unknown', 109 + 'conditions': '' 110 + }) 111 + properties.append(new_prop) 112 + log.msg('CS prop: |%s| |%s| |%s|' % 113 + (new_prop['attribute'], new_prop['value'], 114 + new_prop['source']), level=log.DEBUG) 141 115 142 116 return properties 143 117 144 118 def parse_synonyms(self, sel): 145 - """ 146 - This function scrapes the list of Names and Identifiers 147 - :param sel: a Selector object of the whole page 148 - :return: a list of Requests 149 - """ 119 + """Scrape list of Names and Identifiers""" 150 120 requests = [] 151 121 synonyms = [] 152 122 ··· 178 148 return requests 179 149 180 150 def new_synonym(self, sel, name, category): 181 - """ 182 - This function scrapes for a single synonym at a given HTML tag 183 - :param sel: a Selector object of the given HTML tag 184 - :param name: the name of the synonym in the tag 185 - :param category: the name of the category the synonym is labeled as 186 - :return: a dictionary containing data on the synonym 187 - """ 151 + """Scrape for a single synonym at a given HTML tag""" 188 152 self.ignore_list.append(name) 189 153 language = sel.xpath('span[@class="synonym_language"]/text()') 190 154 if language: ··· 219 183 } 220 184 return synonym 221 185 222 - def parse_extendedinfo(self, response): 223 - """ 224 - This function scrapes data from the ChemSpider GetExtendedCompoundInfo 225 - API, if a token is present in the configuration settings 226 - :param response: a Response object to be parsed 227 - :return: a list of Result items 228 - """ 186 + @staticmethod 187 + def parse_extendedinfo(response): 188 + """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" 229 189 sel = Selector(response) 230 190 properties = [] 231 191 names = sel.xpath('*').xpath('name()').extract() 232 192 values = sel.xpath('*').xpath('text()').extract() 233 193 for (name, value) in zip(names, values): 234 - result = self.newresult( 235 - attribute=name, 236 - value=value, # These values have no unit! 237 - source='ChemSpider ExtendedCompoundInfo', 238 - ) 194 + result = Result({ 195 + 'attribute': name, 196 + 'value': value, # These values have no unit! 197 + 'source': 'ChemSpider ExtendedCompoundInfo', 198 + 'reliability': 'Unknown', 199 + 'conditions': '' 200 + }) 239 201 if result['value']: 240 202 properties.append(result) 241 203 return properties 242 204 243 - def newresult(self, attribute, value, conditions='', source='ChemSpider'): 244 - """ 245 - This function abstracts from the Result item and provides default 246 - values. 
247 - :param attribute: the name of the attribute 248 - :param value: the value of the attribute 249 - :param conditions: optional conditions regarding the value 250 - :param source: the name of the source if it is not ChemSpider 251 - :return: A Result item 252 - """ 253 - return Result({ 254 - 'attribute': attribute, 255 - 'value': value, 256 - 'source': source, 257 - 'reliability': self.cfg['reliability'], 258 - 'conditions': conditions 259 - }) 260 - 261 205 def parse_searchrequest(self, response): 262 - """ 263 - This function parses the initial response of the ChemSpider Search API 264 - Requires a valid token to function. 265 - :param response: the Response object to be parsed 266 - :return: A Request for the information page and a Request for the 267 - extendedinfo API call 268 - """ 206 + """Parse the initial response of the ChemSpider Search API """ 269 207 sel = Selector(response) 270 208 log.msg('chemspider parse_searchrequest', level=log.DEBUG) 271 209 sel.register_namespace('cs', 'http://www.chemspider.com/') ··· 277 215 log.msg('ChemSpider found multiple substances, taking first ' 278 216 'element', level=log.DEBUG) 279 217 csid = csids[0] 280 - structure_url = self.website[:-2].replace("\\", "") + self.structure % csid 281 - extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid 218 + structure_url = self.website[:-1] + self.structure % csid 219 + extendedinfo_url = self.website[:-1] + self.extendedinfo % csid 282 220 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) 283 221 return [Request(url=structure_url, 284 222 callback=self.parse), ··· 286 224 callback=self.parse_extendedinfo)] 287 225 288 226 def new_compound_request(self, compound): 289 - """ 290 - This function is called when a new synonym is returned to the spider 291 - to generate new requests 292 - :param compound: the name of the compound to search for 293 - """ 294 - if compound in self.ignore_list or self.cfg['token'] == '': 227 + if compound in self.ignore_list: # [TODO] - add regular expression 295 228 return None 296 - searchurl = self.website[:-2].replace("\\", "") + self.search % compound 229 + searchurl = self.website[:-1] + self.search % compound 297 230 log.msg('chemspider compound', level=log.DEBUG) 298 231 return Request(url=searchurl, callback=self.parse_searchrequest)
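The ChemSpider plugin on the removal side of this diff reads its API token and reliability rating from its configuration (`self.cfg['token']`, `self.cfg['reliability']`), which is why `sources.cfg` is flagged in .gitignore as possibly containing authentication information. A minimal sketch of what such a file could look like; the section and key names here are assumptions inferred from the code in this hunk, not the project's documented `sources.cfg.sample`:

    ; Hypothetical sources.cfg sketch - keep it out of version control,
    ; since the ChemSpider token is an API credential.
    [DEFAULT]
    reliability = Unknown

    [ChemSpider]
    token = 00000000-0000-0000-0000-000000000000
    reliability = High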
+83 -141
FourmiCrawler/sources/NIST.py
··· 13 13 # Result item, but should be included eventually. 14 14 15 15 class NIST(Source): 16 - """ 17 - NIST Scraper plugin 16 + """NIST Scraper plugin 17 + 18 18 This plugin manages searching for a chemical on the NIST website 19 19 and parsing the resulting page if the chemical exists on NIST. 20 20 """ 21 - website = "http://webbook\\.nist\\.gov/.*" 21 + website = "http://webbook.nist.gov/*" 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 25 - def __init__(self, config=None): 26 - """ 27 - Initialization of NIST scraper 28 - :param config: configuration variables for this scraper, must contain 29 - 'reliability' key. 30 - """ 31 - Source.__init__(self, config) 32 - self.ignore_list = set() 25 + ignore_list = set() 26 + 27 + def __init__(self): 28 + Source.__init__(self) 33 29 34 30 def parse(self, response): 35 - """ 36 - This function is called when a Response matching the variable 37 - 'website' is available for parsing the Response object. 38 - :param response: The Scrapy Response object to be parsed 39 - :return: a list of Result items and Request objects 40 - """ 41 31 sel = Selector(response) 42 32 43 33 title = sel.xpath('head/title/text()').extract()[0] ··· 62 52 log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name), 63 53 level=log.DEBUG) 64 54 65 - requests.extend(self.parse_tables(sel, symbol_table)) 66 - 67 - return requests 68 - 69 - def parse_tables(self, sel, symbol_table): 70 - """ 71 - This function identifies and distributes parsing of tables to other 72 - functions below. 73 - :param sel: A Selector object of the whole page 74 - :param symbol_table: a dictionary containing translations of raw HTML 75 - tags to human readable names 76 - :return: a list of Result items and Requests 77 - """ 78 - requests = [] 79 - 80 55 for table in sel.xpath('//table[@class="data"]'): 81 56 summary = table.xpath('@summary').extract()[0] 82 57 if summary == 'One dimensional data': ··· 107 82 return requests 108 83 109 84 def parse_generic_info(self, sel): 110 - """ 111 - This function parses: synonyms, chemical formula, molecular weight, 112 - InChI, InChiKey, CAS number 113 - :param sel: A Selector object of the entire page in the original 114 - response 115 - :return: a list of Result items 85 + """Parses: synonyms, chemical formula, molecular weight, InChI, 86 + InChiKey, CAS number 116 87 """ 117 88 ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') 89 + li = ul.xpath('li') 118 90 119 91 raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() 120 92 for synonym in raw_synonyms[0].strip().split(';\n'): ··· 142 114 143 115 requests = [] 144 116 for key, value in data.iteritems(): 145 - result = self.newresult( 146 - attribute=key, 147 - value=value 148 - ) 117 + result = Result({ 118 + 'attribute': key, 119 + 'value': value, 120 + 'source': 'NIST', 121 + 'reliability': 'Unknown', 122 + 'conditions': '' 123 + }) 149 124 requests.append(result) 150 125 151 126 return requests 152 127 153 128 def parse_aggregate_data(self, table, symbol_table): 154 - """ 155 - This function parses the table(s) which contain possible links to 156 - individual data points 157 - :param table: a Selector object of the table to be parsed 158 - :param symbol_table: a dictionary containing translations of raw HTML 159 - tags to human readable names 160 - :return: a list of Result items and Request objects 129 + """Parses the table(s) which contain possible links to individual 130 + data points 161 131 """ 162 132 results = [] 163 133 for tr in 
table.xpath('tr[td]'): 164 134 extra_data_url = tr.xpath('td[last()][a="Individual data points"]' 165 135 '/a/@href').extract() 166 136 if extra_data_url: 167 - request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0], 137 + request = Request(url=self.website[:-1] + extra_data_url[0], 168 138 callback=self.parse_individual_datapoints) 169 139 results.append(request) 170 140 continue ··· 180 150 name = m.group(1) 181 151 condition = m.group(2) 182 152 183 - result = self.newresult( 184 - attribute=name, 185 - value=data[1] + ' ' + data[2], 186 - conditions=condition 187 - ) 153 + result = Result({ 154 + 'attribute': name, 155 + 'value': data[1] + ' ' + data[2], 156 + 'source': 'NIST', 157 + 'reliability': 'Unknown', 158 + 'conditions': condition 159 + }) 188 160 log.msg('NIST: |%s|' % data, level=log.DEBUG) 189 161 results.append(result) 190 162 return results 191 163 192 - def parse_transition_data(self, table, summary): 193 - """ 194 - This function parses the table containing properties regarding phase 195 - changes 196 - :param table: a Selector object of the table to be parsed 197 - :param summary: the name of the property 198 - :return: a list of Result items 199 - """ 164 + @staticmethod 165 + def parse_transition_data(table, summary): 166 + """Parses the table containing properties regarding phase changes""" 200 167 results = [] 201 168 202 - unit = self.get_unit(table) 169 + tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) 170 + m = re.search(r'\((.*)\)', tr_unit) 171 + unit = '!' 172 + if m: 173 + unit = m.group(1) 203 174 204 175 for tr in table.xpath('tr[td]'): 205 176 tds = tr.xpath('td/text()').extract() 206 - result = self.newresult( 207 - attribute=summary, 208 - value=tds[0] + ' ' + unit, 209 - conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) 210 - ) 177 + result = Result({ 178 + 'attribute': summary, 179 + 'value': tds[0] + ' ' + unit, 180 + 'source': 'NIST', 181 + 'reliability': 'Unknown', 182 + 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) 183 + }) 211 184 results.append(result) 212 185 213 186 return results 214 187 215 - def parse_generic_data(self, table, summary): 216 - """ 217 - Parses the common tables of 4 and 5 rows. Assumes they are of the 188 + @staticmethod 189 + def parse_generic_data(table, summary): 190 + """Parses the common tables of 4 and 5 rows. Assumes they are of the 218 191 form: 219 192 Symbol (unit)|Temperature (K)|Method|Reference|Comment 220 193 Symbol (unit)|Temperature (K)|Reference|Comment 221 - :param table: a Selector object of the table to be parsed 222 - :param summary: the name of the property 223 - :return: a list of Result items 224 194 """ 225 195 results = [] 226 196 227 - unit = self.get_unit(table) 197 + tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) 198 + m = re.search(r'\((.*)\)', tr_unit) 199 + unit = '!' 
200 + if m: 201 + unit = m.group(1) 228 202 229 203 for tr in table.xpath('tr[td]'): 230 204 tds = tr.xpath('td/text()').extract() 231 - result = self.newresult( 232 - attribute=summary, 233 - value=tds[0] + ' ' + unit, 234 - conditions='%s K' % tds[1] 235 - ) 205 + result = Result({ 206 + 'attribute': summary, 207 + 'value': tds[0] + ' ' + unit, 208 + 'source': 'NIST', 209 + 'reliability': 'Unknown', 210 + 'conditions': '%s K' % tds[1] 211 + }) 236 212 results.append(result) 237 213 return results 238 214 239 - def parse_antoine_data(self, table, summary): 240 - """ 241 - This function parses the table containing parameters for the Antione 242 - equation 243 - :param table: a Selector object of the table to be parsed 244 - :param summary: the name of the property 245 - :return: a list of Result items 246 - """ 215 + @staticmethod 216 + def parse_antoine_data(table, summary): 217 + """Parse table containing parameters for the Antione equation""" 247 218 results = [] 248 219 249 220 for tr in table.xpath('tr[td]'): 250 221 tds = tr.xpath('td/text()').extract() 251 - result = self.newresult( 252 - attribute=summary, 253 - value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), 254 - conditions='%s K' % tds[0] 255 - ) 222 + result = Result({ 223 + 'attribute': summary, 224 + 'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), 225 + 'source': 'NIST', 226 + 'reliability': 'Unknown', 227 + 'conditions': '%s K' % tds[0] 228 + }) 256 229 results.append(result) 257 230 258 231 return results 259 232 260 - def parse_individual_datapoints(self, response): 261 - """ 262 - This function parses the 'individual data points' page linked from 263 - the aggregate data table(s) 264 - :param response: the Scrapy Response object to be parsed 265 - :return: a list of Result items 266 - """ 233 + @staticmethod 234 + def parse_individual_datapoints(response): 235 + """Parses the page linked from aggregate data""" 267 236 sel = Selector(response) 268 237 table = sel.xpath('//table[@class="data"]')[0] 269 238 ··· 276 245 name = m.group(1) 277 246 condition = m.group(2) 278 247 279 - unit = self.get_unit(table) 248 + tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) 249 + m = re.search(r'\((.*)\)', tr_unit) 250 + unit = '!' 251 + if m: 252 + unit = m.group(1) 280 253 281 254 for tr in table.xpath('tr[td]'): 282 255 tds = tr.xpath('td/text()').extract() ··· 285 258 if m: 286 259 uncertainty = '+- %s ' % m.group(1) 287 260 # [TODO]: get the plusminus sign working in here 288 - result = self.newresult( 289 - attribute=name, 290 - value='%s %s%s' % (tds[0], uncertainty, unit), 291 - conditions=condition 292 - ) 261 + result = Result({ 262 + 'attribute': name, 263 + 'value': '%s %s%s' % (tds[0], uncertainty, unit), 264 + 'source': 'NIST', 265 + 'reliability': 'Unknown', 266 + 'conditions': condition 267 + }) 293 268 results.append(result) 294 269 295 270 return results 296 271 297 - @staticmethod 298 - def get_unit(table): 299 - tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) 300 - m = re.search(r'\((.*)\)', tr_unit) 301 - unit = '!' 
302 - if m: 303 - unit = m.group(1) 304 - 305 - return unit 306 - 307 - def newresult(self, attribute, value, conditions=''): 308 - """ 309 - This function abstracts from the Result item and provides default 310 - values 311 - :param attribute: the name of the attribute 312 - :param value: the value of the attribute 313 - :param conditions: optional conditions regarding the value 314 - :return: A Result item 315 - """ 316 - return Result( 317 - { 318 - 'attribute': attribute, 319 - 'value': value, 320 - 'source': 'NIST', 321 - 'reliability': self.cfg['reliability'], 322 - 'conditions': conditions 323 - }) 324 - 325 272 def new_compound_request(self, compound): 326 - """ 327 - This function is called when a new synonym is returned to the spider 328 - to generate new requests 329 - :param compound: the name of the compound to search for 330 - """ 331 273 if compound not in self.ignore_list: 332 274 self.ignore_list.update(compound) 333 - return Request(url=self.website[:-2].replace("\\", "") + self.search % compound, 275 + return Request(url=self.website[:-1] + self.search % compound, 334 276 callback=self.parse)
-149
FourmiCrawler/sources/PubChem.py
··· 1 - import re 2 - 3 - from scrapy.http import Request 4 - from scrapy import log 5 - from scrapy.selector import Selector 6 - 7 - from source import Source 8 - from FourmiCrawler.items import Result 9 - 10 - 11 - class PubChem(Source): 12 - """ PubChem scraper for chemical properties 13 - 14 - This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance, 15 - including sources of the values of properties. 16 - """ 17 - 18 - # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 19 - website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' 20 - website_www = 'http://www.ncbi.nlm.nih.gov/*' 21 - website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' 22 - search = 'pccompound?term=%s' 23 - data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' 24 - 25 - __spider = None 26 - searched_compounds = set() 27 - 28 - def __init__(self, config): 29 - Source.__init__(self, config) 30 - self.cfg = config 31 - 32 - def parse(self, response): 33 - """ 34 - Distributes the above described behaviour 35 - :param response: The incoming search request 36 - :return Returns the found properties if response is unique or returns none if it's already known 37 - """ 38 - requests = [] 39 - log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) 40 - 41 - sel = Selector(response) 42 - compound = sel.xpath('//h1/text()').extract()[0] 43 - if compound in self.searched_compounds: 44 - return None 45 - 46 - self.searched_compounds.update(compound) 47 - raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] 48 - for synonym in raw_synonyms.strip().split(', '): 49 - log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) 50 - self.searched_compounds.update(synonym) 51 - self._spider.get_synonym_requests(synonym) 52 - log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) 53 - 54 - n = re.search(r'cid=(\d+)', response.url) 55 - if n: 56 - cid = n.group(1) 57 - log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach 58 - # the seperate html page which contains the properties and their values 59 - 60 - # using this cid to get the right url and scrape it 61 - requests.append( 62 - Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data)) 63 - return requests 64 - 65 - def parse_data(self, response): 66 - """ 67 - Parse data found in 'Chemical and Physical properties' part of a substance page. 68 - :param response: The response with the page to parse 69 - :return: requests: Returns a list of properties with their values, source, etc. 
70 - """ 71 - log.msg('parsing data', level=log.DEBUG) 72 - requests = [] 73 - 74 - sel = Selector(response) 75 - props = sel.xpath('//div') 76 - 77 - for prop in props: 78 - prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing 79 - if prop.xpath('a'): # parsing for single value in property 80 - prop_source = ''.join(prop.xpath('a/@title').extract()) 81 - prop_value = ''.join(prop.xpath('a/text()').extract()) 82 - new_prop = Result({ 83 - 'attribute': prop_name, 84 - 'value': prop_value, 85 - 'source': prop_source, 86 - 'reliability': self.cfg['reliability'], 87 - 'conditions': '' 88 - }) 89 - log.msg('PubChem prop: |%s| |%s| |%s|' % 90 - (new_prop['attribute'], new_prop['value'], 91 - new_prop['source']), level=log.DEBUG) 92 - requests.append(new_prop) 93 - elif prop.xpath('ul'): # parsing for multiple values (list) in property 94 - prop_values = prop.xpath('ul//li') 95 - for prop_li in prop_values: 96 - prop_value = ''.join(prop_li.xpath('a/text()').extract()) 97 - prop_source = ''.join(prop_li.xpath('a/@title').extract()) 98 - new_prop = Result({ 99 - 'attribute': prop_name, 100 - 'value': prop_value, 101 - 'source': prop_source, 102 - 'reliability': self.cfg['reliability'], 103 - 'conditions': '' 104 - }) 105 - log.msg('PubChem prop: |%s| |%s| |%s|' % 106 - (new_prop['attribute'], new_prop['value'], 107 - new_prop['source']), level=log.DEBUG) 108 - requests.append(new_prop) 109 - 110 - return requests 111 - 112 - def parse_searchrequest(self, response): 113 - """ 114 - This function parses the response to the new_compound_request Request 115 - :param response: the Response object to be parsed 116 - :return: A Request for the compound page or what self.parse returns in 117 - case the search request forwarded to the compound page 118 - """ 119 - 120 - # check if pubchem forwarded straight to compound page 121 - m = re.match(self.website_pubchem, response.url) 122 - if m: 123 - log.msg('PubChem search forwarded to compound page', 124 - level=log.DEBUG) 125 - return self.parse(response) 126 - 127 - sel = Selector(response) 128 - 129 - results = sel.xpath('//div[@class="rsltcont"]') 130 - if results: 131 - url = results[0].xpath('div/p/a[1]/@href') 132 - else: 133 - log.msg('PubChem search found nothing or xpath failed', 134 - level=log.DEBUG) 135 - return None 136 - 137 - if url: 138 - url = 'http:' + ''.join(url[0].extract()) 139 - log.msg('PubChem compound page: %s' % url, level=log.DEBUG) 140 - else: 141 - log.msg('PubChem search found results, but no url in first result', 142 - level=log.DEBUG) 143 - return None 144 - 145 - return Request(url=url, callback=self.parse) 146 - 147 - def new_compound_request(self, compound): 148 - return Request(url=self.website_www[:-1] + self.search % compound, 149 - callback=self.parse_searchrequest)
+43 -93
FourmiCrawler/sources/WikipediaParser.py
··· 12 12 """ Wikipedia scraper for chemical properties 13 13 14 14 This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values. 15 - It also returns requests with other external sources which contain information on parsed subject. 15 + It also returns requests with other external sources which contain information on parsed subject. 16 16 """ 17 17 18 - website = "http://en\\.wikipedia\\.org/wiki/.*" 18 + website = "http://en.wikipedia.org/wiki/*" 19 19 __spider = None 20 20 searched_compounds = [] 21 21 22 - def __init__(self, config=None): 23 - Source.__init__(self, config) 22 + def __init__(self): 23 + Source.__init__(self) 24 24 25 25 def parse(self, response): 26 - """ 27 - Distributes the above described behaviour 28 - :param response: The incoming search request 29 - :return: Returns the found properties if response is unique or returns none if it's already known 30 - """ 26 + """ Distributes the above described behaviour """ 31 27 log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) 32 28 sel = Selector(response) 33 29 compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page ··· 39 35 return items 40 36 41 37 def parse_infobox(self, sel): 42 - """ 43 - Scrape data from infobox on wikipedia. 44 - 45 - Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and 46 - :param sel: The selector with the html-information of the page to parse 47 - :return: item_list: Returns a list of properties with their values, source, etc.. 48 - """ 49 - 38 + """ scrape data from infobox on wikipedia. """ 50 39 items = [] 51 40 52 - # scrape the chembox (wikipedia template) 53 - items = self.parse_chembox(sel, items) 41 + # be sure to get chembox (wikipedia template) 42 + tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). 
\ 43 + xpath('normalize-space(string())') 44 + prop_names = tr_list[::2] 45 + prop_values = tr_list[1::2] 46 + for i, prop_name in enumerate(prop_names): 47 + item = Result({ 48 + 'attribute': prop_name.extract().encode('utf-8'), 49 + 'value': prop_values[i].extract().encode('utf-8'), 50 + 'source': "Wikipedia", 51 + 'reliability': "Unknown", 52 + 'conditions': "" 53 + }) 54 + items.append(item) 55 + log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 54 56 55 - # scrape the drugbox (wikipedia template) 56 - items = self.parse_drugbox(sel, items) 57 + #scrape the drugbox (wikipedia template) 58 + tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 59 + log.msg('dit: %s' % tr_list2, level=log.DEBUG) 60 + for tablerow in tr_list2: 61 + log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 62 + if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 63 + 'normalize-space(string())'): 64 + item = Result({ 65 + 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 66 + 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 67 + 'source': "Wikipedia", 68 + 'reliability': "Unknown", 69 + 'conditions': "" 70 + }) 71 + items.append(item) 72 + log.msg( 73 + 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 74 + level=log.DEBUG) 57 75 58 76 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value 59 77 item_list = self.clean_items(items) ··· 77 95 78 96 return item_list 79 97 80 - def parse_chembox(self, sel, items): 81 - """ 82 - Scrape data from chembox infobox on wikipedia. 83 - 84 - :param sel: The selector with the html-information of the page to parse 85 - :param items: the list of items where the result have to be stored in 86 - :return: items: the list of items with the new found and stored items 87 - """ 88 - tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ 89 - xpath('normalize-space(string())') 90 - prop_names = tr_list[::2] 91 - prop_values = tr_list[1::2] 92 - for i, prop_name in enumerate(prop_names): 93 - item = self.newresult( 94 - attribute=prop_name.extract().encode('utf-8'), 95 - value=prop_values[i].extract().encode('utf-8') 96 - ) 97 - items.append(item) 98 - log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 99 - return items 100 - 101 - def parse_drugbox(self, sel, items): 102 - """ 103 - Scrape data from drugbox infobox on wikipedia. 
104 - 105 - :param sel: The selector with the html-information of the page to parse 106 - :param items: the list of items where the result have to be stored in 107 - :return: items: the list of items with the new found and stored items 108 - """ 109 - tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 110 - log.msg('dit: %s' % tr_list2, level=log.DEBUG) 111 - for tablerow in tr_list2: 112 - log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 113 - if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 114 - 'normalize-space(string())'): 115 - item = self.newresult( 116 - attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 117 - value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 118 - ) 119 - items.append(item) 120 - log.msg( 121 - 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 122 - level=log.DEBUG) 123 - return items 124 - 125 98 def new_compound_request(self, compound): 126 - return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) 99 + return Request(url=self.website[:-1] + compound, callback=self.parse) 127 100 128 101 @staticmethod 129 102 def clean_items(items): 130 - 131 - """ 132 - Clean up properties using regex, makes it possible to split the values from the units 133 - 134 - Almost not in use, only cleans J/K/mol values and boiling/melting points. 135 - 136 - :param items: List of properties with their values, source, etc.. 137 - :return: items: List of now cleaned up items 138 - """ 103 + """ clean up properties using regex, makes it possible to split the values from the units """ 139 104 for item in items: 140 105 value = item['value'] 141 106 m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) ··· 148 113 149 114 @staticmethod 150 115 def get_identifiers(sel): 151 - """ 152 - Find external links, named 'Identifiers' to different sources. 153 - 154 - :param sel: The selector with the html-information of the page to parse 155 - :return: links: New links which can be used to expand the crawlers search 156 - """ 116 + """ find external links, named 'Identifiers' to different sources. """ 157 117 links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' 158 118 '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() 159 - return links 160 - 161 - def newresult(self, attribute, value): 162 - return Result( 163 - { 164 - 'attribute': attribute, 165 - 'value': value, 166 - 'source': 'Wikipedia', 167 - 'reliability': self.cfg['reliability'], 168 - 'conditions': '' 169 - }) 119 + return links
+3 -6
FourmiCrawler/sources/source.py
···


 class Source:
-    website = "http://something/.*"  # Regex of URI's the source is able to parse
+    website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None

-    def __init__(self, config=None):
+    def __init__(self):
         """
         Initiation of a new Source
         """
-        self.cfg = {}
-        if config is not None:
-            self.cfg = config
         pass

     def parse(self, response):
···
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass

     def set_spider(self, spider):
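Source is the base class every scraper plugin extends: the spider matches `response.url` against each source's `website` regex and hands the response to that source's `parse`, while `new_compound_request` turns a compound name into the initial Scrapy Request. A minimal sketch of a new plugin against this interface, assuming the config-aware variant on the removal side of the diff; `ExampleSource`, its URL, and its endpoint are hypothetical:

    from scrapy.http import Request
    from scrapy.selector import Selector

    from source import Source
    from FourmiCrawler.items import Result


    class ExampleSource(Source):
        """Hypothetical scraper plugin, sketched against the Source interface above."""
        website = 'http://example\\.org/.*'  # regex the spider uses to route responses here
        search = 'lookup?name=%s'            # hypothetical search endpoint

        def __init__(self, config=None):
            Source.__init__(self, config)

        def parse(self, response):
            # Turn the page into Result items; the field names follow the other plugins.
            sel = Selector(response)
            title = sel.xpath('//h1/text()').extract()
            if not title:
                return []
            return [Result({
                'attribute': 'name',
                'value': title[0],
                'source': 'Example',
                'reliability': self.cfg.get('reliability', 'Unknown'),
                'conditions': ''
            })]

        def new_compound_request(self, compound):
            # Strip the regex tail from `website` to build a concrete URL, as the other plugins do.
            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                           callback=self.parse)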
+8 -12
FourmiCrawler/spider.py
···
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
+    _sources = []
+    synonyms = set()

-    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
-        self._sources = []
-        self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        if selected_attributes is None:
-            self.selected_attributes = [".*"]
-        else:
-            self.selected_attributes = selected_attributes
+        self.selected_attributes = selected_attributes

     def parse(self, response):
         """
···
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
-        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None

-    def get_synonym_requests(self, compound, force=False):
+    def get_synonym_requests(self, compound):
         """
         A function that generates new Scrapy Request for each source given a new synonym of a compound.
         :param compound: A compound name
         :return: A list of Scrapy Request objects
         """
         requests = []
-        if force or compound not in self.synonyms:
+        if compound not in self.synonyms:
             self.synonyms.add(compound)
             for parser in self._sources:
                 parser_requests = parser.new_compound_request(compound)
···
         """
         requests = []
         for synonym in self.synonyms:
-            requests.extend(self.get_synonym_requests(synonym, force=True))
+            requests.extend(self.get_synonym_requests(synonym))
         return requests

     def add_sources(self, sources):
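The spider itself never knows about concrete scrapers; `setup_crawler` in fourmi.py (further down) instantiates `FourmiSpider`, registers sources with `add_sources`, and lets Scrapy ask for the start requests. A compressed sketch of that wiring; the `WikipediaParser` class name is assumed from the file name, and `start_requests` is the assumed Scrapy entry point rather than something shown in this diff:

    from FourmiCrawler.spider import FourmiSpider
    from FourmiCrawler.sources.WikipediaParser import WikipediaParser

    # Hypothetical wiring, mirroring what setup_crawler() in fourmi.py does.
    spider = FourmiSpider(compound="benzene", selected_attributes=[".*"])
    spider.add_sources([WikipediaParser()])  # normally the SourceLoader supplies these
    requests = spider.start_requests()       # assumed: one Request per source for every known synonym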
-1
GUI/__init__.py
···
-import gui
-30
GUI/configImporter.py
···
-import ConfigParser
-
-
-class ConfigImporter():
-    def __init__(self, filename):
-        """Read the filename into the parser."""
-        self.filename = filename
-        self.parser = ConfigParser.ConfigParser()
-        self.parser.read(self.filename)
-
-    def load_common_attributes(self):
-        """Loads common attributes from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'CommonParameters')
-        except:
-            return 'One, Two, Three'
-
-    def load_output_types(self):
-        """Loads output types from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'OutputTypes')
-        except:
-            return 'csv'
-
-    def load_always_attributes(self):
-        """Loads attributes that are always searched for from the initialized file."""
-        try:
-            return self.parser.get('GUI', 'AlwaysParameters')
-        except:
-            return 'Name, Weight'
-196
GUI/gui.py
··· 1 - from Tkinter import * 2 - import os 3 - import shutil 4 - from tkFileDialog import asksaveasfilename 5 - 6 - from configImporter import * 7 - 8 - 9 - class GUI(): 10 - def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True): 11 - """Boots the window, configuration.""" 12 - if not in_source: 13 - current_dir = os.path.dirname(os.path.abspath(__file__)) 14 - config_file = current_dir + '../' + config_file 15 - if not os.path.isfile(config_file): 16 - try: 17 - shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file) 18 - except IOError: 19 - print "GUI configuration couldn't be found and couldn't be created." 20 - sys.exit() 21 - self.configurator = ConfigImporter(config_file) 22 - self.sourceloader = sourceloader 23 - self.finish_with_search = False 24 - self.values = {} 25 - self.required_variables = ['substance'] 26 - self.search = search 27 - self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types()) 28 - 29 - def load_common_attributes(self): 30 - """Calls the configuration parser for common attributes.""" 31 - return [x.strip() for x in self.configurator.load_common_attributes().split(',')] 32 - 33 - def load_output_types(self): 34 - """Calls the configuration parser for output types.""" 35 - return [x.strip() for x in self.configurator.load_output_types().split(',')] 36 - 37 - def load_always_attributes(self): 38 - """Calls the configuration parser for attributes that are always used.""" 39 - return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')]) 40 - 41 - def set_output(self): 42 - self.variable_output_name.set(asksaveasfilename()) 43 - self.button_output_name.config(text=self.variable_output_name.get()) 44 - 45 - def generate_window(self, common_attributes, output_types): 46 - """Creates all widgets and variables in the window.""" 47 - window = Tk() 48 - window.wm_title("Fourmi Crawler") 49 - 50 - variables = {} 51 - 52 - variable_substance = StringVar(window) 53 - frame_substance = Frame(window) 54 - label_substance = Label(frame_substance, text="Substance: ") 55 - input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance) 56 - variables.update({"substance": variable_substance}) 57 - frame_substance.pack(side=TOP) 58 - label_substance.pack() 59 - input_substance.pack() 60 - input_substance.focus() 61 - 62 - frame_all_attributes = Frame(window) 63 - frame_selecting_attributes = Frame(frame_all_attributes) 64 - frame_new_attributes = Frame(frame_selecting_attributes) 65 - label_new_attributes = Label(frame_new_attributes, text="Parameters: ") 66 - input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5) 67 - variables.update({"new_attributes": input_new_attributes}) 68 - frame_new_attributes.pack(side=LEFT) 69 - label_new_attributes.pack() 70 - input_new_attributes.pack() 71 - 72 - frame_common_attributes = Frame(frame_selecting_attributes) 73 - label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ") 74 - input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7) 75 - scrollbar_common_attributes = Scrollbar(frame_common_attributes) 76 - input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set) 77 - scrollbar_common_attributes.config(command=input_common_attributes.yview) 78 - if common_attributes and len(common_attributes) > 0: 79 - 
input_common_attributes.insert(END, *common_attributes) 80 - variables.update({"common_attributes": input_common_attributes}) 81 - frame_common_attributes.pack(side=RIGHT) 82 - label_common_attributes.pack(side=TOP) 83 - input_common_attributes.pack(side=LEFT) 84 - scrollbar_common_attributes.pack(side=RIGHT, fill=Y) 85 - frame_selecting_attributes.pack() 86 - 87 - frame_last = Frame(window) 88 - search_button = Button(frame_last, text="Start search", command=self.prepare_search) 89 - cancel_button = Button(frame_last, text="Cancel", command=window.destroy) 90 - frame_last.pack(side=BOTTOM) 91 - search_button.pack(side=LEFT) 92 - cancel_button.pack(side=RIGHT) 93 - 94 - frame_name = Frame(window) 95 - frame_output_name = Frame(frame_name) 96 - label_output_name = Label(frame_output_name, text='Output file:') 97 - self.variable_output_name = StringVar() 98 - self.variable_output_name.set('results.csv') 99 - variables.update({'output_name':self.variable_output_name}) 100 - self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file") 101 - frame_output_name.pack(side=LEFT) 102 - label_output_name.pack() 103 - self.button_output_name.pack() 104 - frame_name.pack(side=BOTTOM) 105 - 106 - 107 - frame_checkboxes = Frame(window) 108 - frame_checkbox_attributes = Frame(frame_checkboxes) 109 - variable_all_attributes = BooleanVar() 110 - variable_all_attributes.set(True) 111 - input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters", 112 - variable=variable_all_attributes) 113 - variables.update({"all_attributes": variable_all_attributes}) 114 - frame_checkbox_attributes.pack(side=LEFT) 115 - input_all_attributes.pack() 116 - 117 - frame_logging = Frame(frame_checkboxes) 118 - variable_logging = BooleanVar() 119 - variable_logging.set(False) 120 - input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging) 121 - variables.update({'logging':variable_logging}) 122 - frame_logging.pack(side=RIGHT) 123 - frame_checkboxes.pack(side=BOTTOM) 124 - input_logging.pack() 125 - frame_all_attributes.pack() 126 - 127 - return window, variables 128 - 129 - def prepare_search(self): 130 - """Saves the values from the window for later retrieval.""" 131 - variables = self.variables 132 - values = {} 133 - 134 - values.update({"Always attributes": self.load_always_attributes()}) 135 - for name, var in variables.iteritems(): 136 - if var.__class__ is StringVar: 137 - values.update({name: var.get()}) 138 - elif var.__class__ is BooleanVar: 139 - values.update({name: var.get()}) 140 - elif var.__class__ is Text: 141 - values.update({name: str(var.get("1.0", END)).strip()}) 142 - elif var.__class__ is Listbox: 143 - values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])}) 144 - else: 145 - print "No known class, {}, {}".format(name, var) 146 - 147 - values.update({'output_name':self.variable_output_name.get()}) 148 - values.update({'output_type':self.check_output_type(values.get('output_name'))}) 149 - 150 - self.values = values 151 - if all([values.get(i) != '' for i in self.required_variables]): 152 - self.finish_with_search = True 153 - self.window.destroy() 154 - else: 155 - self.finish_with_search = False 156 - #tkMessageBox.showinfo('Not all required information was entered!') 157 - 158 - def execute_search(self): 159 - """Calls the Fourmi crawler with the values from the GUI""" 160 - if self.values.get('all_attributes'): 161 - attributes = ".*" 162 - else: 163 - attribute_types = 
['attributes', 'Common attributes', 'Always attributes'] 164 - attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types]) 165 - output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths 166 - 167 - arguments = {'--attributes': attributes, 168 - '--exclude': None, 169 - '--format': self.values.get('output_type'), 170 - '--help': False, 171 - '--include': None, 172 - '--log': 'log.txt', 173 - '--output': output_file, 174 - '-v': 0 if self.values.get('logging') else 3, 175 - '--version': False, 176 - '<compound>': self.values.get('substance'), 177 - 'list': False, 178 - 'search': True} 179 - 180 - self.search(arguments, self.sourceloader) 181 - 182 - def run(self): 183 - """Starts the window and the search.""" 184 - self.window.mainloop() 185 - if self.finish_with_search: 186 - self.execute_search() 187 - 188 - def check_output_type(self, filename): 189 - parts = str(filename).split('.') 190 - output_types = self.load_output_types() 191 - extension = parts[-1] 192 - 193 - for type in output_types: 194 - if extension==type: 195 - return extension 196 - return output_types[0]
-10
GUI.cfg.sample
···
-[GUI]
-# Personalize options in your User Interface
-
-# Commonly used parameters are listed in the GUI for easy selection
-CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
-
-# Parameters that are always used in the search
-AlwaysParameters = Name
-
-OutputTypes = csv, json, jsonlines, xml
+12 -7
README.md
···
 # Fourmi

-**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
+**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)

-**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
+**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)

 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
···

 ### Installing

-If you're installing Fourmi, please take a look at our installation guides
-on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
-usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
+If you're installing Fourmi, please take a look at our [installation guide](...)
+on our wiki. When you've installed the application, make sure to check our
+[usage guide](...).

 ### Using the Source

 To use the Fourmi source code multiple dependencies are required. Take a look at
-our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step
+the [wiki page](...) on using the application source code for a step by step
 installation guide.

 When developing for the Fourmi project keep in mind that code readability is a
 must. To maintain the readability, code should be conform with the
 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
 code. More information about the different structures and principles of the
-Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
+Fourmi application can be found on our [wiki](...).

 ### To Do
···

 __Main goals:__

+- Improve our documentation and guides. (Assignee: Dekker)
 - Build an graphical user interface(GUI) as alternative for the command line
 interface(CLI). (Assignee: Harmen)
 - Compiling the source into an windows executable. (Assignee: Bas)
+- Create an configuration file to hold logins and API keys.
+- Determine reliability of our data point.
+- Create an module to gather data from NIST. (Assignee: Rob)
+- Create an module to gather data from PubChem. (Assignee: Nout)

 __Side goals:__
-108
SIGNED.md
··· 1 - ##### Signed by https://keybase.io/jdekker 2 - ``` 3 - -----BEGIN PGP SIGNATURE----- 4 - Version: GnuPG v1.4.11 (GNU/Linux) 5 - 6 - iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa 7 - flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt 8 - ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+ 9 - +mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U 10 - IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2 11 - Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb 12 - RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq 13 - WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ 14 - vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH 15 - V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK 16 - TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD 17 - jUY6EwEuOzDCexWP4eM8 18 - =h6TK 19 - -----END PGP SIGNATURE----- 20 - 21 - ``` 22 - 23 - <!-- END SIGNATURES --> 24 - 25 - ### Begin signed statement 26 - 27 - #### Expect 28 - 29 - ``` 30 - size exec file contents 31 - ./ 32 - 412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614 33 - 548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546 34 - 846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed 35 - FourmiCrawler/ 36 - 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 37 - 304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 38 - 2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 39 - 677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94 40 - sources/ 41 - 12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce 42 - 12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee 43 - 6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481 44 - 6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870 45 - 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 46 - 1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc 47 - 3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3 48 - GUI/ 49 - 11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3 50 - 940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271 51 - 8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5 52 - 299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483 53 - 1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c 54 - 3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac 55 - 3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a 56 - 261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 57 - 416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70 58 - tests/ 59 - 107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670 60 - 2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d 61 - 1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8 62 - 1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 63 - 1260 
test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 64 - 2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 65 - utils/ 66 - 40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a 67 - 4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5 68 - 2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 69 - ``` 70 - 71 - #### Ignore 72 - 73 - ``` 74 - /SIGNED.md 75 - ``` 76 - 77 - #### Presets 78 - 79 - ``` 80 - git # ignore .git and anything as described by .gitignore files 81 - dropbox # ignore .dropbox-cache and other Dropbox-related files 82 - kb # ignore anything as described by .kbignore files 83 - ``` 84 - 85 - <!-- summarize version = 0.0.9 --> 86 - 87 - ### End signed statement 88 - 89 - <hr> 90 - 91 - #### Notes 92 - 93 - With keybase you can sign any directory's contents, whether it's a git repo, 94 - source code distribution, or a personal documents folder. It aims to replace the drudgery of: 95 - 96 - 1. comparing a zipped file to a detached statement 97 - 2. downloading a public key 98 - 3. confirming it is in fact the author's by reviewing public statements they've made, using it 99 - 100 - All in one simple command: 101 - 102 - ```bash 103 - keybase dir verify 104 - ``` 105 - 106 - There are lots of options, including assertions for automating your checks. 107 - 108 - For more info, check out https://keybase.io/docs/command_line/code_signing
+51 -23
fourmi.py
··· 1 - #!/usr/bin/env python 1 + # !/usr/bin/env python 2 2 """ 3 - Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms). 3 + Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms). 4 4 5 5 Usage: 6 - fourmi 7 6 fourmi search <compound> 8 7 fourmi [options] search <compound> 9 - fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 8 + fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 10 9 fourmi list 11 10 fourmi [--include=<sourcename> | --exclude=<sourcename>] list 12 11 fourmi -h | --help ··· 16 15 --attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*] 17 16 -h --help Show this screen. 18 17 --version Show version. 19 - -v Verbose logging output. (Multiple occurrences increase logging level) 18 + --verbose Verbose logging output. 20 19 --log=<file> Save log to an file. 21 - -o <file> --output=<file> Output file [default: <compound>.*format*] 22 - -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv] 20 + -o <file> --output=<file> Output file [default: result.*format*] 21 + -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] 23 22 --include=<regex> Include only sources that match these regular expressions split by a comma. 24 23 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma. 25 24 """ 26 25 27 26 from twisted.internet import reactor 28 27 from scrapy.crawler import Crawler 29 - from scrapy import signals, log 28 + from scrapy import log, signals 29 + from scrapy.utils.project import get_project_settings 30 30 import docopt 31 31 32 32 from FourmiCrawler.spider import FourmiSpider 33 - from utils.configurator import Configurator 34 - from utils.sourceloader import SourceLoader 35 - from GUI import gui 33 + from sourceloader import SourceLoader 36 34 37 35 38 36 def setup_crawler(compound, settings, source_loader, attributes): ··· 52 50 crawler.start() 53 51 54 52 53 + def scrapy_settings_manipulation(docopt_arguments): 54 + """ 55 + This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi 56 + project these are command line arguments. 57 + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 58 + """ 59 + settings = get_project_settings() 60 + 61 + if docopt_arguments["--output"] != 'result.*format*': 62 + settings.overrides["FEED_URI"] = docopt_arguments["--output"] 63 + elif docopt_arguments["--format"] == "jsonlines": 64 + settings.overrides["FEED_URI"] = "results.json" 65 + elif docopt_arguments["--format"] is not None: 66 + settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"] 67 + 68 + if docopt_arguments["--format"] is not None: 69 + settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"] 70 + 71 + return settings 72 + 73 + 74 + def start_log(docopt_arguments): 75 + """ 76 + This function starts the logging functionality of Scrapy using the settings given by the CLI. 77 + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 
78 + """ 79 + if docopt_arguments["--log"] is not None: 80 + if docopt_arguments["--verbose"]: 81 + log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) 82 + else: 83 + log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING) 84 + else: 85 + if docopt_arguments["--verbose"]: 86 + log.start(logstdout=False, loglevel=log.DEBUG) 87 + else: 88 + log.start(logstdout=True, loglevel=log.WARNING) 89 + 90 + 55 91 def search(docopt_arguments, source_loader): 56 92 """ 57 93 The function that facilitates the search for a specific compound. 58 94 :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 59 95 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 60 96 """ 61 - conf = Configurator() 62 - conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) 63 - conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"]) 64 - setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, 65 - source_loader, docopt_arguments["--attributes"].split(',')) 66 - if conf.scrapy_settings.getbool("LOG_ENABLED"): 67 - log.start(conf.scrapy_settings.get("LOG_FILE"), 68 - conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 97 + start_log(docopt_arguments) 98 + settings = scrapy_settings_manipulation(docopt_arguments) 99 + setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(',')) 69 100 reactor.run() 70 101 71 102 72 103 # The start for the Fourmi Command Line interface. 73 104 if __name__ == '__main__': 74 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0') 105 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1') 75 106 loader = SourceLoader() 76 107 77 108 if arguments["--include"]: ··· 84 115 elif arguments["list"]: 85 116 print "-== Available Sources ==-" 86 117 print str(loader) 87 - else: 88 - gui_window = gui.GUI(search, sourceloader=SourceLoader()) 89 - gui_window.run()
+18
setup.py
··· 1 + import sys 2 + from cx_Freeze import setup, Executable 3 + 4 + # After running the setup file (python setup.py build) the scrapy/VERSION file has to be manually put into the 5 + # library.zip, also the FourmiCrawler map has to be copied to both the library and the exe.win32-2.7 folder. after 6 + # putting the files in the library the library has to be zipped and replace the old library. 7 + # Dependencies are automatically detected, but it might need fine tuning. 8 + build_exe_options = {"packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"], "excludes": []} 9 + 10 + # GUI applications require a different base on Windows (the default is for a 11 + # console application). 12 + base = None 13 + 14 + setup( name = "Scrapy", 15 + version = "0.1", 16 + description = "My GUI application!", 17 + options = {"build_exe": build_exe_options}, 18 + executables = [Executable("fourmi.py", base=base)])
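The comment block in setup.py documents two manual post-build steps (copying scrapy/VERSION and the FourmiCrawler package into the build output). As a hedged sketch only, and assuming cx_Freeze's `include_files` option plus the local `FourmiCrawler` package name, those copies might be folded into the build options; this is untested and separate from the diff above.

```python
# Hedged sketch (not part of the diff): cx_Freeze's "include_files" option copies
# extra files next to the frozen executable, which may remove the manual copy steps
# described above. Paths and the extra "FourmiCrawler" package entry are assumptions.
import os
import scrapy
from cx_Freeze import setup, Executable

scrapy_version_file = os.path.join(os.path.dirname(scrapy.__file__), "VERSION")

build_exe_options = {
    "packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources",
                 "zope.interface", "twisted.internet", "FourmiCrawler"],
    "include_files": [(scrapy_version_file, "scrapy/VERSION"),
                      ("FourmiCrawler", "FourmiCrawler")],
}

setup(name="Fourmi",
      version="0.1",
      description="Fourmi compound scraper",
      options={"build_exe": build_exe_options},
      executables=[Executable("fourmi.py", base=None)])
```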
+60
sourceloader.py
··· 1 + import inspect 2 + import sys 3 + import os 4 + import re 5 + 6 + from FourmiCrawler.sources.source import Source 7 + 8 + 9 + class SourceLoader: 10 + sources = [] 11 + 12 + def __init__(self, rel_dir="FourmiCrawler/sources"): 13 + 14 + if hasattr(sys,'frozen'): 15 + path = os.path.dirname(sys.executable) 16 + else: 17 + path = os.path.dirname(os.path.abspath(__file__)) 18 + 19 + path += "/" + rel_dir 20 + known_parser = set() 21 + 22 + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 23 + mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py]) 24 + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 25 + for cls in classes: 26 + if issubclass(cls, Source) and cls not in known_parser: 27 + self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? 28 + # known_parser.add(cls) 29 + 30 + def include(self, source_names): 31 + """ 32 + This function excludes all sources that don't match the given regular expressions. 33 + :param source_names: A list of regular expression (strings) 34 + """ 35 + new = set() 36 + for name in source_names: 37 + new.update([src for src in self.sources if re.match(name, src.__class__.__name__)]) 38 + self.sources = list(new) 39 + 40 + def exclude(self, source_names): 41 + """ 42 + This function excludes all sources that match the given regular expressions. 43 + :param source_names: A list of regular expression (strings) 44 + """ 45 + exclude = [] 46 + for name in source_names: 47 + exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)]) 48 + self.sources = [src for src in self.sources if src not in exclude] 49 + 50 + def __str__(self): 51 + """ 52 + This function returns a string with all sources currently available in the SourceLoader. 53 + :return: a string with all available sources. 54 + """ 55 + string = "" 56 + for src in self.sources: 57 + string += "Source: " + src.__class__.__name__ 58 + string += " - " 59 + string += "URI: " + src.website + "\n" 60 + return string
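The new top-level sourceloader.py is what both the `--include`/`--exclude` flags and the `list` command in fourmi.py operate on. A minimal usage sketch (not part of the diff; run from the repository root so that FourmiCrawler is importable):

```python
# Minimal sketch (not part of the diff) of how fourmi.py drives SourceLoader.
from sourceloader import SourceLoader

loader = SourceLoader()              # imports every Source subclass found in FourmiCrawler/sources
loader.include(["ChemSpider"])       # keep only sources whose class name matches these regexes
# loader.exclude(["Wikipedia.*"])    # ...or drop the ones that match instead
print str(loader)                    # e.g. "Source: ChemSpider - URI: http://www.chemspider.com/*"
```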
-19
sources.cfg.sample
··· 1 - [DEFAULT] 2 - reliability = Unknown 3 - 4 - #For each source listed in FourmiCrawler/sources there should be a section 5 - #named exactly as the filename in here. If not present, the DEFAULT value is 6 - #used for reliability of that source. 7 - 8 - [ChemSpider] 9 - reliability = High 10 - #token=Paste ChemSpider API token here and remove the hashtag 11 - 12 - [NIST] 13 - reliability = High 14 - 15 - [WikipediaParser] 16 - reliability = Medium 17 - 18 - [PubChem] 19 - reliability = High
-5
tests/__init__.py
··· 1 - import test_configurator 2 - import test_gui 3 - import test_pipeline 4 - import test_sourceloader 5 - import test_spider 6 1
-68
tests/test_configurator.py
··· 1 - import unittest 2 - import ConfigParser 3 - 4 - from utils.configurator import Configurator 5 - 6 - 7 - class TestConfigurator(unittest.TestCase): 8 - 9 - def setUp(self): 10 - self.conf = Configurator() 11 - 12 - def test_set_output(self): 13 - self.conf.set_output(filename="test.txt", fileformat="csv", compound="test") 14 - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") 15 - self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 16 - 17 - self.conf.set_output("<compound>.*format*", "jsonlines", "test") 18 - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json") 19 - self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") 20 - 21 - self.conf.set_output("<compound>.*format*", "csv", "test") 22 - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv") 23 - self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 24 - 25 - def test_start_log(self): 26 - for i in range(0, 3): 27 - self.conf.set_logging("TEST", i) 28 - self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") 29 - if i > 0: 30 - self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) 31 - if i > 1: 32 - self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) 33 - else: 34 - self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 35 - else: 36 - self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) 37 - self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 38 - if i == 1: 39 - self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") 40 - elif i == 2: 41 - self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") 42 - elif i == 3: 43 - self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") 44 - 45 - self.conf.set_logging(verbose=i) 46 - self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) 47 - 48 - def test_read_sourceconfiguration(self): 49 - config = self.conf.read_sourceconfiguration() 50 - self.assertIsInstance(config, ConfigParser.ConfigParser) 51 - 52 - def test_get_section(self): 53 - config = ConfigParser.ConfigParser() 54 - section = self.conf.get_section(config, 'test') 55 - self.assertIn('reliability', section) 56 - self.assertEquals(section['reliability'], '') 57 - 58 - config.set('DEFAULT', 'reliability', 'Low') 59 - 60 - section = self.conf.get_section(config, 'test') 61 - self.assertEquals(section['reliability'], 'Low') 62 - 63 - config.add_section('test') 64 - config.set('test', 'var', 'Maybe') 65 - 66 - section = self.conf.get_section(config, 'test') 67 - self.assertEquals(section['reliability'], 'Low') 68 - self.assertEqual(section['var'], 'Maybe')
-32
tests/test_gui.py
··· 1 - import unittest 2 - 3 - from GUI import gui 4 - 5 - class TestGUI(unittest.TestCase): 6 - def setUp(self): 7 - pass 8 - 9 - def test_empty_attributes(self): 10 - self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample", in_source=True) 11 - self.test_gui.window.after(9, self.test_gui.prepare_search) 12 - self.test_gui.window.after(11, self.test_gui.window.destroy) 13 - self.test_gui.run() 14 - 15 - output_type = self.test_gui.configurator.load_output_types().split(',')[0] 16 - 17 - self.assertEqual(self.test_gui.values.get('substance'), '') 18 - self.assertEqual(self.test_gui.values.get('output_type'), output_type) 19 - self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv') 20 - 21 - 22 - def test_no_configurations(self): 23 - self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample") 24 - self.test_gui.configurator = gui.ConfigImporter('') 25 - self.test_gui.finish_with_search = True 26 - self.test_gui.window.after(9, self.test_gui.prepare_search) 27 - self.test_gui.window.after(11, self.test_gui.window.destroy) 28 - self.test_gui.run() 29 - 30 - self.assertEqual(self.test_gui.values.get('substance'), '') 31 - self.assertEqual(self.test_gui.values.get('output_type'), 'csv') 32 - self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
-1
tests/test_pipeline.py
··· 13 13 def test_none_pipeline(self): 14 14 # Testing the pipeline that replaces the None values in items. 15 15 self.testItem["value"] = "abc" 16 - self.testItem["source"] = None 17 16 pipe = pipelines.RemoveNonePipeline() 18 17 processed = pipe.process_item(self.testItem, spider.FourmiSpider()) 19 18
+1 -1
tests/test_sourceloader.py
··· 1 1 import unittest 2 2 3 - from utils.sourceloader import SourceLoader 3 + from sourceloader import SourceLoader 4 4 5 5 6 6 class TestSourceloader(unittest.TestCase):
+5 -7
tests/test_spider.py
··· 3 3 from scrapy.http import Request 4 4 5 5 from FourmiCrawler import spider 6 - from FourmiCrawler.sources.NIST import NIST 6 + from FourmiCrawler.sources.ChemSpider import ChemSpider 7 7 from FourmiCrawler.sources.source import Source 8 8 9 9 ··· 41 41 self.spi.add_source(src) 42 42 self.assertEqual(self.spi.start_requests(), []) 43 43 44 - src2 = NIST() 44 + src2 = ChemSpider() 45 45 self.spi.add_source(src2) 46 - requests = self.spi.start_requests() 47 - self.assertGreater(len(requests), 0) 48 - self.assertIsInstance(requests[0], Request) 46 + self.assertIsNotNone(self.spi.start_requests()) 49 47 50 48 def test_synonym_requests(self): 51 49 # A test for the synonym request function ··· 56 54 self.assertEqual(self.spi.get_synonym_requests("new_compound"), []) 57 55 self.assertIn("new_compound", self.spi.synonyms) 58 56 59 - src2 = NIST() 57 + src2 = ChemSpider() 60 58 self.spi.add_source(src2) 61 59 self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request) 62 60 self.assertIn("other_compound", self.spi.synonyms) 63 - self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) 61 + self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
-2
utils/__init__.py
··· 1 - import configurator 2 - import sourceloader
-101
utils/configurator.py
··· 1 - import ConfigParser 2 - import os 3 - import shutil 4 - 5 - from scrapy.utils.project import get_project_settings 6 - 7 - 8 - class Configurator: 9 - """ 10 - A helper class in the fourmi class. This class is used to process the settings as set 11 - from one of the Fourmi applications. 12 - """ 13 - 14 - def __init__(self): 15 - self.scrapy_settings = get_project_settings() 16 - 17 - def set_output(self, filename, fileformat, compound): 18 - """ 19 - This function manipulates the Scrapy output file settings that normally would be set in the settings file. 20 - In the Fourmi project these are command line arguments. 21 - :param filename: The filename of the file where the output will be put. 22 - :param fileformat: The format in which the output will be. 23 - """ 24 - 25 - if filename != '<compound>.*format*': 26 - self.scrapy_settings.overrides["FEED_URI"] = filename 27 - elif fileformat == "jsonlines": 28 - self.scrapy_settings.overrides["FEED_URI"] = compound + ".json" 29 - elif fileformat is not None: 30 - self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat 31 - 32 - if fileformat is not None: 33 - self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat 34 - 35 - def set_logging(self, logfile=None, verbose=0): 36 - """ 37 - This function changes the default settings of Scapy's logging functionality 38 - using the settings given by the CLI. 39 - :param logfile: The location where the logfile will be saved. 40 - :param verbose: A integer value to switch between loglevels. 41 - """ 42 - if verbose != 0: 43 - self.scrapy_settings.overrides["LOG_ENABLED"] = True 44 - else: 45 - self.scrapy_settings.overrides["LOG_ENABLED"] = False 46 - 47 - if verbose == 1: 48 - self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" 49 - elif verbose == 2: 50 - self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" 51 - else: 52 - self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" 53 - 54 - if verbose > 1: 55 - self.scrapy_settings.overrides["LOG_STDOUT"] = False 56 - else: 57 - self.scrapy_settings.overrides["LOG_STDOUT"] = True 58 - 59 - if logfile is not None: 60 - self.scrapy_settings.overrides["LOG_FILE"] = logfile 61 - else: 62 - self.scrapy_settings.overrides["LOG_FILE"] = None 63 - 64 - @staticmethod 65 - def read_sourceconfiguration(): 66 - """ 67 - This function reads sources.cfg in the main folder for configuration 68 - variables for sources 69 - :return a ConfigParser object of sources.cfg 70 - """ 71 - current_dir = os.path.dirname(os.path.abspath(__file__)) 72 - config_path = current_dir + '/../sources.cfg' 73 - # [TODO]: location of sources.cfg should be softcoded eventually 74 - if not os.path.isfile(config_path): 75 - try: 76 - shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path) 77 - except IOError: 78 - print "WARNING: Source configuration couldn't be found and couldn't be created." 79 - config = ConfigParser.ConfigParser() 80 - config.read(config_path) 81 - return config 82 - 83 - @staticmethod 84 - def get_section(config, sourcename): 85 - """ 86 - This function reads a config section labeled in variable sourcename and 87 - tests whether the reliability variable is set else set to empty string. 
88 - Return the default section if the labeled config section does not exist 89 - :param config: a ConfigParser object 90 - :param sourcename: the name of the section to be read 91 - :return a dictionary of the section in the config labeled in sourcename 92 - """ 93 - section = dict() 94 - if config.has_section(sourcename): 95 - section = dict(config.items(sourcename)) 96 - elif config.defaults(): 97 - section = config.defaults() 98 - if 'reliability' not in section: 99 - print 'WARNING: Reliability not set for %s' % sourcename 100 - section['reliability'] = '' 101 - return section
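For reference, the lookup rules that the removed `Configurator.get_section()` implemented (named section first, then the `DEFAULT` section, then an empty `reliability` with a warning) boil down to standard `ConfigParser` behaviour. A minimal sketch with illustrative section names:

```python
# Minimal sketch (not part of the diff) of the fallback order get_section() relied on.
import ConfigParser

config = ConfigParser.ConfigParser()
config.set('DEFAULT', 'reliability', 'Unknown')   # fallback for sources without a section
config.add_section('ChemSpider')
config.set('ChemSpider', 'reliability', 'High')

print dict(config.items('ChemSpider'))   # {'reliability': 'High'}  (named section wins)
print config.defaults()                  # {'reliability': 'Unknown'}  (used for e.g. NIST)
# A source with neither its own section nor a DEFAULT reliability ended up with
# reliability = '' plus a printed warning.
```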
-64
utils/sourceloader.py
··· 1 - import inspect 2 - import os 3 - import re 4 - 5 - from FourmiCrawler.sources.source import Source 6 - from utils.configurator import Configurator 7 - 8 - 9 - class SourceLoader: 10 - sources = [] 11 - 12 - def __init__(self, rel_dir="../FourmiCrawler/sources"): 13 - """ 14 - The initiation of a SourceLoader, selects and indexes a directory for usable sources. 15 - Also loads a configuration file for Sources and passes the arguments in 16 - the named section to the source 17 - :param rel_dir: A relative path to a directory. 18 - """ 19 - path = os.path.dirname(os.path.abspath(__file__)) 20 - path += "/" + rel_dir 21 - known_parser = set() 22 - 23 - config = Configurator.read_sourceconfiguration() 24 - 25 - for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 26 - mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) 27 - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 28 - for cls in classes: 29 - if issubclass(cls, Source) and cls not in known_parser: 30 - sourcecfg = Configurator.get_section(config, cls.__name__) 31 - self.sources.append(cls(sourcecfg)) 32 - known_parser.add(cls) 33 - 34 - def include(self, source_names): 35 - """ 36 - This function excludes all sources that don't match the given regular expressions. 37 - :param source_names: A list of regular expression (strings) 38 - """ 39 - new = set() 40 - for name in source_names: 41 - new.update([src for src in self.sources if re.match(name, src.__class__.__name__)]) 42 - self.sources = list(new) 43 - 44 - def exclude(self, source_names): 45 - """ 46 - This function excludes all sources that match the given regular expressions. 47 - :param source_names: A list of regular expression (strings) 48 - """ 49 - exclude = [] 50 - for name in source_names: 51 - exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)]) 52 - self.sources = [src for src in self.sources if src not in exclude] 53 - 54 - def __str__(self): 55 - """ 56 - This function returns a string with all sources currently available in the SourceLoader. 57 - :return: a string with all available sources. 58 - """ 59 - string = "" 60 - for src in self.sources: 61 - string += "Source: " + src.__class__.__name__ 62 - string += " - " 63 - string += "URI: " + src.website + "\n" 64 - return string