A web scraper built to search for specific information on a given compound (and its synonyms)

+1710 -248
+5
.gitignore
··· 4 4 #Python Specific ignores 5 5 *.pyc 6 6 7 + #may contain authentication information 8 + sources.cfg 9 + #Another of our config files 10 + GUI.cfg 11 + 7 12 #THINGS WE WOULD NEVER EVER WANT! 8 13 #ignore thumbnails created by windows 9 14 Thumbs.db
+23
.travis.yml
··· 1 + # Config file for automatic testing at travis-ci.org 2 + 3 + language: python 4 + python: 2.7 5 + 6 + before_install: 7 + - "export DISPLAY=:99.0" 8 + - "sh -e /etc/init.d/xvfb start" 9 + 10 + # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 11 + install: 12 + - pip install Scrapy docopt 13 + - pip install coveralls 14 + 15 + # command to run tests, e.g. python setup.py test 16 + script: 17 + - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests 18 + 19 + notifications: 20 + slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM 21 + 22 + after_success: 23 + coveralls --verbose
+20
Changelog.md
··· 1 + ### v0.6.0 2 + - Feature: Added a Graphical User Interface 3 + - Feature: Automatic config file creation from config samples 4 + - FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI 5 + - FIX: A lot of bug fixes for the PubChem plugin, as it wasn't working as it should 6 + - FIX: Using absolute paths for configuration files 7 + - DEV: General code cleanup in the documentation 8 + 9 + ### v0.5.3 10 + - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options 11 + - FIX: Logging is now "actually" disabled if not using the verbose option. 12 + - FEATURE: Added support for PubChem 13 + 14 + ### v0.5.2 15 + - FIX: Signature used to contain untracked and older files, current signature 16 + should be correct. 17 + 18 + ### v0.5.1 19 + - UPDATED: Logging functionality from command line 20 + - DEV: Code cleanup and extra tests
+1 -3
FourmiCrawler/items.py
··· 1 - # Define here the models for your scraped items 2 - # 3 - # See documentation in: 1 + # For more information on item definitions, see the Scrapy documentation in: 4 2 # http://doc.scrapy.org/en/latest/topics/items.html 5 3 6 4 from scrapy.item import Item, Field
+26 -9
FourmiCrawler/pipelines.py
··· 1 - # Define your item pipelines here 2 - # 3 - # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 - # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 1 + # For more information on item pipelines, see the Scrapy documentation in: 2 + # http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 3 import re 4 + 6 5 from scrapy.exceptions import DropItem 7 6 8 7 9 - class DuplicatePipeline(object): 8 + class RemoveNonePipeline(object): 9 + def __init__(self): 10 + pass 11 + 12 + @staticmethod 13 + def process_item(item, spider): 14 + """ 15 + Processing the items so None values are replaced by empty strings 16 + :param item: The incoming item 17 + :param spider: The spider which scraped the item 18 + :return: the item with None values replaced by empty strings 19 + """ 20 + for key in item: 21 + if item[key] is None: 22 + item[key] = "" 23 + return item 10 24 25 + 26 + class DuplicatePipeline(object): 11 27 def __init__(self): 12 28 self.known_values = set() 13 29 ··· 20 36 """ 21 37 value = (item['attribute'], item['value'], item['conditions']) 22 38 if value in self.known_values: 23 - raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item. 39 + raise DropItem("Duplicate item found: %s" % item) # [todo] append sources of first item. 24 40 else: 25 41 self.known_values.add(value) 26 42 return item 27 43 28 - class AttributeSelectionPipeline(object): 29 44 45 + class AttributeSelectionPipeline(object): 30 46 def __init__(self): 31 - pass; 47 + pass 32 48 33 - def process_item(self, item, spider): 49 + @staticmethod 50 + def process_item(item, spider): 34 51 """ 35 52 The items are processed using the selected attribute list available in the spider, 36 53 items that don't match the selected items are dropped.
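The new RemoveNonePipeline runs before attribute selection and duplicate removal (the priorities in the settings.py hunk below are 100, 200 and 300), so the duplicate check always compares empty strings rather than None. A minimal sketch of that chain, assuming the Fourmi repository is on the Python path and using plain dicts in place of Result items:

```python
# Sketch only: plain dicts stand in for Result items, and neither of these
# two pipelines needs a real spider instance.
from scrapy.exceptions import DropItem

from FourmiCrawler.pipelines import RemoveNonePipeline, DuplicatePipeline

remove_none = RemoveNonePipeline()
deduplicate = DuplicatePipeline()

items = [
    {'attribute': 'Molecular weight', 'value': '18.02 g/mol', 'conditions': None},
    {'attribute': 'Molecular weight', 'value': '18.02 g/mol', 'conditions': ''},
]

for item in items:
    item = remove_none.process_item(item, None)   # None values become ''
    try:
        deduplicate.process_item(item, None)
        print('kept: %s' % item)
    except DropItem as error:                     # second item is a duplicate
        print('dropped: %s' % error)
```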
+5 -5
FourmiCrawler/settings.py
··· 3 3 # For simplicity, this file contains only the most important settings by 4 4 # default. All the other settings are documented here: 5 5 # 6 - # http://doc.scrapy.org/en/latest/topics/settings.html 6 + # http://doc.scrapy.org/en/latest/topics/settings.html 7 7 # 8 8 9 9 BOT_NAME = 'FourmiCrawler' ··· 11 11 SPIDER_MODULES = ['FourmiCrawler'] 12 12 NEWSPIDER_MODULE = 'FourmiCrawler' 13 13 ITEM_PIPELINES = { 14 - 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, 15 - 'FourmiCrawler.pipelines.DuplicatePipeline': 200, 14 + "FourmiCrawler.pipelines.RemoveNonePipeline": 100, 15 + 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200, 16 + 'FourmiCrawler.pipelines.DuplicatePipeline': 300, 16 17 } 17 18 FEED_URI = 'results.json' 18 19 FEED_FORMAT = 'jsonlines' 19 - 20 20 21 21 # Crawl responsibly by identifying yourself (and your website) on the 22 22 # user-agent 23 23 24 - # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' 24 + USER_AGENT = 'Fourmi'
+131 -63
FourmiCrawler/sources/ChemSpider.py
··· 1 - from source import Source 1 + import re 2 + 2 3 from scrapy import log 3 4 from scrapy.http import Request 4 5 from scrapy.selector import Selector 6 + 7 + from source import Source 5 8 from FourmiCrawler.items import Result 6 - import re 9 + 7 10 8 11 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. 9 12 10 - 11 13 class ChemSpider(Source): 12 - """ChemSpider scraper for synonyms and properties 13 - 14 + """ 15 + ChemSpider scraper for synonyms and properties 14 16 This parser will manage searching for chemicals through the 15 17 ChemsSpider API, and parsing the resulting ChemSpider page. 16 18 The token required for the API should be in a configuration file 17 19 somewhere. 18 20 """ 19 21 20 - def __init__(self): 21 - Source.__init__(self) 22 + website = 'http://www\\.chemspider\\.com/.*' 22 23 23 - website = 'http://www.chemspider.com/*' 24 - 25 - # [TODO] - Save and access token of specific user. 26 - search = ('Search.asmx/SimpleSearch?query=%s&token=' 27 - '052bfd06-5ce4-43d6-bf12-89eabefd2338') 24 + search = 'Search.asmx/SimpleSearch?query=%s&token=' 28 25 structure = 'Chemical-Structure.%s.html' 29 - extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 30 - '052bfd06-5ce4-43d6-bf12-89eabefd2338') 26 + extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 31 27 32 - ignore_list = [] 28 + def __init__(self, config=None): 29 + """ 30 + Initialization of ChemSpider scraper 31 + :param config: a dictionary of settings for this scraper, must contain 32 + 'reliability' key 33 + """ 34 + Source.__init__(self, config) 35 + self.ignore_list = [] 36 + if 'token' not in self.cfg or self.cfg['token'] == '': 37 + log.msg('ChemSpider token not set or empty, search/MassSpec API ' 38 + 'not available', level=log.WARNING) 39 + self.cfg['token'] = '' 40 + self.search += self.cfg['token'] 41 + self.extendedinfo += self.cfg['token'] 33 42 34 43 def parse(self, response): 44 + """ 45 + This function is called when a Response matching the variable 46 + 'website' is available for parsing the Response object. 
47 + :param response: the Scrapy Response object to be parsed 48 + :return: a list of Result items and Request objects 49 + """ 35 50 sel = Selector(response) 36 51 requests = [] 37 52 requests_synonyms = self.parse_synonyms(sel) ··· 41 56 42 57 return requests 43 58 44 - @staticmethod 45 - def parse_properties(sel): 46 - """scrape Experimental Data and Predicted ACD/Labs tabs""" 59 + def parse_properties(self, sel): 60 + """ 61 + This function scrapes the Experimental Data and Predicted ACD/Labs tabs 62 + :param sel: a Selector object of the whole page 63 + :return: a list of Result items 64 + """ 47 65 properties = [] 48 66 49 - # Predicted - ACD/Labs tab 67 + properties.extend(self.parse_acdlabstab(sel)) 68 + properties.extend(self.parse_experimentaldatatab(sel)) 69 + 70 + return properties 71 + 72 + def parse_acdlabstab(self, sel): 73 + """ 74 + This function scrapes the 'Predicted ACD/Labs tab' under Properties 75 + :param sel: a Selector object of the whole page 76 + :return: a list of Request objects 77 + """ 78 + properties = [] 79 + 50 80 td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 51 81 'normalize-space(string())') 52 82 prop_names = td_list[::2] ··· 59 89 60 90 # Test for properties without values, with one hardcoded exception 61 91 if (not re.match(r'^\d', prop_value) or 62 - (prop_name == 'Polarizability' and 63 - prop_value == '10-24cm3')): 92 + (prop_name == 'Polarizability' and prop_value == '10-24cm3')): 64 93 continue 65 94 66 - # Match for condition in parentheses 67 95 m = re.match(r'(.*) \((.*)\)', prop_name) 68 96 if m: 69 97 prop_name = m.group(1) 70 98 prop_conditions = m.group(2) 71 99 72 - # Match for condition in value seperated by an 'at' 73 100 m = re.match(r'(.*) at (.*)', prop_value) 74 101 if m: 75 102 prop_value = m.group(1) 76 103 prop_conditions = m.group(2) 77 104 78 - new_prop = Result({ 79 - 'attribute': prop_name, 80 - 'value': prop_value, 81 - 'source': 'ChemSpider Predicted - ACD/Labs Tab', 82 - 'reliability': 'Unknown', 83 - 'conditions': prop_conditions 84 - }) 105 + new_prop = self.newresult( 106 + attribute=prop_name, 107 + value=prop_value, 108 + source='ChemSpider Predicted - ACD/Labs Tab', 109 + conditions=prop_conditions 110 + ) 85 111 properties.append(new_prop) 86 - log.msg('CS prop: |%s| |%s| |%s|' % 87 - (new_prop['attribute'], new_prop['value'], new_prop['source']), 88 - level=log.DEBUG) 89 112 90 - # Experimental Data Tab, Physico-chemical properties in particular 113 + return properties 114 + 115 + def parse_experimentaldatatab(self, sel): 116 + """ 117 + This function scrapes Experimental Data tab, Physico-chemical 118 + properties in particular. 
119 + :param sel: a Selector object of the whole page 120 + :return: a list of Result items 121 + """ 122 + properties = [] 123 + 91 124 scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' 92 125 'Properties"]//li/table/tr/td') 93 126 if not scraped_list: ··· 99 132 if line.xpath('span/text()'): 100 133 property_name = line.xpath('span/text()').extract()[0].rstrip() 101 134 else: 102 - new_prop = Result({ 103 - 'attribute': property_name[:-1], 104 - 'value': line.xpath('text()').extract()[0].rstrip(), 105 - 'source': line.xpath( 106 - 'strong/text()').extract()[0].rstrip(), 107 - 'reliability': 'Unknown', 108 - 'conditions': '' 109 - }) 110 - properties.append(new_prop) 111 - log.msg('CS prop: |%s| |%s| |%s|' % 112 - (new_prop['attribute'], new_prop['value'], 113 - new_prop['source']), level=log.DEBUG) 135 + new_prop = self.newresult( 136 + attribute=property_name[:-1], 137 + value=line.xpath('text()').extract()[0].rstrip(), 138 + source=line.xpath('strong/text()').extract()[0].rstrip(), 139 + ) 140 + properties.append(new_prop) 114 141 115 142 return properties 116 143 117 144 def parse_synonyms(self, sel): 118 - """Scrape list of Names and Identifiers""" 145 + """ 146 + This function scrapes the list of Names and Identifiers 147 + :param sel: a Selector object of the whole page 148 + :return: a list of Requests 149 + """ 119 150 requests = [] 120 151 synonyms = [] 121 152 ··· 147 178 return requests 148 179 149 180 def new_synonym(self, sel, name, category): 150 - """Scrape for a single synonym at a given HTML tag""" 181 + """ 182 + This function scrapes for a single synonym at a given HTML tag 183 + :param sel: a Selector object of the given HTML tag 184 + :param name: the name of the synonym in the tag 185 + :param category: the name of the category the synonym is labeled as 186 + :return: a dictionary containing data on the synonym 187 + """ 151 188 self.ignore_list.append(name) 152 189 language = sel.xpath('span[@class="synonym_language"]/text()') 153 190 if language: ··· 182 219 } 183 220 return synonym 184 221 185 - @staticmethod 186 - def parse_extendedinfo(response): 187 - """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" 222 + def parse_extendedinfo(self, response): 223 + """ 224 + This function scrapes data from the ChemSpider GetExtendedCompoundInfo 225 + API, if a token is present in the configuration settings 226 + :param response: a Response object to be parsed 227 + :return: a list of Result items 228 + """ 188 229 sel = Selector(response) 189 230 properties = [] 190 231 names = sel.xpath('*').xpath('name()').extract() 191 232 values = sel.xpath('*').xpath('text()').extract() 192 233 for (name, value) in zip(names, values): 193 - result = Result({ 194 - 'attribute': name, 195 - 'value': value, # These values have no unit! 196 - 'source': 'ChemSpider ExtendedCompoundInfo', 197 - 'reliability': 'Unknown', 198 - 'conditions': '' 199 - }) 234 + result = self.newresult( 235 + attribute=name, 236 + value=value, # These values have no unit! 237 + source='ChemSpider ExtendedCompoundInfo', 238 + ) 200 239 if result['value']: 201 240 properties.append(result) 202 241 return properties 203 242 243 + def newresult(self, attribute, value, conditions='', source='ChemSpider'): 244 + """ 245 + This function abstracts from the Result item and provides default 246 + values. 
247 + :param attribute: the name of the attribute 248 + :param value: the value of the attribute 249 + :param conditions: optional conditions regarding the value 250 + :param source: the name of the source if it is not ChemSpider 251 + :return: A Result item 252 + """ 253 + return Result({ 254 + 'attribute': attribute, 255 + 'value': value, 256 + 'source': source, 257 + 'reliability': self.cfg['reliability'], 258 + 'conditions': conditions 259 + }) 260 + 204 261 def parse_searchrequest(self, response): 205 - """Parse the initial response of the ChemSpider Search API """ 262 + """ 263 + This function parses the initial response of the ChemSpider Search API 264 + Requires a valid token to function. 265 + :param response: the Response object to be parsed 266 + :return: A Request for the information page and a Request for the 267 + extendedinfo API call 268 + """ 206 269 sel = Selector(response) 207 270 log.msg('chemspider parse_searchrequest', level=log.DEBUG) 208 271 sel.register_namespace('cs', 'http://www.chemspider.com/') ··· 214 277 log.msg('ChemSpider found multiple substances, taking first ' 215 278 'element', level=log.DEBUG) 216 279 csid = csids[0] 217 - structure_url = self.website[:-1] + self.structure % csid 218 - extendedinfo_url = self.website[:-1] + self.extendedinfo % csid 280 + structure_url = self.website[:-2].replace("\\", "") + self.structure % csid 281 + extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid 219 282 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) 220 283 return [Request(url=structure_url, 221 284 callback=self.parse), ··· 223 286 callback=self.parse_extendedinfo)] 224 287 225 288 def new_compound_request(self, compound): 226 - if compound in self.ignore_list: # [TODO] - add regular expression 289 + """ 290 + This function is called when a new synonym is returned to the spider 291 + to generate new requests 292 + :param compound: the name of the compound to search for 293 + """ 294 + if compound in self.ignore_list or self.cfg['token'] == '': 227 295 return None 228 - searchurl = self.website[:-1] + self.search % compound 296 + searchurl = self.website[:-2].replace("\\", "") + self.search % compound 229 297 log.msg('chemspider compound', level=log.DEBUG) 230 298 return Request(url=searchurl, callback=self.parse_searchrequest)
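Two mechanical changes recur through this file: the website attribute is now an escaped regular expression, and every absolute URL is rebuilt from it with website[:-2].replace("\\", ""), which drops the trailing .* and the backslash escapes; the API token is no longer hardcoded but appended to the search and extendedinfo templates from the source configuration. A standalone sketch of that URL construction (the token value is a dummy placeholder, not a real key):

```python
# Standalone sketch: recovering the base URL from the escaped 'website' regex
# and appending the configured token. The token below is a placeholder; the
# real value normally comes from sources.cfg.
website = 'http://www\\.chemspider\\.com/.*'
search = 'Search.asmx/SimpleSearch?query=%s&token='

cfg = {'token': 'YOUR-CHEMSPIDER-TOKEN'}
search += cfg['token']

base_url = website[:-2].replace('\\', '')  # -> 'http://www.chemspider.com/'
print(base_url + search % 'acetic acid')
```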
+334
FourmiCrawler/sources/NIST.py
··· 1 + import re 2 + 3 + from scrapy import log 4 + from scrapy.http import Request 5 + from scrapy.selector import Selector 6 + 7 + from source import Source 8 + from FourmiCrawler.items import Result 9 + 10 + 11 + # [TODO]: values can be '128.', perhaps remove the dot in that case? 12 + # [TODO]: properties have references and comments which do not exist in the 13 + # Result item, but should be included eventually. 14 + 15 + class NIST(Source): 16 + """ 17 + NIST Scraper plugin 18 + This plugin manages searching for a chemical on the NIST website 19 + and parsing the resulting page if the chemical exists on NIST. 20 + """ 21 + website = "http://webbook\\.nist\\.gov/.*" 22 + 23 + search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 + 25 + def __init__(self, config=None): 26 + """ 27 + Initialization of NIST scraper 28 + :param config: configuration variables for this scraper, must contain 29 + 'reliability' key. 30 + """ 31 + Source.__init__(self, config) 32 + self.ignore_list = set() 33 + 34 + def parse(self, response): 35 + """ 36 + This function is called when a Response matching the variable 37 + 'website' is available for parsing the Response object. 38 + :param response: The Scrapy Response object to be parsed 39 + :return: a list of Result items and Request objects 40 + """ 41 + sel = Selector(response) 42 + 43 + title = sel.xpath('head/title/text()').extract()[0] 44 + if title == 'Name Not Found': 45 + log.msg('NIST: Chemical not found!', level=log.ERROR) 46 + return 47 + if title not in self.ignore_list: 48 + self.ignore_list.update(title) 49 + log.msg('NIST emit synonym: %s' % title, level=log.DEBUG) 50 + self._spider.get_synonym_requests(title) 51 + 52 + requests = [] 53 + 54 + requests.extend(self.parse_generic_info(sel)) 55 + 56 + symbol_table = {} 57 + tds = sel.xpath('//table[@class="symbol_table"]/tr/td') 58 + for (symbol_td, name_td) in zip(tds[::2], tds[1::2]): 59 + symbol = ''.join(symbol_td.xpath('node()').extract()) 60 + name = name_td.xpath('text()').extract()[0] 61 + symbol_table[symbol] = name 62 + log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name), 63 + level=log.DEBUG) 64 + 65 + requests.extend(self.parse_tables(sel, symbol_table)) 66 + 67 + return requests 68 + 69 + def parse_tables(self, sel, symbol_table): 70 + """ 71 + This function identifies and distributes parsing of tables to other 72 + functions below. 
73 + :param sel: A Selector object of the whole page 74 + :param symbol_table: a dictionary containing translations of raw HTML 75 + tags to human readable names 76 + :return: a list of Result items and Requests 77 + """ 78 + requests = [] 79 + 80 + for table in sel.xpath('//table[@class="data"]'): 81 + summary = table.xpath('@summary').extract()[0] 82 + if summary == 'One dimensional data': 83 + log.msg('NIST table: Aggregrate data', level=log.DEBUG) 84 + requests.extend( 85 + self.parse_aggregate_data(table, symbol_table)) 86 + elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1': 87 + log.msg('NIST table; Enthalpy/entropy of phase transition', 88 + level=log.DEBUG) 89 + requests.extend(self.parse_transition_data(table, summary)) 90 + elif table.xpath('tr[1]/td'): 91 + log.msg('NIST table: Horizontal table', level=log.DEBUG) 92 + elif summary == 'Antoine Equation Parameters': 93 + log.msg('NIST table: Antoine Equation Parameters', 94 + level=log.DEBUG) 95 + requests.extend(self.parse_antoine_data(table, summary)) 96 + elif len(table.xpath('tr[1]/th')) == 5: 97 + log.msg('NIST table: generic 5 columns', level=log.DEBUG) 98 + # Symbol (unit) Temperature (K) Method Reference Comment 99 + requests.extend(self.parse_generic_data(table, summary)) 100 + elif len(table.xpath('tr[1]/th')) == 4: 101 + log.msg('NIST table: generic 4 columns', level=log.DEBUG) 102 + # Symbol (unit) Temperature (K) Reference Comment 103 + requests.extend(self.parse_generic_data(table, summary)) 104 + else: 105 + log.msg('NIST table: NOT SUPPORTED', level=log.WARNING) 106 + continue # Assume unsupported 107 + return requests 108 + 109 + def parse_generic_info(self, sel): 110 + """ 111 + This function parses: synonyms, chemical formula, molecular weight, 112 + InChI, InChiKey, CAS number 113 + :param sel: A Selector object of the entire page in the original 114 + response 115 + :return: a list of Result items 116 + """ 117 + ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') 118 + 119 + raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() 120 + for synonym in raw_synonyms[0].strip().split(';\n'): 121 + log.msg('NIST synonym: %s' % synonym, level=log.DEBUG) 122 + self.ignore_list.update(synonym) 123 + self._spider.get_synonym_requests(synonym) 124 + 125 + data = {} 126 + 127 + raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract() 128 + data['Chemical formula'] = ''.join(raw_formula[2:]).strip() 129 + 130 + raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()') 131 + data['Molecular weight'] = raw_mol_weight.extract()[0].strip() 132 + 133 + raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()') 134 + data['IUPAC Standard InChI'] = raw_inchi.extract()[0] 135 + 136 + raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]' 137 + '/tt/text()') 138 + data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0] 139 + 140 + raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()') 141 + data['CAS Registry Number'] = raw_cas_number.extract()[0].strip() 142 + 143 + requests = [] 144 + for key, value in data.iteritems(): 145 + result = self.newresult( 146 + attribute=key, 147 + value=value 148 + ) 149 + requests.append(result) 150 + 151 + return requests 152 + 153 + def parse_aggregate_data(self, table, symbol_table): 154 + """ 155 + This function parses the table(s) which contain possible links to 156 + individual data points 157 + :param table: a Selector object of the table to be parsed 158 + :param symbol_table: a dictionary 
containing translations of raw HTML 159 + tags to human readable names 160 + :return: a list of Result items and Request objects 161 + """ 162 + results = [] 163 + for tr in table.xpath('tr[td]'): 164 + extra_data_url = tr.xpath('td[last()][a="Individual data points"]' 165 + '/a/@href').extract() 166 + if extra_data_url: 167 + request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0], 168 + callback=self.parse_individual_datapoints) 169 + results.append(request) 170 + continue 171 + data = [] 172 + for td in tr.xpath('td'): 173 + data.append(''.join(td.xpath('node()').extract())) 174 + 175 + name = symbol_table[data[0]] 176 + condition = '' 177 + 178 + m = re.match(r'(.*) at (.*)', name) 179 + if m: 180 + name = m.group(1) 181 + condition = m.group(2) 182 + 183 + result = self.newresult( 184 + attribute=name, 185 + value=data[1] + ' ' + data[2], 186 + conditions=condition 187 + ) 188 + log.msg('NIST: |%s|' % data, level=log.DEBUG) 189 + results.append(result) 190 + return results 191 + 192 + def parse_transition_data(self, table, summary): 193 + """ 194 + This function parses the table containing properties regarding phase 195 + changes 196 + :param table: a Selector object of the table to be parsed 197 + :param summary: the name of the property 198 + :return: a list of Result items 199 + """ 200 + results = [] 201 + 202 + unit = self.get_unit(table) 203 + 204 + for tr in table.xpath('tr[td]'): 205 + tds = tr.xpath('td/text()').extract() 206 + result = self.newresult( 207 + attribute=summary, 208 + value=tds[0] + ' ' + unit, 209 + conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) 210 + ) 211 + results.append(result) 212 + 213 + return results 214 + 215 + def parse_generic_data(self, table, summary): 216 + """ 217 + Parses the common tables of 4 and 5 rows. 
Assumes they are of the 218 + form: 219 + Symbol (unit)|Temperature (K)|Method|Reference|Comment 220 + Symbol (unit)|Temperature (K)|Reference|Comment 221 + :param table: a Selector object of the table to be parsed 222 + :param summary: the name of the property 223 + :return: a list of Result items 224 + """ 225 + results = [] 226 + 227 + unit = self.get_unit(table) 228 + 229 + for tr in table.xpath('tr[td]'): 230 + tds = tr.xpath('td/text()').extract() 231 + result = self.newresult( 232 + attribute=summary, 233 + value=tds[0] + ' ' + unit, 234 + conditions='%s K' % tds[1] 235 + ) 236 + results.append(result) 237 + return results 238 + 239 + def parse_antoine_data(self, table, summary): 240 + """ 241 + This function parses the table containing parameters for the Antione 242 + equation 243 + :param table: a Selector object of the table to be parsed 244 + :param summary: the name of the property 245 + :return: a list of Result items 246 + """ 247 + results = [] 248 + 249 + for tr in table.xpath('tr[td]'): 250 + tds = tr.xpath('td/text()').extract() 251 + result = self.newresult( 252 + attribute=summary, 253 + value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), 254 + conditions='%s K' % tds[0] 255 + ) 256 + results.append(result) 257 + 258 + return results 259 + 260 + def parse_individual_datapoints(self, response): 261 + """ 262 + This function parses the 'individual data points' page linked from 263 + the aggregate data table(s) 264 + :param response: the Scrapy Response object to be parsed 265 + :return: a list of Result items 266 + """ 267 + sel = Selector(response) 268 + table = sel.xpath('//table[@class="data"]')[0] 269 + 270 + results = [] 271 + 272 + name = table.xpath('@summary').extract()[0] 273 + condition = '' 274 + m = re.match(r'(.*) at (.*)', name) 275 + if m: 276 + name = m.group(1) 277 + condition = m.group(2) 278 + 279 + unit = self.get_unit(table) 280 + 281 + for tr in table.xpath('tr[td]'): 282 + tds = tr.xpath('td/text()').extract() 283 + uncertainty = '' 284 + m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1]) 285 + if m: 286 + uncertainty = '+- %s ' % m.group(1) 287 + # [TODO]: get the plusminus sign working in here 288 + result = self.newresult( 289 + attribute=name, 290 + value='%s %s%s' % (tds[0], uncertainty, unit), 291 + conditions=condition 292 + ) 293 + results.append(result) 294 + 295 + return results 296 + 297 + @staticmethod 298 + def get_unit(table): 299 + tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) 300 + m = re.search(r'\((.*)\)', tr_unit) 301 + unit = '!' 
302 + if m: 303 + unit = m.group(1) 304 + 305 + return unit 306 + 307 + def newresult(self, attribute, value, conditions=''): 308 + """ 309 + This function abstracts from the Result item and provides default 310 + values 311 + :param attribute: the name of the attribute 312 + :param value: the value of the attribute 313 + :param conditions: optional conditions regarding the value 314 + :return: A Result item 315 + """ 316 + return Result( 317 + { 318 + 'attribute': attribute, 319 + 'value': value, 320 + 'source': 'NIST', 321 + 'reliability': self.cfg['reliability'], 322 + 'conditions': conditions 323 + }) 324 + 325 + def new_compound_request(self, compound): 326 + """ 327 + This function is called when a new synonym is returned to the spider 328 + to generate new requests 329 + :param compound: the name of the compound to search for 330 + """ 331 + if compound not in self.ignore_list: 332 + self.ignore_list.update(compound) 333 + return Request(url=self.website[:-2].replace("\\", "") + self.search % compound, 334 + callback=self.parse)
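Most of the NIST parsing revolves around small regular expressions: get_unit() pulls the unit out of the first column header and falls back to '!', and parse_individual_datapoints() extracts the TRC uncertainty from the comment column. A sketch of both, run on made-up input strings rather than actual scraped NIST output:

```python
import re

# Sketch of the two helper regexes on illustrative strings.
header = 'T (K)'
m = re.search(r'\((.*)\)', header)
unit = m.group(1) if m else '!'            # get_unit() falls back to '!'
print(unit)                                # -> K

comment = 'Uncertainty assigned by TRC = 0.3 K; based on ...'
m = re.search('Uncertainty assigned by TRC = (.*?) ', comment)
uncertainty = '+- %s ' % m.group(1) if m else ''
print('%s%s' % (uncertainty, unit))        # -> +- 0.3 K
```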
+149
FourmiCrawler/sources/PubChem.py
··· 1 + import re 2 + 3 + from scrapy.http import Request 4 + from scrapy import log 5 + from scrapy.selector import Selector 6 + 7 + from source import Source 8 + from FourmiCrawler.items import Result 9 + 10 + 11 + class PubChem(Source): 12 + """ PubChem scraper for chemical properties 13 + 14 + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance, 15 + including sources of the values of properties. 16 + """ 17 + 18 + # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 19 + website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' 20 + website_www = 'http://www.ncbi.nlm.nih.gov/*' 21 + website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' 22 + search = 'pccompound?term=%s' 23 + data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' 24 + 25 + __spider = None 26 + searched_compounds = set() 27 + 28 + def __init__(self, config): 29 + Source.__init__(self, config) 30 + self.cfg = config 31 + 32 + def parse(self, response): 33 + """ 34 + Distributes the above described behaviour 35 + :param response: The incoming search request 36 + :return Returns the found properties if response is unique or returns none if it's already known 37 + """ 38 + requests = [] 39 + log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) 40 + 41 + sel = Selector(response) 42 + compound = sel.xpath('//h1/text()').extract()[0] 43 + if compound in self.searched_compounds: 44 + return None 45 + 46 + self.searched_compounds.update(compound) 47 + raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] 48 + for synonym in raw_synonyms.strip().split(', '): 49 + log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) 50 + self.searched_compounds.update(synonym) 51 + self._spider.get_synonym_requests(synonym) 52 + log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) 53 + 54 + n = re.search(r'cid=(\d+)', response.url) 55 + if n: 56 + cid = n.group(1) 57 + log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach 58 + # the seperate html page which contains the properties and their values 59 + 60 + # using this cid to get the right url and scrape it 61 + requests.append( 62 + Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data)) 63 + return requests 64 + 65 + def parse_data(self, response): 66 + """ 67 + Parse data found in 'Chemical and Physical properties' part of a substance page. 68 + :param response: The response with the page to parse 69 + :return: requests: Returns a list of properties with their values, source, etc. 
70 + """ 71 + log.msg('parsing data', level=log.DEBUG) 72 + requests = [] 73 + 74 + sel = Selector(response) 75 + props = sel.xpath('//div') 76 + 77 + for prop in props: 78 + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing 79 + if prop.xpath('a'): # parsing for single value in property 80 + prop_source = ''.join(prop.xpath('a/@title').extract()) 81 + prop_value = ''.join(prop.xpath('a/text()').extract()) 82 + new_prop = Result({ 83 + 'attribute': prop_name, 84 + 'value': prop_value, 85 + 'source': prop_source, 86 + 'reliability': self.cfg['reliability'], 87 + 'conditions': '' 88 + }) 89 + log.msg('PubChem prop: |%s| |%s| |%s|' % 90 + (new_prop['attribute'], new_prop['value'], 91 + new_prop['source']), level=log.DEBUG) 92 + requests.append(new_prop) 93 + elif prop.xpath('ul'): # parsing for multiple values (list) in property 94 + prop_values = prop.xpath('ul//li') 95 + for prop_li in prop_values: 96 + prop_value = ''.join(prop_li.xpath('a/text()').extract()) 97 + prop_source = ''.join(prop_li.xpath('a/@title').extract()) 98 + new_prop = Result({ 99 + 'attribute': prop_name, 100 + 'value': prop_value, 101 + 'source': prop_source, 102 + 'reliability': self.cfg['reliability'], 103 + 'conditions': '' 104 + }) 105 + log.msg('PubChem prop: |%s| |%s| |%s|' % 106 + (new_prop['attribute'], new_prop['value'], 107 + new_prop['source']), level=log.DEBUG) 108 + requests.append(new_prop) 109 + 110 + return requests 111 + 112 + def parse_searchrequest(self, response): 113 + """ 114 + This function parses the response to the new_compound_request Request 115 + :param response: the Response object to be parsed 116 + :return: A Request for the compound page or what self.parse returns in 117 + case the search request forwarded to the compound page 118 + """ 119 + 120 + # check if pubchem forwarded straight to compound page 121 + m = re.match(self.website_pubchem, response.url) 122 + if m: 123 + log.msg('PubChem search forwarded to compound page', 124 + level=log.DEBUG) 125 + return self.parse(response) 126 + 127 + sel = Selector(response) 128 + 129 + results = sel.xpath('//div[@class="rsltcont"]') 130 + if results: 131 + url = results[0].xpath('div/p/a[1]/@href') 132 + else: 133 + log.msg('PubChem search found nothing or xpath failed', 134 + level=log.DEBUG) 135 + return None 136 + 137 + if url: 138 + url = 'http:' + ''.join(url[0].extract()) 139 + log.msg('PubChem compound page: %s' % url, level=log.DEBUG) 140 + else: 141 + log.msg('PubChem search found results, but no url in first result', 142 + level=log.DEBUG) 143 + return None 144 + 145 + return Request(url=url, callback=self.parse) 146 + 147 + def new_compound_request(self, compound): 148 + return Request(url=self.website_www[:-1] + self.search % compound, 149 + callback=self.parse_searchrequest)
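PubChem keeps the compound description and the property listing on separate pages, so the parser first extracts the numeric compound id (cid) from the URL it landed on and then requests the summary_toc page for that cid. A sketch of that URL handling; the response URL and cid are example values only:

```python
import re

# Sketch of the cid extraction and the follow-up data URL.
website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

response_url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=962'
n = re.search(r'cid=(\d+)', response_url)
if n:
    cid = n.group(1)
    print(website_pubchem[:-2].replace('\\', '') + data_url % cid)
    # -> http://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=962
```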
+97 -45
FourmiCrawler/sources/WikipediaParser.py
··· 1 + import re 2 + 1 3 from scrapy.http import Request 2 4 from scrapy import log 5 + from scrapy.selector import Selector 6 + 3 7 from source import Source 4 - from scrapy.selector import Selector 5 8 from FourmiCrawler.items import Result 6 - import re 7 9 8 10 9 11 class WikipediaParser(Source): 10 12 """ Wikipedia scraper for chemical properties 11 13 12 14 This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values. 13 - It also returns requests with other external sources which contain information on parsed subject. 15 + It also returns requests with other external sources which contain information on parsed subject. 14 16 """ 15 17 16 - website = "http://en.wikipedia.org/wiki/*" 18 + website = "http://en\\.wikipedia\\.org/wiki/.*" 17 19 __spider = None 18 20 searched_compounds = [] 19 21 20 - def __init__(self): 21 - Source.__init__(self) 22 + def __init__(self, config=None): 23 + Source.__init__(self, config) 22 24 23 25 def parse(self, response): 24 - """ Distributes the above described behaviour """ 26 + """ 27 + Distributes the above described behaviour 28 + :param response: The incoming search request 29 + :return: Returns the found properties if response is unique or returns none if it's already known 30 + """ 25 31 log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) 26 32 sel = Selector(response) 27 33 compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page ··· 33 39 return items 34 40 35 41 def parse_infobox(self, sel): 36 - """ scrape data from infobox on wikipedia. """ 42 + """ 43 + Scrape data from infobox on wikipedia. 44 + 45 + Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and 46 + :param sel: The selector with the html-information of the page to parse 47 + :return: item_list: Returns a list of properties with their values, source, etc.. 48 + """ 49 + 37 50 items = [] 38 51 39 - #be sure to get chembox (wikipedia template) 40 - tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). 
\ 41 - xpath('normalize-space(string())') 42 - prop_names = tr_list[::2] 43 - prop_values = tr_list[1::2] 44 - for i, prop_name in enumerate(prop_names): 45 - item = Result({ 46 - 'attribute': prop_name.extract().encode('utf-8'), 47 - 'value': prop_values[i].extract().encode('utf-8'), 48 - 'source': "Wikipedia", 49 - 'reliability': "Unknown", 50 - 'conditions': "" 51 - }) 52 - items.append(item) 53 - log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 52 + # scrape the chembox (wikipedia template) 53 + items = self.parse_chembox(sel, items) 54 54 55 - #scrape the drugbox (wikipedia template) 56 - tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 57 - log.msg('dit: %s' % tr_list2, level=log.DEBUG) 58 - for tablerow in tr_list2: 59 - log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 60 - if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 61 - 'normalize-space(string())'): 62 - item = Result({ 63 - 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 64 - 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 65 - 'source': "Wikipedia", 66 - 'reliability': "Unknown", 67 - 'conditions': "" 68 - }) 69 - items.append(item) 70 - log.msg( 71 - 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 72 - level=log.DEBUG) 55 + # scrape the drugbox (wikipedia template) 56 + items = self.parse_drugbox(sel, items) 73 57 74 58 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value 75 59 item_list = self.clean_items(items) ··· 93 77 94 78 return item_list 95 79 80 + def parse_chembox(self, sel, items): 81 + """ 82 + Scrape data from chembox infobox on wikipedia. 83 + 84 + :param sel: The selector with the html-information of the page to parse 85 + :param items: the list of items where the result have to be stored in 86 + :return: items: the list of items with the new found and stored items 87 + """ 88 + tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ 89 + xpath('normalize-space(string())') 90 + prop_names = tr_list[::2] 91 + prop_values = tr_list[1::2] 92 + for i, prop_name in enumerate(prop_names): 93 + item = self.newresult( 94 + attribute=prop_name.extract().encode('utf-8'), 95 + value=prop_values[i].extract().encode('utf-8') 96 + ) 97 + items.append(item) 98 + log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 99 + return items 100 + 101 + def parse_drugbox(self, sel, items): 102 + """ 103 + Scrape data from drugbox infobox on wikipedia. 
104 + 105 + :param sel: The selector with the html-information of the page to parse 106 + :param items: the list of items where the result have to be stored in 107 + :return: items: the list of items with the new found and stored items 108 + """ 109 + tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 110 + log.msg('dit: %s' % tr_list2, level=log.DEBUG) 111 + for tablerow in tr_list2: 112 + log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 113 + if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 114 + 'normalize-space(string())'): 115 + item = self.newresult( 116 + attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 117 + value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 118 + ) 119 + items.append(item) 120 + log.msg( 121 + 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 122 + level=log.DEBUG) 123 + return items 124 + 96 125 def new_compound_request(self, compound): 97 - return Request(url=self.website[:-1] + compound, callback=self.parse) 126 + return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) 98 127 99 128 @staticmethod 100 129 def clean_items(items): 101 - """ clean up properties using regex, makes it possible to split the values from the units """ 130 + 131 + """ 132 + Clean up properties using regex, makes it possible to split the values from the units 133 + 134 + Almost not in use, only cleans J/K/mol values and boiling/melting points. 135 + 136 + :param items: List of properties with their values, source, etc.. 137 + :return: items: List of now cleaned up items 138 + """ 102 139 for item in items: 103 140 value = item['value'] 104 141 m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) ··· 111 148 112 149 @staticmethod 113 150 def get_identifiers(sel): 114 - """ find external links, named 'Identifiers' to different sources. """ 151 + """ 152 + Find external links, named 'Identifiers' to different sources. 153 + 154 + :param sel: The selector with the html-information of the page to parse 155 + :return: links: New links which can be used to expand the crawlers search 156 + """ 115 157 links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' 116 158 '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() 117 - return links 159 + return links 160 + 161 + def newresult(self, attribute, value): 162 + return Result( 163 + { 164 + 'attribute': attribute, 165 + 'value': value, 166 + 'source': 'Wikipedia', 167 + 'reliability': self.cfg['reliability'], 168 + 'conditions': '' 169 + })
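clean_items() mostly strips temperature values down to the Kelvin figure that Wikipedia prints after the Fahrenheit value. A sketch of that regex on an invented infobox cell:

```python
import re

# Sketch of the clean_items() Kelvin regex on a made-up infobox value.
value = '100 C (212 F; 373.15 K)'
m = re.search(r'F;\s(\d+[\.,]?\d*)', value)
if m:
    print(m.group(1))   # -> 373.15
```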
+25 -5
FourmiCrawler/sources/source.py
··· 3 3 4 4 5 5 class Source: 6 - website = "http://something/*" # Regex of URI's the source is able to parse 6 + website = "http://something/.*" # Regex of URI's the source is able to parse 7 7 _spider = None 8 8 9 - def __init__(self): 9 + def __init__(self, config=None): 10 + """ 11 + Initiation of a new Source 12 + """ 13 + self.cfg = {} 14 + if config is not None: 15 + self.cfg = config 10 16 pass 11 17 12 - def parse(self, reponse): 13 - log.msg("The parse function of the empty parser was used.", level=log.WARNING) 18 + def parse(self, response): 19 + """ 20 + This function should be able to parse all Scrapy Response objects with a URL matching the website Regex. 21 + :param response: A Scrapy Response object 22 + :return: A list of Result items and new Scrapy Requests 23 + """ 24 + log.msg("The parse function of the empty source was used.", level=log.WARNING) 14 25 pass 15 26 16 27 def new_compound_request(self, compound): 17 - # return Request(url=self.website[:-1] + compound, callback=self.parse) 28 + """ 29 + This function should return a Scrapy Request for the given compound request. 30 + :param compound: A compound name. 31 + :return: A new Scrapy Request 32 + """ 33 + # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) 18 34 pass 19 35 20 36 def set_spider(self, spider): 37 + """ 38 + A Function to save the associated spider. 39 + :param spider: A FourmiSpider object 40 + """ 21 41 self._spider = spider
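Every scraper in FourmiCrawler/sources follows this interface: a website regex, a constructor that stores the per-source configuration in self.cfg, a parse() for matching responses and a new_compound_request() for new synonyms. A minimal skeleton of an additional source, meant to live in the same package; ExampleDB and its URL are hypothetical:

```python
# Hypothetical skeleton of a new source plugin; the structure mirrors the
# Source interface above, the names and URL are made up.
from scrapy.http import Request

from source import Source


class ExampleDB(Source):
    website = 'http://database\\.example\\.com/.*'  # regex of URLs this source can parse
    search = 'search?q=%s'

    def __init__(self, config=None):
        Source.__init__(self, config)

    def parse(self, response):
        # Return a list of Result items and/or follow-up Requests here.
        return []

    def new_compound_request(self, compound):
        return Request(url=self.website[:-2].replace('\\', '') + self.search % compound,
                       callback=self.parse)
```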
+61 -23
FourmiCrawler/spider.py
··· 1 + import re 2 + 1 3 from scrapy.spider import Spider 2 4 from scrapy import log 3 - import re 4 5 5 6 6 7 class FourmiSpider(Spider): 8 + """ 9 + A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data. 10 + """ 7 11 name = "FourmiSpider" 8 - __parsers = [] 9 - synonyms = [] 10 12 11 - def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): 13 + def __init__(self, compound=None, selected_attributes=None, *args, **kwargs): 14 + """ 15 + Initiation of the Spider 16 + :param compound: compound that will be searched. 17 + :param selected_attributes: A list of regular expressions that the attributes should match. 18 + """ 19 + self._sources = [] 20 + self.synonyms = set() 12 21 super(FourmiSpider, self).__init__(*args, **kwargs) 13 - self.synonyms.append(compound) 14 - self.selected_attributes = selected_attributes; 22 + self.synonyms.add(compound) 23 + if selected_attributes is None: 24 + self.selected_attributes = [".*"] 25 + else: 26 + self.selected_attributes = selected_attributes 15 27 16 - def parse(self, reponse): 17 - for parser in self.__parsers: 18 - if re.match(parser.website, reponse.url): 19 - log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG) 20 - return parser.parse(reponse) 28 + def parse(self, response): 29 + """ 30 + The function that is called when a response to a request is available. This function distributes this to a 31 + source which should be able to handle parsing the data. 32 + :param response: A Scrapy Response object that should be parsed 33 + :return: A list of Result items and new Request to be handled by the scrapy core. 34 + """ 35 + for source in self._sources: 36 + if re.match(source.website, response.url): 37 + log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG) 38 + return source.parse(response) 39 + log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO) 21 40 return None 22 41 23 - def get_synonym_requests(self, compound): 42 + def get_synonym_requests(self, compound, force=False): 43 + """ 44 + A function that generates new Scrapy Request for each source given a new synonym of a compound. 45 + :param compound: A compound name 46 + :return: A list of Scrapy Request objects 47 + """ 24 48 requests = [] 25 - for parser in self.__parsers: 26 - parser_requests = parser.new_compound_request(compound) 27 - if parser_requests is not None: 28 - requests.append(parser_requests) 49 + if force or compound not in self.synonyms: 50 + self.synonyms.add(compound) 51 + for parser in self._sources: 52 + parser_requests = parser.new_compound_request(compound) 53 + if parser_requests is not None: 54 + requests.append(parser_requests) 29 55 return requests 30 56 31 57 def start_requests(self): 58 + """ 59 + The function called by Scrapy for it's first Requests 60 + :return: A list of Scrapy Request generated from the known synonyms using the available sources. 61 + """ 32 62 requests = [] 33 63 for synonym in self.synonyms: 34 - requests.extend(self.get_synonym_requests(synonym)) 64 + requests.extend(self.get_synonym_requests(synonym, force=True)) 35 65 return requests 36 66 37 - def add_parsers(self, parsers): 38 - for parser in parsers: 39 - self.add_parser(parser) 67 + def add_sources(self, sources): 68 + """ 69 + A function to add a new Parser objects to the list of available sources. 70 + :param sources: A list of Source Objects. 
71 + """ 72 + for parser in sources: 73 + self.add_source(parser) 40 74 41 - def add_parser(self, parser): 42 - self.__parsers.append(parser) 43 - parser.set_spider(self) 75 + def add_source(self, source): 76 + """ 77 + A function to add a new Source object to the list of available sources. 78 + :param source: A Source Object 79 + """ 80 + self._sources.append(source) 81 + source.set_spider(self)
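In normal use fourmi.py discovers the sources and attaches them to the spider; the same wiring can be done by hand, which also shows how start_requests() fans a compound out over every registered source. A sketch, assuming the repository is importable (the reliability value passed to the source is illustrative):

```python
# Manual wiring sketch (fourmi.py normally does this through its source
# loading machinery); assumes the Fourmi package is on the Python path.
from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.WikipediaParser import WikipediaParser

spider = FourmiSpider(compound='methane')
spider.add_source(WikipediaParser({'reliability': 'Unknown'}))  # illustrative config

# One Request per known synonym per source.
for request in spider.start_requests():
    print(request.url)   # -> http://en.wikipedia.org/wiki/methane
```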
+1
GUI/__init__.py
··· 1 + import gui
+30
GUI/configImporter.py
··· 1 + import ConfigParser 2 + 3 + 4 + class ConfigImporter(): 5 + def __init__(self, filename): 6 + """Read the filename into the parser.""" 7 + self.filename = filename 8 + self.parser = ConfigParser.ConfigParser() 9 + self.parser.read(self.filename) 10 + 11 + def load_common_attributes(self): 12 + """Loads common attributes from the initialized file.""" 13 + try: 14 + return self.parser.get('GUI', 'CommonParameters') 15 + except: 16 + return 'One, Two, Three' 17 + 18 + def load_output_types(self): 19 + """Loads output types from the initialized file.""" 20 + try: 21 + return self.parser.get('GUI', 'OutputTypes') 22 + except: 23 + return 'csv' 24 + 25 + def load_always_attributes(self): 26 + """Loads attributes that are always searched for from the initialized file.""" 27 + try: 28 + return self.parser.get('GUI', 'AlwaysParameters') 29 + except: 30 + return 'Name, Weight'
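ConfigImporter is a thin wrapper around ConfigParser with hardcoded fallbacks, so the GUI still starts when GUI.cfg or one of its keys is missing. A usage sketch, run from inside the GUI package directory (so the plain 'configImporter' import used by gui.py resolves) against the GUI.cfg.sample shown further below:

```python
# Usage sketch; output depends on the GUI.cfg that is actually present,
# otherwise the built-in defaults are returned.
from configImporter import ConfigImporter

config = ConfigImporter('GUI.cfg')
print(config.load_common_attributes())  # e.g. 'Weight, Polarity, Viscosity, Solubility, Name'
print(config.load_output_types())       # e.g. 'csv, json, jsonlines, xml'
print(config.load_always_attributes())  # e.g. 'Name'
```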
+196
GUI/gui.py
··· 1 + from Tkinter import * 2 + import os 3 + import shutil 4 + from tkFileDialog import asksaveasfilename 5 + 6 + from configImporter import * 7 + 8 + 9 + class GUI(): 10 + def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True): 11 + """Boots the window, configuration.""" 12 + if not in_source: 13 + current_dir = os.path.dirname(os.path.abspath(__file__)) 14 + config_file = current_dir + '../' + config_file 15 + if not os.path.isfile(config_file): 16 + try: 17 + shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file) 18 + except IOError: 19 + print "GUI configuration couldn't be found and couldn't be created." 20 + sys.exit() 21 + self.configurator = ConfigImporter(config_file) 22 + self.sourceloader = sourceloader 23 + self.finish_with_search = False 24 + self.values = {} 25 + self.required_variables = ['substance'] 26 + self.search = search 27 + self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types()) 28 + 29 + def load_common_attributes(self): 30 + """Calls the configuration parser for common attributes.""" 31 + return [x.strip() for x in self.configurator.load_common_attributes().split(',')] 32 + 33 + def load_output_types(self): 34 + """Calls the configuration parser for output types.""" 35 + return [x.strip() for x in self.configurator.load_output_types().split(',')] 36 + 37 + def load_always_attributes(self): 38 + """Calls the configuration parser for attributes that are always used.""" 39 + return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')]) 40 + 41 + def set_output(self): 42 + self.variable_output_name.set(asksaveasfilename()) 43 + self.button_output_name.config(text=self.variable_output_name.get()) 44 + 45 + def generate_window(self, common_attributes, output_types): 46 + """Creates all widgets and variables in the window.""" 47 + window = Tk() 48 + window.wm_title("Fourmi Crawler") 49 + 50 + variables = {} 51 + 52 + variable_substance = StringVar(window) 53 + frame_substance = Frame(window) 54 + label_substance = Label(frame_substance, text="Substance: ") 55 + input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance) 56 + variables.update({"substance": variable_substance}) 57 + frame_substance.pack(side=TOP) 58 + label_substance.pack() 59 + input_substance.pack() 60 + input_substance.focus() 61 + 62 + frame_all_attributes = Frame(window) 63 + frame_selecting_attributes = Frame(frame_all_attributes) 64 + frame_new_attributes = Frame(frame_selecting_attributes) 65 + label_new_attributes = Label(frame_new_attributes, text="Parameters: ") 66 + input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5) 67 + variables.update({"new_attributes": input_new_attributes}) 68 + frame_new_attributes.pack(side=LEFT) 69 + label_new_attributes.pack() 70 + input_new_attributes.pack() 71 + 72 + frame_common_attributes = Frame(frame_selecting_attributes) 73 + label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ") 74 + input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7) 75 + scrollbar_common_attributes = Scrollbar(frame_common_attributes) 76 + input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set) 77 + scrollbar_common_attributes.config(command=input_common_attributes.yview) 78 + if common_attributes and len(common_attributes) > 0: 79 + 
input_common_attributes.insert(END, *common_attributes) 80 + variables.update({"common_attributes": input_common_attributes}) 81 + frame_common_attributes.pack(side=RIGHT) 82 + label_common_attributes.pack(side=TOP) 83 + input_common_attributes.pack(side=LEFT) 84 + scrollbar_common_attributes.pack(side=RIGHT, fill=Y) 85 + frame_selecting_attributes.pack() 86 + 87 + frame_last = Frame(window) 88 + search_button = Button(frame_last, text="Start search", command=self.prepare_search) 89 + cancel_button = Button(frame_last, text="Cancel", command=window.destroy) 90 + frame_last.pack(side=BOTTOM) 91 + search_button.pack(side=LEFT) 92 + cancel_button.pack(side=RIGHT) 93 + 94 + frame_name = Frame(window) 95 + frame_output_name = Frame(frame_name) 96 + label_output_name = Label(frame_output_name, text='Output file:') 97 + self.variable_output_name = StringVar() 98 + self.variable_output_name.set('results.csv') 99 + variables.update({'output_name':self.variable_output_name}) 100 + self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file") 101 + frame_output_name.pack(side=LEFT) 102 + label_output_name.pack() 103 + self.button_output_name.pack() 104 + frame_name.pack(side=BOTTOM) 105 + 106 + 107 + frame_checkboxes = Frame(window) 108 + frame_checkbox_attributes = Frame(frame_checkboxes) 109 + variable_all_attributes = BooleanVar() 110 + variable_all_attributes.set(True) 111 + input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters", 112 + variable=variable_all_attributes) 113 + variables.update({"all_attributes": variable_all_attributes}) 114 + frame_checkbox_attributes.pack(side=LEFT) 115 + input_all_attributes.pack() 116 + 117 + frame_logging = Frame(frame_checkboxes) 118 + variable_logging = BooleanVar() 119 + variable_logging.set(False) 120 + input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging) 121 + variables.update({'logging':variable_logging}) 122 + frame_logging.pack(side=RIGHT) 123 + frame_checkboxes.pack(side=BOTTOM) 124 + input_logging.pack() 125 + frame_all_attributes.pack() 126 + 127 + return window, variables 128 + 129 + def prepare_search(self): 130 + """Saves the values from the window for later retrieval.""" 131 + variables = self.variables 132 + values = {} 133 + 134 + values.update({"Always attributes": self.load_always_attributes()}) 135 + for name, var in variables.iteritems(): 136 + if var.__class__ is StringVar: 137 + values.update({name: var.get()}) 138 + elif var.__class__ is BooleanVar: 139 + values.update({name: var.get()}) 140 + elif var.__class__ is Text: 141 + values.update({name: str(var.get("1.0", END)).strip()}) 142 + elif var.__class__ is Listbox: 143 + values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])}) 144 + else: 145 + print "No known class, {}, {}".format(name, var) 146 + 147 + values.update({'output_name':self.variable_output_name.get()}) 148 + values.update({'output_type':self.check_output_type(values.get('output_name'))}) 149 + 150 + self.values = values 151 + if all([values.get(i) != '' for i in self.required_variables]): 152 + self.finish_with_search = True 153 + self.window.destroy() 154 + else: 155 + self.finish_with_search = False 156 + #tkMessageBox.showinfo('Not all required information was entered!') 157 + 158 + def execute_search(self): 159 + """Calls the Fourmi crawler with the values from the GUI""" 160 + if self.values.get('all_attributes'): 161 + attributes = ".*" 162 + else: 163 + attribute_types = 
['attributes', 'Common attributes', 'Always attributes'] 164 + attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types]) 165 + output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths 166 + 167 + arguments = {'--attributes': attributes, 168 + '--exclude': None, 169 + '--format': self.values.get('output_type'), 170 + '--help': False, 171 + '--include': None, 172 + '--log': 'log.txt', 173 + '--output': output_file, 174 + '-v': 0 if self.values.get('logging') else 3, 175 + '--version': False, 176 + '<compound>': self.values.get('substance'), 177 + 'list': False, 178 + 'search': True} 179 + 180 + self.search(arguments, self.sourceloader) 181 + 182 + def run(self): 183 + """Starts the window and the search.""" 184 + self.window.mainloop() 185 + if self.finish_with_search: 186 + self.execute_search() 187 + 188 + def check_output_type(self, filename): 189 + parts = str(filename).split('.') 190 + output_types = self.load_output_types() 191 + extension = parts[-1] 192 + 193 + for type in output_types: 194 + if extension==type: 195 + return extension 196 + return output_types[0]
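check_output_type() maps the chosen output file name onto one of the configured output types, defaulting to the first one ('csv' in the sample configuration) when the extension is unknown; that value ends up as the --format argument handed to the crawler. A standalone restatement of that logic:

```python
# Standalone restatement of check_output_type(); the type list matches
# GUI.cfg.sample, other configurations may differ.
output_types = ['csv', 'json', 'jsonlines', 'xml']


def check_output_type(filename):
    extension = str(filename).split('.')[-1]
    return extension if extension in output_types else output_types[0]


print(check_output_type('results.json'))  # -> json
print(check_output_type('results.out'))   # -> csv
```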
+10
GUI.cfg.sample
··· 1 + [GUI] 2 + # Personalize options in your User Interface 3 + 4 + # Commonly used parameters are listed in the GUI for easy selection 5 + CommonParameters = Weight, Polarity, Viscosity, Solubility, Name 6 + 7 + # Parameters that are always used in the search 8 + AlwaysParameters = Name 9 + 10 + OutputTypes = csv, json, jsonlines, xml
+9 -10
README.md
··· 1 1 # Fourmi 2 2 3 + **Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master) 4 + 5 + **Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop) 6 + 3 7 Fourmi is an web scraper for chemical substances. The program is designed to be 4 8 used as a search engine to search multiple chemical databases for a specific 5 9 substance. The program will produce all available attributes of the substance ··· 19 23 20 24 ### Installing 21 25 22 - If you're installing Fourmi, please take a look at our [installation guide](...) 23 - on our wiki. When you've installed the application, make sure to check our 24 - [usage guide](...). 26 + If you're installing Fourmi, please take a look at our installation guides 27 + on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our 28 + usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI). 25 29 26 30 ### Using the Source 27 31 28 32 To use the Fourmi source code multiple dependencies are required. Take a look at 29 - the [wiki page](...) on using the application source code for a step by step 33 + our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step 30 34 installation guide. 31 35 32 36 When developing for the Fourmi project keep in mind that code readability is a 33 37 must. To maintain the readability, code should be conform with the 34 38 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python 35 39 code. More information about the different structures and principles of the 36 - Fourmi application can be found on our [wiki](...). 40 + Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki). 37 41 38 42 ### To Do 39 43 ··· 41 45 42 46 __Main goals:__ 43 47 44 - - Improve our documentation and guides. (Assignee: Dekker) 45 48 - Build an graphical user interface(GUI) as alternative for the command line 46 49 interface(CLI). (Assignee: Harmen) 47 50 - Compiling the source into an windows executable. (Assignee: Bas) 48 - - Create an configuration file to hold logins and API keys. 49 - - Determine reliability of our data point. 50 - - Create an module to gather data from NIST. (Assignee: Rob) 51 - - Create an module to gather data from PubChem. (Assignee: Nout) 52 51 53 52 __Side goals:__ 54 53
+108
SIGNED.md
··· 1 + ##### Signed by https://keybase.io/jdekker 2 + ``` 3 + -----BEGIN PGP SIGNATURE----- 4 + Version: GnuPG v1.4.11 (GNU/Linux) 5 + 6 + iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa 7 + flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt 8 + ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+ 9 + +mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U 10 + IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2 11 + Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb 12 + RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq 13 + WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ 14 + vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH 15 + V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK 16 + TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD 17 + jUY6EwEuOzDCexWP4eM8 18 + =h6TK 19 + -----END PGP SIGNATURE----- 20 + 21 + ``` 22 + 23 + <!-- END SIGNATURES --> 24 + 25 + ### Begin signed statement 26 + 27 + #### Expect 28 + 29 + ``` 30 + size exec file contents 31 + ./ 32 + 412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614 33 + 548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546 34 + 846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed 35 + FourmiCrawler/ 36 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 37 + 304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 38 + 2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 39 + 677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94 40 + sources/ 41 + 12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce 42 + 12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee 43 + 6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481 44 + 6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870 45 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 46 + 1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc 47 + 3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3 48 + GUI/ 49 + 11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3 50 + 940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271 51 + 8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5 52 + 299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483 53 + 1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c 54 + 3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac 55 + 3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a 56 + 261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 57 + 416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70 58 + tests/ 59 + 107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670 60 + 2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d 61 + 1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8 62 + 1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 63 + 1260 
test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 64 + 2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 65 + utils/ 66 + 40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a 67 + 4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5 68 + 2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 69 + ``` 70 + 71 + #### Ignore 72 + 73 + ``` 74 + /SIGNED.md 75 + ``` 76 + 77 + #### Presets 78 + 79 + ``` 80 + git # ignore .git and anything as described by .gitignore files 81 + dropbox # ignore .dropbox-cache and other Dropbox-related files 82 + kb # ignore anything as described by .kbignore files 83 + ``` 84 + 85 + <!-- summarize version = 0.0.9 --> 86 + 87 + ### End signed statement 88 + 89 + <hr> 90 + 91 + #### Notes 92 + 93 + With keybase you can sign any directory's contents, whether it's a git repo, 94 + source code distribution, or a personal documents folder. It aims to replace the drudgery of: 95 + 96 + 1. comparing a zipped file to a detached statement 97 + 2. downloading a public key 98 + 3. confirming it is in fact the author's by reviewing public statements they've made, using it 99 + 100 + All in one simple command: 101 + 102 + ```bash 103 + keybase dir verify 104 + ``` 105 + 106 + There are lots of options, including assertions for automating your checks. 107 + 108 + For more info, check out https://keybase.io/docs/command_line/code_signing
+38 -44
fourmi.py
··· 1 1 #!/usr/bin/env python 2 2 """ 3 - Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms). 3 + Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms). 4 4 5 5 Usage: 6 + fourmi 6 7 fourmi search <compound> 7 8 fourmi [options] search <compound> 8 - fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 9 + fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 9 10 fourmi list 10 11 fourmi [--include=<sourcename> | --exclude=<sourcename>] list 11 12 fourmi -h | --help ··· 15 16 --attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*] 16 17 -h --help Show this screen. 17 18 --version Show version. 18 - --verbose Verbose logging output. 19 + -v Verbose logging output. (Multiple occurrences increase logging level) 19 20 --log=<file> Save log to a file. 20 - -o <file> --output=<file> Output file [default: result.*format*] 21 - -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] 21 + -o <file> --output=<file> Output file [default: <compound>.*format*] 22 + -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv] 22 23 --include=<regex> Include only sources that match these regular expressions split by a comma. 23 24 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma. 24 25 """ 25 26 26 27 from twisted.internet import reactor 27 28 from scrapy.crawler import Crawler 28 - from scrapy import log, signals 29 - from scrapy.utils.project import get_project_settings 29 + from scrapy import signals, log 30 30 import docopt 31 31 32 32 from FourmiCrawler.spider import FourmiSpider 33 - from sourceloader import SourceLoader 33 + from utils.configurator import Configurator 34 + from utils.sourceloader import SourceLoader 35 + from GUI import gui 34 36 35 37 36 - def setup_crawler(searchable, settings, source_loader, attributes): 37 - spider = FourmiSpider(compound=searchable, selected_attributes=attributes) 38 - spider.add_parsers(source_loader.sources) 38 + def setup_crawler(compound, settings, source_loader, attributes): 39 + """ 40 + This function prepares and starts the crawler, which performs the actual search on the internet. 41 + :param compound: The compound that should be searched for 42 + :param settings: A scrapy settings object 43 + :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used. 44 + :param attributes: A list of regular expressions which the attribute names should match. 45 + """ 46 + spider = FourmiSpider(compound=compound, selected_attributes=attributes) 47 + spider.add_sources(source_loader.sources) 39 48 crawler = Crawler(settings) 40 49 crawler.signals.connect(reactor.stop, signal=signals.spider_closed) 41 50 crawler.configure() ··· 43 52 crawler.start() 44 53 45 54 46 - def scrapy_settings_manipulation(docopt_arguments): 47 - settings = get_project_settings() 48 - # [todo] - add at least a warning for files that already exist 49 - if docopt_arguments["--output"] != 'result.*format*': 50 - settings.overrides["FEED_URI"] = docopt_arguments["--output"] 51 - elif docopt_arguments["--format"] == "jsonlines": 52 - settings.overrides["FEED_URI"] = "results.json" 53 - elif docopt_arguments["--format"] is not None: 54 - settings.overrides["FEED_URI"] = "results." 
+ docopt_arguments["--format"] 55 - 56 - if docopt_arguments["--format"] is not None: 57 - settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"] 58 - 59 - return settings 60 - 61 - 62 - def start_log(docopt_arguments): 63 - if docopt_arguments["--log"] is not None: 64 - if docopt_arguments["--verbose"]: 65 - log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) 66 - else: 67 - log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING) 68 - else: 69 - if docopt_arguments["--verbose"]: 70 - log.start(logstdout=False, loglevel=log.DEBUG) 71 - else: 72 - log.start(logstdout=True, loglevel=log.WARNING) 73 - 74 - 75 55 def search(docopt_arguments, source_loader): 76 - start_log(docopt_arguments) 77 - settings = scrapy_settings_manipulation(docopt_arguments) 78 - setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(',')) 56 + """ 57 + The function that facilitates the search for a specific compound. 58 + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 59 + :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 60 + """ 61 + conf = Configurator() 62 + conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) 63 + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"]) 64 + setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, 65 + source_loader, docopt_arguments["--attributes"].split(',')) 66 + if conf.scrapy_settings.getbool("LOG_ENABLED"): 67 + log.start(conf.scrapy_settings.get("LOG_FILE"), 68 + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 79 69 reactor.run() 80 70 81 71 72 + # The start for the Fourmi Command Line interface. 82 73 if __name__ == '__main__': 83 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') 74 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0') 84 75 loader = SourceLoader() 85 76 86 77 if arguments["--include"]: ··· 93 84 elif arguments["list"]: 94 85 print "-== Available Sources ==-" 95 86 print str(loader) 87 + else: 88 + gui_window = gui.GUI(search, sourceloader=SourceLoader()) 89 + gui_window.run()
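Because the usage string above is parsed by docopt, the new repeated `-v` flag and the format option can be checked directly from an interpreter. A small sketch, using an example compound name and example flags:

```python
# Small sketch (Python 2): parsing example CLI arguments with docopt, as the
# __main__ block above does. "methane" and the chosen flags are placeholders.
import docopt

import fourmi

arguments = docopt.docopt(fourmi.__doc__, argv=['-vv', '--format=json', 'search', 'methane'])
print arguments['<compound>']   # 'methane'
print arguments['-v']           # 2 -> Configurator.set_logging maps this to the INFO level
print arguments['--format']     # 'json'
```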
-41
sourceloader.py
··· 1 - import inspect 2 - import os 3 - import re 4 - from FourmiCrawler.sources.source import Source 5 - 6 - 7 - class SourceLoader: 8 - sources = [] 9 - 10 - def __init__(self, rel_dir="FourmiCrawler/sources"): 11 - path = os.path.dirname(os.path.abspath(__file__)) 12 - path += "/" + rel_dir 13 - known_parser = set() 14 - 15 - for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 16 - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) 17 - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 18 - for cls in classes: 19 - if issubclass(cls, Source) and cls not in known_parser: 20 - self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? 21 - known_parser.add(cls) 22 - 23 - def include(self, source_names): 24 - new = set() 25 - for name in source_names: 26 - new.update([src for src in self.sources if re.match(name, src.__class__.__name__)]) 27 - self.sources = list(new) 28 - 29 - def exclude(self, source_names): 30 - exclude = [] 31 - for name in source_names: 32 - exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)]) 33 - self.sources = [src for src in self.sources if src not in exclude] 34 - 35 - def __str__(self): 36 - string = "" 37 - for src in self.sources: 38 - string += "Source: " + src.__class__.__name__ 39 - string += " - " 40 - string += "URI: " + src.website + "\n" 41 - return string
+19
sources.cfg.sample
··· 1 + [DEFAULT] 2 + reliability = Unknown 3 + 4 + #For each source listed in FourmiCrawler/sources there should be a section 5 + #named exactly as the filename in here. If not present, the DEFAULT value is 6 + #used for reliability of that source. 7 + 8 + [ChemSpider] 9 + reliability = High 10 + #token=Paste ChemSpider API token here and remove the hashtag 11 + 12 + [NIST] 13 + reliability = High 14 + 15 + [WikipediaParser] 16 + reliability = Medium 17 + 18 + [PubChem] 19 + reliability = High
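These sections are consumed by utils/configurator.py further down in this changeset. A short sketch of the fallback behaviour for a source that has no section of its own ('SomeNewSource' is a hypothetical name used only for illustration):

```python
# Short sketch (Python 2): sources without their own section fall back to the
# [DEFAULT] reliability value, as implemented by Configurator.get_section.
from utils.configurator import Configurator

config = Configurator.read_sourceconfiguration()   # reads sources.cfg, creating it from the sample if needed
section = Configurator.get_section(config, 'ChemSpider')
print section['reliability']                       # 'High' with the sample values

section = Configurator.get_section(config, 'SomeNewSource')  # hypothetical source with no section
print section['reliability']                       # 'Unknown', taken from [DEFAULT]
```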
+6
tests/__init__.py
··· 1 + import test_configurator 2 + import test_gui 3 + import test_pipeline 4 + import test_sourceloader 5 + import test_spider 6 +
+68
tests/test_configurator.py
··· 1 + import unittest 2 + import ConfigParser 3 + 4 + from utils.configurator import Configurator 5 + 6 + 7 + class TestConfigurator(unittest.TestCase): 8 + 9 + def setUp(self): 10 + self.conf = Configurator() 11 + 12 + def test_set_output(self): 13 + self.conf.set_output(filename="test.txt", fileformat="csv", compound="test") 14 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") 15 + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 16 + 17 + self.conf.set_output("<compound>.*format*", "jsonlines", "test") 18 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json") 19 + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") 20 + 21 + self.conf.set_output("<compound>.*format*", "csv", "test") 22 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv") 23 + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 24 + 25 + def test_start_log(self): 26 + for i in range(0, 3): 27 + self.conf.set_logging("TEST", i) 28 + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") 29 + if i > 0: 30 + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) 31 + if i > 1: 32 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) 33 + else: 34 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 35 + else: 36 + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) 37 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 38 + if i == 1: 39 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") 40 + elif i == 2: 41 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") 42 + elif i == 3: 43 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") 44 + 45 + self.conf.set_logging(verbose=i) 46 + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) 47 + 48 + def test_read_sourceconfiguration(self): 49 + config = self.conf.read_sourceconfiguration() 50 + self.assertIsInstance(config, ConfigParser.ConfigParser) 51 + 52 + def test_get_section(self): 53 + config = ConfigParser.ConfigParser() 54 + section = self.conf.get_section(config, 'test') 55 + self.assertIn('reliability', section) 56 + self.assertEquals(section['reliability'], '') 57 + 58 + config.set('DEFAULT', 'reliability', 'Low') 59 + 60 + section = self.conf.get_section(config, 'test') 61 + self.assertEquals(section['reliability'], 'Low') 62 + 63 + config.add_section('test') 64 + config.set('test', 'var', 'Maybe') 65 + 66 + section = self.conf.get_section(config, 'test') 67 + self.assertEquals(section['reliability'], 'Low') 68 + self.assertEqual(section['var'], 'Maybe')
+32
tests/test_gui.py
··· 1 + import unittest 2 + 3 + from GUI import gui 4 + 5 + class TestGUI(unittest.TestCase): 6 + def setUp(self): 7 + pass 8 + 9 + def test_empty_attributes(self): 10 + self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample", in_source=True) 11 + self.test_gui.window.after(9, self.test_gui.prepare_search) 12 + self.test_gui.window.after(11, self.test_gui.window.destroy) 13 + self.test_gui.run() 14 + 15 + output_type = self.test_gui.configurator.load_output_types().split(',')[0] 16 + 17 + self.assertEqual(self.test_gui.values.get('substance'), '') 18 + self.assertEqual(self.test_gui.values.get('output_type'), output_type) 19 + self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv') 20 + 21 + 22 + def test_no_configurations(self): 23 + self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample") 24 + self.test_gui.configurator = gui.ConfigImporter('') 25 + self.test_gui.finish_with_search = True 26 + self.test_gui.window.after(9, self.test_gui.prepare_search) 27 + self.test_gui.window.after(11, self.test_gui.window.destroy) 28 + self.test_gui.run() 29 + 30 + self.assertEqual(self.test_gui.values.get('substance'), '') 31 + self.assertEqual(self.test_gui.values.get('output_type'), 'csv') 32 + self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
+53
tests/test_pipeline.py
··· 1 + import copy 2 + import unittest 3 + 4 + from scrapy.exceptions import DropItem 5 + 6 + from FourmiCrawler import pipelines, spider, items 7 + 8 + 9 + class TestPipelines(unittest.TestCase): 10 + def setUp(self): 11 + self.testItem = items.Result() 12 + 13 + def test_none_pipeline(self): 14 + # Testing the pipeline that replaces the None values in items. 15 + self.testItem["value"] = "abc" 16 + self.testItem["source"] = None 17 + pipe = pipelines.RemoveNonePipeline() 18 + processed = pipe.process_item(self.testItem, spider.FourmiSpider()) 19 + 20 + self.assertTrue(processed["value"] == "abc") 21 + 22 + for key in self.testItem: 23 + self.assertIsNotNone(processed[key]) 24 + if key is not "value": 25 + self.assertIs(processed[key], "") 26 + 27 + def test_duplicate_pipeline(self): 28 + # Testing the pipeline that removes duplicates. 29 + self.testItem["attribute"] = "test" 30 + self.testItem["value"] = "test" 31 + self.testItem["conditions"] = "test" 32 + 33 + pipe = pipelines.DuplicatePipeline() 34 + self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem) 35 + self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider()) 36 + 37 + other_item = copy.deepcopy(self.testItem) 38 + other_item["value"] = "test1" 39 + self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item) 40 + 41 + def test_attribute_selection(self): 42 + # Testing the pipeline that selects attributes. 43 + item1 = copy.deepcopy(self.testItem) 44 + item2 = copy.deepcopy(self.testItem) 45 + 46 + item1["attribute"] = "abd" 47 + item2["attribute"] = "abc" 48 + 49 + s = spider.FourmiSpider(selected_attributes=["a.d"]) 50 + pipe = pipelines.AttributeSelectionPipeline() 51 + 52 + self.assertEqual(pipe.process_item(item1, s), item1) 53 + self.assertRaises(DropItem, pipe.process_item, item2, s)
+33
tests/test_sourceloader.py
··· 1 + import unittest 2 + 3 + from utils.sourceloader import SourceLoader 4 + 5 + 6 + class TestSourceloader(unittest.TestCase): 7 + def setUp(self): 8 + self.loader = SourceLoader() 9 + 10 + def test_init(self): 11 + # Test if sourceloader points to the right directory, where the sources are present. 12 + self.assertIn("Source: Source", str(self.loader)) 13 + self.assertIn("Source: NIST", str(self.loader)) 14 + self.assertIn("Source: ChemSpider", str(self.loader)) 15 + self.assertIn("Source: WikipediaParser", str(self.loader)) 16 + 17 + def test_include(self): 18 + # Tests for the include functionality. 19 + self.loader.include(["So.rc.*"]) 20 + 21 + self.assertIn("Source: Source", str(self.loader)) 22 + self.assertNotIn("Source: NIST", str(self.loader)) 23 + self.assertNotIn("Source: ChemSpider", str(self.loader)) 24 + self.assertNotIn("Source: WikipediaParser", str(self.loader)) 25 + 26 + def test_exclude(self): 27 + # Tests for the exclude functionality. 28 + self.loader.exclude(["So.rc.*"]) 29 + 30 + self.assertNotIn("Source: Source", str(self.loader)) 31 + self.assertIn("Source: NIST", str(self.loader)) 32 + self.assertIn("Source: ChemSpider", str(self.loader)) 33 + self.assertIn("Source: WikipediaParser", str(self.loader))
+63
tests/test_spider.py
··· 1 + import unittest 2 + 3 + from scrapy.http import Request 4 + 5 + from FourmiCrawler import spider 6 + from FourmiCrawler.sources.NIST import NIST 7 + from FourmiCrawler.sources.source import Source 8 + 9 + 10 + class TestFoumiSpider(unittest.TestCase): 11 + def setUp(self): 12 + self.compound = "test_compound" 13 + self.attributes = ["a.*", ".*a"] 14 + self.spi = spider.FourmiSpider(self.compound, self.attributes) 15 + 16 + def test_init(self): 17 + # Test the initiation of the Fourmi spider 18 + self.assertIn(self.compound, self.spi.synonyms) 19 + for attr in self.attributes: 20 + self.assertIn(attr, self.spi.selected_attributes) 21 + 22 + def test_add_source(self): 23 + # Testing the source adding function of the Fourmi spider 24 + src = Source() 25 + self.spi.add_source(src) 26 + self.assertIn(src, self.spi._sources) 27 + 28 + def test_add_sources(self): 29 + # Testing the function that adds multiple sources 30 + srcs = [Source(), Source(), Source()] 31 + self.spi.add_sources(srcs) 32 + 33 + for src in srcs: 34 + self.assertIn(src, self.spi._sources) 35 + 36 + def test_start_requests(self): 37 + # A test for the function that generates the start requests 38 + self.spi._sources = [] 39 + 40 + src = Source() 41 + self.spi.add_source(src) 42 + self.assertEqual(self.spi.start_requests(), []) 43 + 44 + src2 = NIST() 45 + self.spi.add_source(src2) 46 + requests = self.spi.start_requests() 47 + self.assertGreater(len(requests), 0) 48 + self.assertIsInstance(requests[0], Request) 49 + 50 + def test_synonym_requests(self): 51 + # A test for the synonym request function 52 + self.spi._sources = [] 53 + 54 + src = Source() 55 + self.spi.add_source(src) 56 + self.assertEqual(self.spi.get_synonym_requests("new_compound"), []) 57 + self.assertIn("new_compound", self.spi.synonyms) 58 + 59 + src2 = NIST() 60 + self.spi.add_source(src2) 61 + self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request) 62 + self.assertIn("other_compound", self.spi.synonyms) 63 + self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
+2
utils/__init__.py
··· 1 + import configurator 2 + import sourceloader
+101
utils/configurator.py
··· 1 + import ConfigParser 2 + import os 3 + import shutil 4 + 5 + from scrapy.utils.project import get_project_settings 6 + 7 + 8 + class Configurator: 9 + """ 10 + A helper class for the Fourmi applications. This class is used to process the settings as set 11 + by one of the Fourmi interfaces (CLI or GUI). 12 + """ 13 + 14 + def __init__(self): 15 + self.scrapy_settings = get_project_settings() 16 + 17 + def set_output(self, filename, fileformat, compound): 18 + """ 19 + This function manipulates the Scrapy output file settings that normally would be set in the settings file. 20 + In the Fourmi project these are command line arguments. 21 + :param filename: The name of the file the output will be written to. 22 + :param fileformat: The format in which the output will be written. 23 + """ 24 + 25 + if filename != '<compound>.*format*': 26 + self.scrapy_settings.overrides["FEED_URI"] = filename 27 + elif fileformat == "jsonlines": 28 + self.scrapy_settings.overrides["FEED_URI"] = compound + ".json" 29 + elif fileformat is not None: 30 + self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat 31 + 32 + if fileformat is not None: 33 + self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat 34 + 35 + def set_logging(self, logfile=None, verbose=0): 36 + """ 37 + This function changes the default settings of Scrapy's logging functionality 38 + using the settings given by the CLI. 39 + :param logfile: The location where the logfile will be saved. 40 + :param verbose: An integer value to switch between log levels. 41 + """ 42 + if verbose != 0: 43 + self.scrapy_settings.overrides["LOG_ENABLED"] = True 44 + else: 45 + self.scrapy_settings.overrides["LOG_ENABLED"] = False 46 + 47 + if verbose == 1: 48 + self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" 49 + elif verbose == 2: 50 + self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" 51 + else: 52 + self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" 53 + 54 + if verbose > 1: 55 + self.scrapy_settings.overrides["LOG_STDOUT"] = False 56 + else: 57 + self.scrapy_settings.overrides["LOG_STDOUT"] = True 58 + 59 + if logfile is not None: 60 + self.scrapy_settings.overrides["LOG_FILE"] = logfile 61 + else: 62 + self.scrapy_settings.overrides["LOG_FILE"] = None 63 + 64 + @staticmethod 65 + def read_sourceconfiguration(): 66 + """ 67 + This function reads sources.cfg in the main folder for configuration 68 + variables for sources 69 + :return: a ConfigParser object of sources.cfg 70 + """ 71 + current_dir = os.path.dirname(os.path.abspath(__file__)) 72 + config_path = current_dir + '/../sources.cfg' 73 + # [TODO]: location of sources.cfg should be softcoded eventually 74 + if not os.path.isfile(config_path): 75 + try: 76 + shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path) 77 + except IOError: 78 + print "WARNING: Source configuration couldn't be found and couldn't be created." 79 + config = ConfigParser.ConfigParser() 80 + config.read(config_path) 81 + return config 82 + 83 + @staticmethod 84 + def get_section(config, sourcename): 85 + """ 86 + This function reads the config section labeled by the variable sourcename and 87 + tests whether the reliability variable is set; if not, it is set to an empty string. 
88 + Returns the default section if the labeled config section does not exist. 89 + :param config: a ConfigParser object 90 + :param sourcename: the name of the section to be read 91 + :return: a dictionary of the section in the config labeled by sourcename 92 + """ 93 + section = dict() 94 + if config.has_section(sourcename): 95 + section = dict(config.items(sourcename)) 96 + elif config.defaults(): 97 + section = config.defaults() 98 + if 'reliability' not in section: 99 + print 'WARNING: Reliability not set for %s' % sourcename 100 + section['reliability'] = '' 101 + return section
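Taken together, the two setters mirror what the CLI passes in. A brief usage sketch with illustrative argument values ('benzene' is only an example compound):

```python
# Brief sketch (Python 2): how fourmi.py's search() drives the Configurator.
# The literal values stand in for parsed CLI arguments.
from utils.configurator import Configurator

conf = Configurator()
conf.set_logging(logfile='log.txt', verbose=2)             # INFO level, written to log.txt
conf.set_output('<compound>.*format*', 'csv', 'benzene')   # default name -> benzene.csv

print conf.scrapy_settings["FEED_URI"]          # 'benzene.csv'
print conf.scrapy_settings["FEED_FORMAT"]       # 'csv'
print conf.scrapy_settings.get("LOG_LEVEL")     # 'INFO'
```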
+64
utils/sourceloader.py
··· 1 + import inspect 2 + import os 3 + import re 4 + 5 + from FourmiCrawler.sources.source import Source 6 + from utils.configurator import Configurator 7 + 8 + 9 + class SourceLoader: 10 + sources = [] 11 + 12 + def __init__(self, rel_dir="../FourmiCrawler/sources"): 13 + """ 14 + The initiation of a SourceLoader, selects and indexes a directory for usable sources. 15 + Also loads a configuration file for Sources and passes the arguments in 16 + the named section to the source 17 + :param rel_dir: A relative path to a directory. 18 + """ 19 + path = os.path.dirname(os.path.abspath(__file__)) 20 + path += "/" + rel_dir 21 + known_parser = set() 22 + 23 + config = Configurator.read_sourceconfiguration() 24 + 25 + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 26 + mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) 27 + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 28 + for cls in classes: 29 + if issubclass(cls, Source) and cls not in known_parser: 30 + sourcecfg = Configurator.get_section(config, cls.__name__) 31 + self.sources.append(cls(sourcecfg)) 32 + known_parser.add(cls) 33 + 34 + def include(self, source_names): 35 + """ 36 + This function excludes all sources that don't match the given regular expressions. 37 + :param source_names: A list of regular expression (strings) 38 + """ 39 + new = set() 40 + for name in source_names: 41 + new.update([src for src in self.sources if re.match(name, src.__class__.__name__)]) 42 + self.sources = list(new) 43 + 44 + def exclude(self, source_names): 45 + """ 46 + This function excludes all sources that match the given regular expressions. 47 + :param source_names: A list of regular expression (strings) 48 + """ 49 + exclude = [] 50 + for name in source_names: 51 + exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)]) 52 + self.sources = [src for src in self.sources if src not in exclude] 53 + 54 + def __str__(self): 55 + """ 56 + This function returns a string with all sources currently available in the SourceLoader. 57 + :return: a string with all available sources. 58 + """ 59 + string = "" 60 + for src in self.sources: 61 + string += "Source: " + src.__class__.__name__ 62 + string += " - " 63 + string += "URI: " + src.website + "\n" 64 + return string
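A quick usage sketch of the loader's include/exclude filtering; the regular expression below is only an example:

```python
# Quick sketch (Python 2): filtering sources by class name before a search.
from utils.sourceloader import SourceLoader

loader = SourceLoader()               # indexes FourmiCrawler/sources and reads sources.cfg
print str(loader)                     # lists every discovered source and its URI

loader.include(['NIST|ChemSpider'])   # keep only sources whose class name matches the pattern
print str(loader)
```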