 #Python Specific ignores
 *.pyc
+#may contain authentication information
+sources.cfg
+#Another of our config files
+GUI.cfg
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
+23
.travis.yml
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+before_install:
+  - "export DISPLAY=:99.0"
+  - "sh -e /etc/init.d/xvfb start"
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+  - pip install Scrapy docopt
+  - pip install coveralls
+
+# command to run tests, e.g. python setup.py test
+script:
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
+
+notifications:
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+
+after_success:
+  coveralls --verbose
+20
Changelog.md
+### v0.6.0
+- Feature: Added a graphical user interface
+- Feature: Automatic config file creation from the config samples
+- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
+- FIX: A lot of bug fixes for the PubChem plugin, as it wasn't working as it should
+- FIX: Using absolute paths for configuration files
+- DEV: General code cleanup in the documentation
+
+### v0.5.3
+- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
+- FIX: Logging is now actually disabled if not using the verbose option.
+- FEATURE: Added support for PubChem
+
+### v0.5.2
+- FIX: The signature used to contain untracked and older files; the current signature should be correct.
+
+### v0.5.1
+- UPDATED: Logging functionality from the command line
+- DEV: Code cleanup and extra tests
-31
Fourmi.py
-#!/usr/bin/env python
-"""
-Fourmi - An internet webcrawler searching for information on chemical
-compounds. [todo] - Add some more useful text here.
-"""
-
-from twisted.internet import reactor
-from scrapy.crawler import Crawler
-from scrapy import log, signals
-from FourmiCrawler.spiders.Fourmispider import FourmiSpider
-from scrapy.utils.project import get_project_settings
-
-
-def setup_crawler(searchable):
-    # [TODO] - Initiate all parsers for the different websites and get
-    # allowed URLs.
-    spider = FourmiSpider(compound=searchable)
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
-
-
-def start():
-    setup_crawler("Methane")
-    log.start()
-    reactor.run()
-
-start()
+1-3
FourmiCrawler/items.py
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 from scrapy.item import Item, Field
FourmiCrawler/parsers/__init__.py
This is a binary file and will not be displayed.
-9
FourmiCrawler/parsers/parser.py
-from scrapy import log
-
-
-class Parser:
-    website = "http://localhost/*"
-
-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.Warning)
-        pass
+43-7
FourmiCrawler/pipelines.py
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 from scrapy.exceptions import DropItem
 
 
-class FourmiPipeline(object):
 
     def __init__(self):
         self.known_values = set()
 ···
         :param spider: The spider which scraped the spider
         :return: :raise DropItem: Returns the item if unique or drops them if it's already known
         """
-        value = item['attribute'], item['value']
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)
         else:
             self.known_values.add(value)
             return item
···1+# For more information on item pipelines, see the Scrapy documentation in:
2+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
3+import re
4+5from scrapy.exceptions import DropItem
678+class RemoveNonePipeline(object):
9+ def __init__(self):
10+ pass
11+12+ @staticmethod
13+ def process_item(item, spider):
14+ """
15+ Processing the items so None values are replaced by empty strings
16+ :param item: The incoming item
17+ :param spider: The spider which scraped the item
18+ :return: the item, with None values replaced by empty strings
19+ """
20+ for key in item:
21+ if item[key] is None:
22+ item[key] = ""
23+ return item
2425+26+class DuplicatePipeline(object):
27 def __init__(self):
28 self.known_values = set()
29···34 :param spider: The spider which scraped the item
35 :return: :raise DropItem: Returns the item if unique or drops them if it's already known
36 """
37+ value = (item['attribute'], item['value'], item['conditions'])
38 if value in self.known_values:
39+ raise DropItem("Duplicate item found: %s" % item) # [todo] append sources of first item.
40 else:
41 self.known_values.add(value)
42 return item
43+44+45+class AttributeSelectionPipeline(object):
46+ def __init__(self):
47+ pass
48+49+ @staticmethod
50+ def process_item(item, spider):
51+ """
52+ The items are processed using the selected attribute list available in the spider,
53+ items that don't match the selected items are dropped.
54+ :param item: The incoming item
55+ :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
56+ :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
57+ """
58+ if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
59+ return item
60+ else:
61+ raise DropItem("Attribute not selected by used: %s" % item)
+7-3
FourmiCrawler/settings.py
···3# For simplicity, this file contains only the most important settings by
4# default. All the other settings are documented here:
5#
6-# http://doc.scrapy.org/en/latest/topics/settings.html
7#
89BOT_NAME = 'FourmiCrawler'
···11SPIDER_MODULES = ['FourmiCrawler']
12NEWSPIDER_MODULE = 'FourmiCrawler'
13ITEM_PIPELINES = {
14- 'FourmiCrawler.pipelines.FourmiPipeline': 100
0015}
001617# Crawl responsibly by identifying yourself (and your website) on the
18# user-agent
1920-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
···3# For simplicity, this file contains only the most important settings by
4# default. All the other settings are documented here:
5#
6+# http://doc.scrapy.org/en/latest/topics/settings.html
7#
89BOT_NAME = 'FourmiCrawler'
···11SPIDER_MODULES = ['FourmiCrawler']
12NEWSPIDER_MODULE = 'FourmiCrawler'
13ITEM_PIPELINES = {
14+ "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
15+ 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
16+ 'FourmiCrawler.pipelines.DuplicatePipeline': 300,
17}
18+FEED_URI = 'results.json'
19+FEED_FORMAT = 'jsonlines'
2021# Crawl responsibly by identifying yourself (and your website) on the
22# user-agent
2324+USER_AGENT = 'Fourmi'
FourmiCrawler/sources/ChemSpider.py
1+import re
2+3+from scrapy import log
4+from scrapy.http import Request
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
12+13+class ChemSpider(Source):
14+ """
15+ ChemSpider scraper for synonyms and properties
16+ This parser will manage searching for chemicals through the
17+ ChemsSpider API, and parsing the resulting ChemSpider page.
18+ The token required for the API should be in a configuration file
19+ somewhere.
20+ """
21+22+ website = 'http://www\\.chemspider\\.com/.*'
23+24+ search = 'Search.asmx/SimpleSearch?query=%s&token='
25+ structure = 'Chemical-Structure.%s.html'
26+ extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
27+28+ def __init__(self, config=None):
29+ """
30+ Initialization of ChemSpider scraper
31+ :param config: a dictionary of settings for this scraper, must contain
32+ 'reliability' key
33+ """
34+ Source.__init__(self, config)
35+ self.ignore_list = []
36+ if 'token' not in self.cfg or self.cfg['token'] == '':
37+ log.msg('ChemSpider token not set or empty, search/MassSpec API '
38+ 'not available', level=log.WARNING)
39+ self.cfg['token'] = ''
40+ self.search += self.cfg['token']
41+ self.extendedinfo += self.cfg['token']
42+43+ def parse(self, response):
44+ """
45+ This function is called when a Response matching the variable
46+ 'website' is available for parsing the Response object.
47+ :param response: the Scrapy Response object to be parsed
48+ :return: a list of Result items and Request objects
49+ """
50+ sel = Selector(response)
51+ requests = []
52+ requests_synonyms = self.parse_synonyms(sel)
53+ requests.extend(requests_synonyms)
54+ requests_properties = self.parse_properties(sel)
55+ requests.extend(requests_properties)
56+57+ return requests
58+59+ def parse_properties(self, sel):
60+ """
61+ This function scrapes the Experimental Data and Predicted ACD/Labs tabs
62+ :param sel: a Selector object of the whole page
63+ :return: a list of Result items
64+ """
65+ properties = []
66+67+ properties.extend(self.parse_acdlabstab(sel))
68+ properties.extend(self.parse_experimentaldatatab(sel))
69+70+ return properties
71+72+ def parse_acdlabstab(self, sel):
73+ """
74+ This function scrapes the 'Predicted ACD/Labs tab' under Properties
75+ :param sel: a Selector object of the whole page
76+ :return: a list of Result items
77+ """
78+ properties = []
79+80+ td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
81+ 'normalize-space(string())')
82+ prop_names = td_list[::2]
83+ prop_values = td_list[1::2]
84+ for (prop_name, prop_value) in zip(prop_names, prop_values):
85+ # [:-1] is to remove the colon at the end, [TODO] - test for colon
86+ prop_name = prop_name.extract().encode('utf-8')[:-1]
87+ prop_value = prop_value.extract().encode('utf-8')
88+ prop_conditions = ''
89+90+ # Test for properties without values, with one hardcoded exception
91+ if (not re.match(r'^\d', prop_value) or
92+ (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
93+ continue
94+95+ m = re.match(r'(.*) \((.*)\)', prop_name)
96+ if m:
97+ prop_name = m.group(1)
98+ prop_conditions = m.group(2)
99+100+ m = re.match(r'(.*) at (.*)', prop_value)
101+ if m:
102+ prop_value = m.group(1)
103+ prop_conditions = m.group(2)
104+105+ new_prop = self.newresult(
106+ attribute=prop_name,
107+ value=prop_value,
108+ source='ChemSpider Predicted - ACD/Labs Tab',
109+ conditions=prop_conditions
110+ )
111+ properties.append(new_prop)
112+113+ return properties
114+115+ def parse_experimentaldatatab(self, sel):
116+ """
117+ This function scrapes Experimental Data tab, Physico-chemical
118+ properties in particular.
119+ :param sel: a Selector object of the whole page
120+ :return: a list of Result items
121+ """
122+ properties = []
123+124+ scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
125+ 'Properties"]//li/table/tr/td')
126+ if not scraped_list:
127+ return properties
128+ # Format is: property name followed by a list of values
129+ property_name = scraped_list.pop(0).xpath(
130+ 'span/text()').extract()[0].rstrip()
131+ for line in scraped_list:
132+ if line.xpath('span/text()'):
133+ property_name = line.xpath('span/text()').extract()[0].rstrip()
134+ else:
135+ new_prop = self.newresult(
136+ attribute=property_name[:-1],
137+ value=line.xpath('text()').extract()[0].rstrip(),
138+ source=line.xpath('strong/text()').extract()[0].rstrip(),
139+ )
140+ properties.append(new_prop)
141+142+ return properties
143+144+ def parse_synonyms(self, sel):
145+ """
146+ This function scrapes the list of Names and Identifiers
147+ :param sel: a Selector object of the whole page
148+ :return: a list of Requests
149+ """
150+ requests = []
151+ synonyms = []
152+153+ # Exact type for this is unknown, but equivalent to Validated by Expert
154+ for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
155+ name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
156+ synonyms.append(self.new_synonym(syn, name, 'expert'))
157+ # These synonyms are labeled by ChemSpider as "Validated by Experts"
158+ for syn in sel.xpath('//p[@class="syn"][strong]'):
159+ name = syn.xpath('strong/text()').extract()[0]
160+ synonyms.append(self.new_synonym(syn, name, 'expert'))
161+ # These synonyms are labeled by ChemSpider as "Validated by Users"
162+ for syn in sel.xpath(
163+ '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
164+ name = syn.xpath(
165+ 'span[@class="synonym_confirmed"]/text()').extract()[0]
166+ synonyms.append(self.new_synonym(syn, name, 'user'))
167+ # These synonyms are labeled as "Non-validated" and assumed unreliable
168+ for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
169+ name = syn.xpath('span[@class=""]/text()').extract()[0]
170+ synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))
171+172+ # [TODO] - confirm if English User-Validated synonyms are OK too
173+ for syn in synonyms:
174+ if syn['category'] == 'expert' and syn['language'] == 'English':
175+ log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
176+ self._spider.get_synonym_requests(syn['name'])
177+178+ return requests
179+180+ def new_synonym(self, sel, name, category):
181+ """
182+ This function scrapes for a single synonym at a given HTML tag
183+ :param sel: a Selector object of the given HTML tag
184+ :param name: the name of the synonym in the tag
185+ :param category: the name of the category the synonym is labeled as
186+ :return: a dictionary containing data on the synonym
187+ """
188+ self.ignore_list.append(name)
189+ language = sel.xpath('span[@class="synonym_language"]/text()')
190+ if language:
191+ # The [1:-1] is to remove brackets around the language name
192+ language = language.extract()[0][1:-1]
193+ else:
194+ # If language is not given, English is assumed, [TODO] - confirm
195+ language = 'English'
196+ log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
197+ level=log.DEBUG)
198+ references = []
199+ # A synonym can have multiple references, each optionally with link
200+ for ref in sel.xpath('span[@class="synonym_ref"]'):
201+ refname = ref.xpath('normalize-space(string())')
202+ references.append({
203+ 'name': refname.extract()[0][1:-1],
204+ 'URI': ''
205+ })
206+ for ref in sel.xpath('a[@class="synonym_ref"]'):
207+ references.append({
208+ 'name': ref.xpath('@title').extract()[0],
209+ 'URI': ref.xpath('@href').extract()[0]
210+ })
211+ for ref in references:
212+ log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
213+ level=log.DEBUG)
214+ synonym = {
215+ 'name': name,
216+ 'category': category,
217+ 'language': language,
218+ 'references': references
219+ }
220+ return synonym
221+222+ def parse_extendedinfo(self, response):
223+ """
224+ This function scrapes data from the ChemSpider GetExtendedCompoundInfo
225+ API, if a token is present in the configuration settings
226+ :param response: a Response object to be parsed
227+ :return: a list of Result items
228+ """
229+ sel = Selector(response)
230+ properties = []
231+ names = sel.xpath('*').xpath('name()').extract()
232+ values = sel.xpath('*').xpath('text()').extract()
233+ for (name, value) in zip(names, values):
234+ result = self.newresult(
235+ attribute=name,
236+ value=value, # These values have no unit!
237+ source='ChemSpider ExtendedCompoundInfo',
238+ )
239+ if result['value']:
240+ properties.append(result)
241+ return properties
242+243+ def newresult(self, attribute, value, conditions='', source='ChemSpider'):
244+ """
245+ This function abstracts from the Result item and provides default
246+ values.
247+ :param attribute: the name of the attribute
248+ :param value: the value of the attribute
249+ :param conditions: optional conditions regarding the value
250+ :param source: the name of the source if it is not ChemSpider
251+ :return: A Result item
252+ """
253+ return Result({
254+ 'attribute': attribute,
255+ 'value': value,
256+ 'source': source,
257+ 'reliability': self.cfg['reliability'],
258+ 'conditions': conditions
259+ })
260+261+ def parse_searchrequest(self, response):
262+ """
263+ This function parses the initial response of the ChemSpider Search API
264+ Requires a valid token to function.
265+ :param response: the Response object to be parsed
266+ :return: A Request for the information page and a Request for the
267+ extendedinfo API call
268+ """
269+ sel = Selector(response)
270+ log.msg('chemspider parse_searchrequest', level=log.DEBUG)
271+ sel.register_namespace('cs', 'http://www.chemspider.com/')
272+ csids = sel.xpath('.//cs:int/text()').extract()
273+ if len(csids) == 0:
274+ log.msg('ChemSpider found nothing', level=log.ERROR)
275+ return
276+ elif len(csids) > 1:
277+ log.msg('ChemSpider found multiple substances, taking first '
278+ 'element', level=log.DEBUG)
279+ csid = csids[0]
280+ structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
281+ extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
282+ log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
283+ return [Request(url=structure_url,
284+ callback=self.parse),
285+ Request(url=extendedinfo_url,
286+ callback=self.parse_extendedinfo)]
287+288+ def new_compound_request(self, compound):
289+ """
290+ This function is called when a new synonym is returned to the spider
291+ to generate new requests
292+ :param compound: the name of the compound to search for
293+ """
294+ if compound in self.ignore_list or self.cfg['token'] == '':
295+ return None
296+ searchurl = self.website[:-2].replace("\\", "") + self.search % compound
297+ log.msg('chemspider compound', level=log.DEBUG)
298+ return Request(url=searchurl, callback=self.parse_searchrequest)
FourmiCrawler/sources/NIST.py
1+import re
2+3+from scrapy import log
4+from scrapy.http import Request
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+# [TODO]: values can be '128.', perhaps remove the dot in that case?
12+# [TODO]: properties have references and comments which do not exist in the
13+# Result item, but should be included eventually.
14+15+class NIST(Source):
16+ """
17+ NIST Scraper plugin
18+ This plugin manages searching for a chemical on the NIST website
19+ and parsing the resulting page if the chemical exists on NIST.
20+ """
21+ website = "http://webbook\\.nist\\.gov/.*"
22+23+ search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
24+25+ def __init__(self, config=None):
26+ """
27+ Initialization of NIST scraper
28+ :param config: configuration variables for this scraper, must contain
29+ 'reliability' key.
30+ """
31+ Source.__init__(self, config)
32+ self.ignore_list = set()
33+34+ def parse(self, response):
35+ """
36+ This function is called when a Response matching the variable
37+ 'website' is available for parsing the Response object.
38+ :param response: The Scrapy Response object to be parsed
39+ :return: a list of Result items and Request objects
40+ """
41+ sel = Selector(response)
42+43+ title = sel.xpath('head/title/text()').extract()[0]
44+ if title == 'Name Not Found':
45+ log.msg('NIST: Chemical not found!', level=log.ERROR)
46+ return
47+ if title not in self.ignore_list:
48+ self.ignore_list.add(title)
49+ log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
50+ self._spider.get_synonym_requests(title)
51+52+ requests = []
53+54+ requests.extend(self.parse_generic_info(sel))
55+56+ symbol_table = {}
57+ tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
58+ for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
59+ symbol = ''.join(symbol_td.xpath('node()').extract())
60+ name = name_td.xpath('text()').extract()[0]
61+ symbol_table[symbol] = name
62+ log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
63+ level=log.DEBUG)
64+65+ requests.extend(self.parse_tables(sel, symbol_table))
66+67+ return requests
68+69+ def parse_tables(self, sel, symbol_table):
70+ """
71+ This function identifies and distributes parsing of tables to other
72+ functions below.
73+ :param sel: A Selector object of the whole page
74+ :param symbol_table: a dictionary containing translations of raw HTML
75+ tags to human readable names
76+ :return: a list of Result items and Requests
77+ """
78+ requests = []
79+80+ for table in sel.xpath('//table[@class="data"]'):
81+ summary = table.xpath('@summary').extract()[0]
82+ if summary == 'One dimensional data':
83+ log.msg('NIST table: Aggregate data', level=log.DEBUG)
84+ requests.extend(
85+ self.parse_aggregate_data(table, symbol_table))
86+ elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
87+ log.msg('NIST table; Enthalpy/entropy of phase transition',
88+ level=log.DEBUG)
89+ requests.extend(self.parse_transition_data(table, summary))
90+ elif table.xpath('tr[1]/td'):
91+ log.msg('NIST table: Horizontal table', level=log.DEBUG)
92+ elif summary == 'Antoine Equation Parameters':
93+ log.msg('NIST table: Antoine Equation Parameters',
94+ level=log.DEBUG)
95+ requests.extend(self.parse_antoine_data(table, summary))
96+ elif len(table.xpath('tr[1]/th')) == 5:
97+ log.msg('NIST table: generic 5 columns', level=log.DEBUG)
98+ # Symbol (unit) Temperature (K) Method Reference Comment
99+ requests.extend(self.parse_generic_data(table, summary))
100+ elif len(table.xpath('tr[1]/th')) == 4:
101+ log.msg('NIST table: generic 4 columns', level=log.DEBUG)
102+ # Symbol (unit) Temperature (K) Reference Comment
103+ requests.extend(self.parse_generic_data(table, summary))
104+ else:
105+ log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
106+ continue # Assume unsupported
107+ return requests
108+109+ def parse_generic_info(self, sel):
110+ """
111+ This function parses: synonyms, chemical formula, molecular weight,
112+ InChI, InChiKey, CAS number
113+ :param sel: A Selector object of the entire page in the original
114+ response
115+ :return: a list of Result items
116+ """
117+ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
118+119+ raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
120+ for synonym in raw_synonyms[0].strip().split(';\n'):
121+ log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
122+ self.ignore_list.add(synonym)
123+ self._spider.get_synonym_requests(synonym)
124+125+ data = {}
126+127+ raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
128+ data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
129+130+ raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
131+ data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
132+133+ raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
134+ data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
135+136+ raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
137+ '/tt/text()')
138+ data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
139+140+ raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
141+ data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
142+143+ requests = []
144+ for key, value in data.iteritems():
145+ result = self.newresult(
146+ attribute=key,
147+ value=value
148+ )
149+ requests.append(result)
150+151+ return requests
152+153+ def parse_aggregate_data(self, table, symbol_table):
154+ """
155+ This function parses the table(s) which contain possible links to
156+ individual data points
157+ :param table: a Selector object of the table to be parsed
158+ :param symbol_table: a dictionary containing translations of raw HTML
159+ tags to human readable names
160+ :return: a list of Result items and Request objects
161+ """
162+ results = []
163+ for tr in table.xpath('tr[td]'):
164+ extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
165+ '/a/@href').extract()
166+ if extra_data_url:
167+ request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
168+ callback=self.parse_individual_datapoints)
169+ results.append(request)
170+ continue
171+ data = []
172+ for td in tr.xpath('td'):
173+ data.append(''.join(td.xpath('node()').extract()))
174+175+ name = symbol_table[data[0]]
176+ condition = ''
177+178+ m = re.match(r'(.*) at (.*)', name)
179+ if m:
180+ name = m.group(1)
181+ condition = m.group(2)
182+183+ result = self.newresult(
184+ attribute=name,
185+ value=data[1] + ' ' + data[2],
186+ conditions=condition
187+ )
188+ log.msg('NIST: |%s|' % data, level=log.DEBUG)
189+ results.append(result)
190+ return results
191+192+ def parse_transition_data(self, table, summary):
193+ """
194+ This function parses the table containing properties regarding phase
195+ changes
196+ :param table: a Selector object of the table to be parsed
197+ :param summary: the name of the property
198+ :return: a list of Result items
199+ """
200+ results = []
201+202+ unit = self.get_unit(table)
203+204+ for tr in table.xpath('tr[td]'):
205+ tds = tr.xpath('td/text()').extract()
206+ result = self.newresult(
207+ attribute=summary,
208+ value=tds[0] + ' ' + unit,
209+ conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
210+ )
211+ results.append(result)
212+213+ return results
214+215+ def parse_generic_data(self, table, summary):
216+ """
217+ Parses the common tables of 4 and 5 rows. Assumes they are of the
218+ form:
219+ Symbol (unit)|Temperature (K)|Method|Reference|Comment
220+ Symbol (unit)|Temperature (K)|Reference|Comment
221+ :param table: a Selector object of the table to be parsed
222+ :param summary: the name of the property
223+ :return: a list of Result items
224+ """
225+ results = []
226+227+ unit = self.get_unit(table)
228+229+ for tr in table.xpath('tr[td]'):
230+ tds = tr.xpath('td/text()').extract()
231+ result = self.newresult(
232+ attribute=summary,
233+ value=tds[0] + ' ' + unit,
234+ conditions='%s K' % tds[1]
235+ )
236+ results.append(result)
237+ return results
238+239+ def parse_antoine_data(self, table, summary):
240+ """
241+ This function parses the table containing parameters for the Antoine
242+ equation
243+ :param table: a Selector object of the table to be parsed
244+ :param summary: the name of the property
245+ :return: a list of Result items
246+ """
247+ results = []
248+249+ for tr in table.xpath('tr[td]'):
250+ tds = tr.xpath('td/text()').extract()
251+ result = self.newresult(
252+ attribute=summary,
253+ value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
254+ conditions='%s K' % tds[0]
255+ )
256+ results.append(result)
257+258+ return results
259+260+ def parse_individual_datapoints(self, response):
261+ """
262+ This function parses the 'individual data points' page linked from
263+ the aggregate data table(s)
264+ :param response: the Scrapy Response object to be parsed
265+ :return: a list of Result items
266+ """
267+ sel = Selector(response)
268+ table = sel.xpath('//table[@class="data"]')[0]
269+270+ results = []
271+272+ name = table.xpath('@summary').extract()[0]
273+ condition = ''
274+ m = re.match(r'(.*) at (.*)', name)
275+ if m:
276+ name = m.group(1)
277+ condition = m.group(2)
278+279+ unit = self.get_unit(table)
280+281+ for tr in table.xpath('tr[td]'):
282+ tds = tr.xpath('td/text()').extract()
283+ uncertainty = ''
284+ m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
285+ if m:
286+ uncertainty = '+- %s ' % m.group(1)
287+ # [TODO]: get the plusminus sign working in here
288+ result = self.newresult(
289+ attribute=name,
290+ value='%s %s%s' % (tds[0], uncertainty, unit),
291+ conditions=condition
292+ )
293+ results.append(result)
294+295+ return results
296+297+ @staticmethod
298+ def get_unit(table):
299+ tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
300+ m = re.search(r'\((.*)\)', tr_unit)
301+ unit = '!'
302+ if m:
303+ unit = m.group(1)
304+305+ return unit
306+307+ def newresult(self, attribute, value, conditions=''):
308+ """
309+ This function abstracts from the Result item and provides default
310+ values
311+ :param attribute: the name of the attribute
312+ :param value: the value of the attribute
313+ :param conditions: optional conditions regarding the value
314+ :return: A Result item
315+ """
316+ return Result(
317+ {
318+ 'attribute': attribute,
319+ 'value': value,
320+ 'source': 'NIST',
321+ 'reliability': self.cfg['reliability'],
322+ 'conditions': conditions
323+ })
324+325+ def new_compound_request(self, compound):
326+ """
327+ This function is called when a new synonym is returned to the spider
328+ to generate new requests
329+ :param compound: the name of the compound to search for
330+ """
331+ if compound not in self.ignore_list:
332+ self.ignore_list.add(compound)
333+ return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
334+ callback=self.parse)
FourmiCrawler/sources/PubChem.py
1+import re
2+3+from scrapy.http import Request
4+from scrapy import log
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+class PubChem(Source):
12+ """ PubChem scraper for chemical properties
13+14+ This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
15+ including sources of the values of properties.
16+ """
18+ # PubChem has its data on compound names, properties and their values on different HTML pages, so different URLs are used
19+ website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
20+ website_www = 'http://www.ncbi.nlm.nih.gov/*'
21+ website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
22+ search = 'pccompound?term=%s'
23+ data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
24+25+ __spider = None
26+ searched_compounds = set()
27+28+ def __init__(self, config):
29+ Source.__init__(self, config)
30+ self.cfg = config
31+32+ def parse(self, response):
33+ """
34+ Parses the compound page, forwards found synonyms to the spider and requests the page with the property data
35+ :param response: The Response for a PubChem compound page
36+ :return: a list with a Request for the property data page, or None if the compound is already known
37+ """
38+ requests = []
39+ log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
40+41+ sel = Selector(response)
42+ compound = sel.xpath('//h1/text()').extract()[0]
43+ if compound in self.searched_compounds:
44+ return None
46+ self.searched_compounds.add(compound)
47+ raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
48+ for synonym in raw_synonyms.strip().split(', '):
49+ log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
50+ self.searched_compounds.add(synonym)
51+ self._spider.get_synonym_requests(synonym)
52+ log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
53+54+ n = re.search(r'cid=(\d+)', response.url)
55+ if n:
56+ cid = n.group(1)
57+ log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach
58+ # the separate HTML page which contains the properties and their values
59+60+ # using this cid to get the right url and scrape it
61+ requests.append(
62+ Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
63+ return requests
64+65+ def parse_data(self, response):
66+ """
67+ Parse data found in 'Chemical and Physical properties' part of a substance page.
68+ :param response: The response with the page to parse
69+ :return: requests: Returns a list of properties with their values, source, etc.
70+ """
71+ log.msg('parsing data', level=log.DEBUG)
72+ requests = []
73+74+ sel = Selector(response)
75+ props = sel.xpath('//div')
76+77+ for prop in props:
78+ prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
79+ if prop.xpath('a'): # parsing for single value in property
80+ prop_source = ''.join(prop.xpath('a/@title').extract())
81+ prop_value = ''.join(prop.xpath('a/text()').extract())
82+ new_prop = Result({
83+ 'attribute': prop_name,
84+ 'value': prop_value,
85+ 'source': prop_source,
86+ 'reliability': self.cfg['reliability'],
87+ 'conditions': ''
88+ })
89+ log.msg('PubChem prop: |%s| |%s| |%s|' %
90+ (new_prop['attribute'], new_prop['value'],
91+ new_prop['source']), level=log.DEBUG)
92+ requests.append(new_prop)
93+ elif prop.xpath('ul'): # parsing for multiple values (list) in property
94+ prop_values = prop.xpath('ul//li')
95+ for prop_li in prop_values:
96+ prop_value = ''.join(prop_li.xpath('a/text()').extract())
97+ prop_source = ''.join(prop_li.xpath('a/@title').extract())
98+ new_prop = Result({
99+ 'attribute': prop_name,
100+ 'value': prop_value,
101+ 'source': prop_source,
102+ 'reliability': self.cfg['reliability'],
103+ 'conditions': ''
104+ })
105+ log.msg('PubChem prop: |%s| |%s| |%s|' %
106+ (new_prop['attribute'], new_prop['value'],
107+ new_prop['source']), level=log.DEBUG)
108+ requests.append(new_prop)
109+110+ return requests
111+112+ def parse_searchrequest(self, response):
113+ """
114+ This function parses the response to the new_compound_request Request
115+ :param response: the Response object to be parsed
116+ :return: A Request for the compound page or what self.parse returns in
117+ case the search request forwarded to the compound page
118+ """
119+120+ # check if pubchem forwarded straight to compound page
121+ m = re.match(self.website_pubchem, response.url)
122+ if m:
123+ log.msg('PubChem search forwarded to compound page',
124+ level=log.DEBUG)
125+ return self.parse(response)
126+127+ sel = Selector(response)
128+129+ results = sel.xpath('//div[@class="rsltcont"]')
130+ if results:
131+ url = results[0].xpath('div/p/a[1]/@href')
132+ else:
133+ log.msg('PubChem search found nothing or xpath failed',
134+ level=log.DEBUG)
135+ return None
136+137+ if url:
138+ url = 'http:' + ''.join(url[0].extract())
139+ log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
140+ else:
141+ log.msg('PubChem search found results, but no url in first result',
142+ level=log.DEBUG)
143+ return None
144+145+ return Request(url=url, callback=self.parse)
146+147+ def new_compound_request(self, compound):
148+ return Request(url=self.website_www[:-1] + self.search % compound,
149+ callback=self.parse_searchrequest)
FourmiCrawler/sources/WikipediaParser.py
1+import re
2+3+from scrapy.http import Request
4+from scrapy import log
5+from scrapy.selector import Selector
6+7+from source import Source
8+from FourmiCrawler.items import Result
9+10+11+class WikipediaParser(Source):
12+ """ Wikipedia scraper for chemical properties
13+14+ This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
15+ It also returns requests with other external sources which contain information on parsed subject.
16+ """
17+18+ website = "http://en\\.wikipedia\\.org/wiki/.*"
19+ __spider = None
20+ searched_compounds = []
21+22+ def __init__(self, config=None):
23+ Source.__init__(self, config)
24+25+ def parse(self, response):
26+ """
27+ Distributes the above described behaviour
28+ :param response: The incoming search request
29+ :return: Returns the found properties if response is unique or returns none if it's already known
30+ """
31+ log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
32+ sel = Selector(response)
33+ compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
34+ if compound in self.searched_compounds:
35+ return None
36+ else:
37+ items = self.parse_infobox(sel)
38+ self.searched_compounds.append(compound)
39+ return items
40+41+ def parse_infobox(self, sel):
42+ """
43+ Scrape data from infobox on wikipedia.
44+45+ Data from two types of infoboxes, class="infobox bordered" and class="infobox", is scraped.
46+ :param sel: The selector with the html-information of the page to parse
47+ :return: item_list: Returns a list of properties with their values, source, etc..
48+ """
49+50+ items = []
51+52+ # scrape the chembox (wikipedia template)
53+ items = self.parse_chembox(sel, items)
54+55+ # scrape the drugbox (wikipedia template)
56+ items = self.parse_drugbox(sel, items)
57+58+ items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
59+ item_list = self.clean_items(items)
60+61+ identifiers = self.get_identifiers(sel)
62+63+ #add extra sources to scrape from as requests
64+ for i, identifier in enumerate(identifiers):
65+ request = None
66+ #discard internal wikipedia links
67+ if re.match('//en\.wikipedia', identifier):
68+ log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
69+ #fix links starting with '//www.'
70+ elif re.match('/{2}', identifier):
71+ identifier = re.sub("/{2}", "http://", identifier)
72+ request = Request(identifier)
73+ else:
74+ request = Request(identifier)
75+ log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
76+ item_list.append(request)
77+78+ return item_list
79+80+ def parse_chembox(self, sel, items):
81+ """
82+ Scrape data from chembox infobox on wikipedia.
83+84+ :param sel: The selector with the html-information of the page to parse
85+ :param items: the list of items where the result have to be stored in
86+ :return: items: the list of items with the new found and stored items
87+ """
88+ tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
89+ xpath('normalize-space(string())')
90+ prop_names = tr_list[::2]
91+ prop_values = tr_list[1::2]
92+ for i, prop_name in enumerate(prop_names):
93+ item = self.newresult(
94+ attribute=prop_name.extract().encode('utf-8'),
95+ value=prop_values[i].extract().encode('utf-8')
96+ )
97+ items.append(item)
98+ log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
99+ return items
100+101+ def parse_drugbox(self, sel, items):
102+ """
103+ Scrape data from drugbox infobox on wikipedia.
104+105+ :param sel: The selector with the html-information of the page to parse
106+ :param items: the list of items where the result have to be stored in
107+ :return: items: the list of items with the new found and stored items
108+ """
109+ tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
110+ log.msg('dit: %s' % tr_list2, level=log.DEBUG)
111+ for tablerow in tr_list2:
112+ log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
113+ if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
114+ 'normalize-space(string())'):
115+ item = self.newresult(
116+ attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
117+ value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
118+ )
119+ items.append(item)
120+ log.msg(
121+ 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
122+ level=log.DEBUG)
123+ return items
124+125+ def new_compound_request(self, compound):
126+ return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
127+128+ @staticmethod
129+ def clean_items(items):
130+131+ """
132+ Clean up properties using regex, makes it possible to split the values from the units
133+134+ Almost not in use, only cleans J/K/mol values and boiling/melting points.
135+136+ :param items: List of properties with their values, source, etc..
137+ :return: items: List of now cleaned up items
138+ """
139+ for item in items:
140+ value = item['value']
141+ m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
142+ if m:
143+ item['value'] = m.group(1) + " K"
144+ m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values
145+ if m:
146+ item['value'] = m.group(1) + " J/K/mol"
147+ return items
148+149+ @staticmethod
150+ def get_identifiers(sel):
151+ """
152+ Find external links, named 'Identifiers' to different sources.
153+154+ :param sel: The selector with the html-information of the page to parse
155+ :return: links: New links which can be used to expand the crawlers search
156+ """
157+ links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
158+ '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
159+ return links
160+161+ def newresult(self, attribute, value):
162+ return Result(
163+ {
164+ 'attribute': attribute,
165+ 'value': value,
166+ 'source': 'Wikipedia',
167+ 'reliability': self.cfg['reliability'],
168+ 'conditions': ''
169+ })
FourmiCrawler/sources/__init__.py
This is a binary file and will not be displayed.
+41
FourmiCrawler/sources/source.py
···1+from scrapy import log
2+# from scrapy.http import Request
3+4+5+class Source:
6+ website = "http://something/.*" # Regex of URI's the source is able to parse
7+ _spider = None
8+9+ def __init__(self, config=None):
10+ """
11+ Initiation of a new Source
12+ """
13+ self.cfg = {}
14+ if config is not None:
15+ self.cfg = config
16+ pass
17+18+ def parse(self, response):
19+ """
20+ This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
21+ :param response: A Scrapy Response object
22+ :return: A list of Result items and new Scrapy Requests
23+ """
24+ log.msg("The parse function of the empty source was used.", level=log.WARNING)
25+ pass
26+27+ def new_compound_request(self, compound):
28+ """
29+ This function should return a Scrapy Request for the given compound request.
30+ :param compound: A compound name.
31+ :return: A new Scrapy Request
32+ """
33+ # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
34+ pass
35+36+ def set_spider(self, spider):
37+ """
38+ A Function to save the associated spider.
39+ :param spider: A FourmiSpider object
40+ """
41+ self._spider = spider
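Source is the extension point for new scrapers: each plugin in FourmiCrawler/sources overrides `website`, `parse` and `new_compound_request`. Below is a minimal sketch of what such a plugin could look like; the class name, URL and attribute value are illustrative assumptions, not part of the project.

```python
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class ExampleSource(Source):
    # Hypothetical plugin for illustration; regex of the URIs this source can parse
    website = "http://example\\.org/.*"

    def parse(self, response):
        # Turn a matching page into Result items; 'reliability' comes from the
        # source's configuration, as in the real plugins above.
        sel = Selector(response)
        return [Result({
            'attribute': 'Page title',  # illustrative attribute
            'value': sel.xpath('//title/text()').extract()[0],
            'source': 'ExampleSource',
            'reliability': self.cfg.get('reliability', ''),
            'conditions': ''
        })]

    def new_compound_request(self, compound):
        # Same URL-building convention as the real sources: strip the regex ".*"
        return Request(url=self.website[:-2].replace("\\", "") + compound,
                       callback=self.parse)
```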
+72-7
FourmiCrawler/spider.py
 from scrapy.spider import Spider
 
 
 class FourmiSpider(Spider):
     name = "FourmiSpider"
 
-    def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
 
-    def parse(self, reponse):
-        # [TODO] - This function should delegate it's functionality to other
-        # parsers.
-        pass
 
-    def add_parser(self, parser):
-        self.parsers.add(parser)
···1+import re
2+3from scrapy.spider import Spider
4+from scrapy import log
567class FourmiSpider(Spider):
8+ """
9+ A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
10+ """
11 name = "FourmiSpider"
1213+ def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
14+ """
15+ Initiation of the Spider
16+ :param compound: compound that will be searched.
17+ :param selected_attributes: A list of regular expressions that the attributes should match.
18+ """
19+ self._sources = []
20+ self.synonyms = set()
21 super(FourmiSpider, self).__init__(*args, **kwargs)
22+ self.synonyms.add(compound)
23+ if selected_attributes is None:
24+ self.selected_attributes = [".*"]
25+ else:
26+ self.selected_attributes = selected_attributes
2728+ def parse(self, response):
29+ """
30+ The function that is called when a response to a request is available. This function distributes this to a
31+ source which should be able to handle parsing the data.
32+ :param response: A Scrapy Response object that should be parsed
33+ :return: A list of Result items and new Request to be handled by the scrapy core.
34+ """
35+ for source in self._sources:
36+ if re.match(source.website, response.url):
37+ log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
38+ return source.parse(response)
39+ log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
40+ return None
41+42+ def get_synonym_requests(self, compound, force=False):
43+ """
44+ A function that generates new Scrapy Request for each source given a new synonym of a compound.
45+ :param compound: A compound name
46+ :return: A list of Scrapy Request objects
47+ """
48+ requests = []
49+ if force or compound not in self.synonyms:
50+ self.synonyms.add(compound)
51+ for parser in self._sources:
52+ parser_requests = parser.new_compound_request(compound)
53+ if parser_requests is not None:
54+ requests.append(parser_requests)
55+ return requests
56+57+ def start_requests(self):
58+ """
59+ The function called by Scrapy for its first Requests
60+ :return: A list of Scrapy Request generated from the known synonyms using the available sources.
61+ """
62+ requests = []
63+ for synonym in self.synonyms:
64+ requests.extend(self.get_synonym_requests(synonym, force=True))
65+ return requests
66+67+ def add_sources(self, sources):
68+ """
69+ A function to add new Source objects to the list of available sources.
70+ :param sources: A list of Source Objects.
71+ """
72+ for parser in sources:
73+ self.add_source(parser)
7475+ def add_source(self, source):
76+ """
77+ A function to add a new Source object to the list of available sources.
78+ :param source: A Source Object
79+ """
80+ self._sources.append(source)
81+ source.set_spider(self)
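Taken together with the Source base class, the spider is wired up by handing it a compound and a list of source plugins. A minimal sketch, assuming the Wikipedia plugin lives in FourmiCrawler/sources/WikipediaParser.py and accepts a 'reliability' setting; the real entry point builds this through the SourceLoader and Configurator utilities:

```python
from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.WikipediaParser import WikipediaParser

# Illustrative wiring only; fourmi.py normally does this via SourceLoader.
spider = FourmiSpider(compound="Methane", selected_attributes=[".*"])
spider.add_sources([WikipediaParser(config={'reliability': 'Medium'})])
requests = spider.start_requests()  # one Request per known synonym per source
```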
···1+from Tkinter import *
2+import os
3+import shutil
import sys  # needed for sys.exit() below
4+from tkFileDialog import asksaveasfilename
5+6+from configImporter import *
7+8+9+class GUI():
10+ def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
11+ """Boots the window, configuration."""
12+ if not in_source:
13+ current_dir = os.path.dirname(os.path.abspath(__file__))
14+ config_file = os.path.join(current_dir, '..', config_file)
15+ if not os.path.isfile(config_file):
16+ try:
17+ shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
18+ except IOError:
19+ print "GUI configuration couldn't be found and couldn't be created."
20+ sys.exit()
21+ self.configurator = ConfigImporter(config_file)
22+ self.sourceloader = sourceloader
23+ self.finish_with_search = False
24+ self.values = {}
25+ self.required_variables = ['substance']
26+ self.search = search
27+ self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())
28+29+ def load_common_attributes(self):
30+ """Calls the configuration parser for common attributes."""
31+ return [x.strip() for x in self.configurator.load_common_attributes().split(',')]
32+33+ def load_output_types(self):
34+ """Calls the configuration parser for output types."""
35+ return [x.strip() for x in self.configurator.load_output_types().split(',')]
36+37+ def load_always_attributes(self):
38+ """Calls the configuration parser for attributes that are always used."""
39+ return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])
40+41+ def set_output(self):
42+ self.variable_output_name.set(asksaveasfilename())
43+ self.button_output_name.config(text=self.variable_output_name.get())
44+45+ def generate_window(self, common_attributes, output_types):
46+ """Creates all widgets and variables in the window."""
47+ window = Tk()
48+ window.wm_title("Fourmi Crawler")
49+50+ variables = {}
51+52+ variable_substance = StringVar(window)
53+ frame_substance = Frame(window)
54+ label_substance = Label(frame_substance, text="Substance: ")
55+ input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
56+ variables.update({"substance": variable_substance})
57+ frame_substance.pack(side=TOP)
58+ label_substance.pack()
59+ input_substance.pack()
60+ input_substance.focus()
61+62+ frame_all_attributes = Frame(window)
63+ frame_selecting_attributes = Frame(frame_all_attributes)
64+ frame_new_attributes = Frame(frame_selecting_attributes)
65+ label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
66+ input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
67+ variables.update({"new_attributes": input_new_attributes})
68+ frame_new_attributes.pack(side=LEFT)
69+ label_new_attributes.pack()
70+ input_new_attributes.pack()
71+72+ frame_common_attributes = Frame(frame_selecting_attributes)
73+ label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
74+ input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
75+ scrollbar_common_attributes = Scrollbar(frame_common_attributes)
76+ input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
77+ scrollbar_common_attributes.config(command=input_common_attributes.yview)
78+ if common_attributes and len(common_attributes) > 0:
79+ input_common_attributes.insert(END, *common_attributes)
80+ variables.update({"common_attributes": input_common_attributes})
81+ frame_common_attributes.pack(side=RIGHT)
82+ label_common_attributes.pack(side=TOP)
83+ input_common_attributes.pack(side=LEFT)
84+ scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
85+ frame_selecting_attributes.pack()
86+87+ frame_last = Frame(window)
88+ search_button = Button(frame_last, text="Start search", command=self.prepare_search)
89+ cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
90+ frame_last.pack(side=BOTTOM)
91+ search_button.pack(side=LEFT)
92+ cancel_button.pack(side=RIGHT)
93+94+ frame_name = Frame(window)
95+ frame_output_name = Frame(frame_name)
96+ label_output_name = Label(frame_output_name, text='Output file:')
97+ self.variable_output_name = StringVar()
98+ self.variable_output_name.set('results.csv')
99+ variables.update({'output_name':self.variable_output_name})
100+ self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
101+ frame_output_name.pack(side=LEFT)
102+ label_output_name.pack()
103+ self.button_output_name.pack()
104+ frame_name.pack(side=BOTTOM)
105+106+107+ frame_checkboxes = Frame(window)
108+ frame_checkbox_attributes = Frame(frame_checkboxes)
109+ variable_all_attributes = BooleanVar()
110+ variable_all_attributes.set(True)
111+ input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
112+ variable=variable_all_attributes)
113+ variables.update({"all_attributes": variable_all_attributes})
114+ frame_checkbox_attributes.pack(side=LEFT)
115+ input_all_attributes.pack()
116+117+ frame_logging = Frame(frame_checkboxes)
118+ variable_logging = BooleanVar()
119+ variable_logging.set(False)
120+ input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
121+ variables.update({'logging':variable_logging})
122+ frame_logging.pack(side=RIGHT)
123+ frame_checkboxes.pack(side=BOTTOM)
124+ input_logging.pack()
125+ frame_all_attributes.pack()
126+127+ return window, variables
128+129+ def prepare_search(self):
130+ """Saves the values from the window for later retrieval."""
131+ variables = self.variables
132+ values = {}
133+134+ values.update({"Always attributes": self.load_always_attributes()})
135+ for name, var in variables.iteritems():
136+ if var.__class__ is StringVar:
137+ values.update({name: var.get()})
138+ elif var.__class__ is BooleanVar:
139+ values.update({name: var.get()})
140+ elif var.__class__ is Text:
141+ values.update({name: str(var.get("1.0", END)).strip()})
142+ elif var.__class__ is Listbox:
143+ values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
144+ else:
145+ print "No known class, {}, {}".format(name, var)
146+147+ values.update({'output_name':self.variable_output_name.get()})
148+ values.update({'output_type':self.check_output_type(values.get('output_name'))})
149+150+ self.values = values
151+ if all([values.get(i) != '' for i in self.required_variables]):
152+ self.finish_with_search = True
153+ self.window.destroy()
154+ else:
155+ self.finish_with_search = False
156+ #tkMessageBox.showinfo('Not all required information was entered!')
157+158+ def execute_search(self):
159+ """Calls the Fourmi crawler with the values from the GUI"""
160+ if self.values.get('all_attributes'):
161+ attributes = ".*"
162+ else:
163+ attribute_types = ['new_attributes', 'common_attributes', 'Always attributes']  # keys used in self.values
164+ attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
165+ output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths
166+167+ arguments = {'--attributes': attributes,
168+ '--exclude': None,
169+ '--format': self.values.get('output_type'),
170+ '--help': False,
171+ '--include': None,
172+ '--log': 'log.txt',
173+ '--output': output_file,
174+ '-v': 0 if self.values.get('logging') else 3,
175+ '--version': False,
176+ '<compound>': self.values.get('substance'),
177+ 'list': False,
178+ 'search': True}
179+180+ self.search(arguments, self.sourceloader)
181+182+ def run(self):
183+ """Starts the window and the search."""
184+ self.window.mainloop()
185+ if self.finish_with_search:
186+ self.execute_search()
187+188+ def check_output_type(self, filename):
189+ parts = str(filename).split('.')
190+ output_types = self.load_output_types()
191+ extension = parts[-1]
192+193+ for type in output_types:
194+ if extension==type:
195+ return extension
196+ return output_types[0]
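A short sketch of how this window is meant to be launched; `search` here stands in for the crawl function that the CLI entry point passes in, which receives the docopt-style arguments dictionary and a source loader:

```python
def search(arguments, sourceloader):
    # Stand-in for the real crawl function supplied by the CLI entry point.
    print arguments

gui = GUI(search)  # reads the GUI.cfg configuration (sourceloader defaults to None)
gui.run()          # blocks in the Tk main loop, then executes the search
```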
+10
GUI.cfg.sample
···1+[GUI]
2+# Personalize options in your User Interface
3+4+# Commonly used parameters are listed in the GUI for easy selection
5+CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
6+7+# Parameters that are always used in the search
8+AlwaysParameters = Name
9+10+OutputTypes = csv, json, jsonlines, xml
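The GUI reads these comma-separated options through its ConfigImporter (load_common_attributes, load_always_attributes and load_output_types, as used above). A minimal sketch of the same read using only the Python 2 standard library, making no assumption about the project's own configImporter:

```python
import ConfigParser  # Python 2 standard library module

config = ConfigParser.ConfigParser()
config.read('GUI.cfg')

# Split the comma-separated option values into clean lists, as the GUI does.
common = [x.strip() for x in config.get('GUI', 'CommonParameters').split(',')]
always = [x.strip() for x in config.get('GUI', 'AlwaysParameters').split(',')]
output_types = [x.strip() for x in config.get('GUI', 'OutputTypes').split(',')]
```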
+21
LICENSE
···1+The MIT License (MIT)
2+3+Copyright (c) 2014 Ivo B. Rietveld
4+5+Permission is hereby granted, free of charge, to any person obtaining a copy
6+of this software and associated documentation files (the "Software"), to deal
7+in the Software without restriction, including without limitation the rights
8+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+copies of the Software, and to permit persons to whom the Software is
10+furnished to do so, subject to the following conditions:
11+12+The above copyright notice and this permission notice shall be included in all
13+copies or substantial portions of the Software.
14+15+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+SOFTWARE.
README.md
1+# Fourmi
2+3+**Master branch**: [Build Status](https://travis-ci.org/jjdekker/Fourmi) [Coverage Status](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
4+5+**Developing branch**: [Build Status](https://travis-ci.org/jjdekker/Fourmi) [Coverage Status](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
6+7+Fourmi is a web scraper for chemical substances. The program is designed to be
8+used as a search engine to search multiple chemical databases for a specific
9+substance. The program will produce all available attributes of the substance
10+and conditions associated with the attributes. Fourmi also attempts to estimate
11+the reliability of each data point to assist the user in deciding which data
12+should be used.
13+14+The Fourmi project is an open source project licensed under the MIT license. Feel
15+free to contribute!
16+17+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
18+web scraping framework for python. Most of the functionality of this project can
19+be traced to this framework. Should the documentation for this application fall
20+short, we suggest you take a close look at the [Scrapy architecture]
21+(http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy
22+documentation](http://doc.scrapy.org/en/latest/index.html).
23+24+### Installing
25+26+If you're installing Fourmi, please take a look at our installation guides
27+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
28+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
29+30+### Using the Source
31+32+To use the Fourmi source code multiple dependencies are required. Take a look at
33+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our step by step
34+installation guide.
35+36+When developing for the Fourmi project keep in mind that code readability is a
37+must. To maintain the readability, code should be conform with the
38+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
39+code. More information about the different structures and principles of the
40+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
41+42+### To Do
43+44+The Fourmi project has the following goals for the nearby future:
45+46+__Main goals:__
47+48+- Build a graphical user interface (GUI) as an alternative to the command line
49+interface (CLI). (Assignee: Harmen)
50+- Compile the source into a Windows executable. (Assignee: Bas)
51+52+__Side goals:__
53+54+- Clean and unify data.
55+- Extensive reliability analysis using statistical tests.
56+- Test data with Descartes 1.
57+58+### Project Origin
59+60+The Fourmi project was started in February of 2014 as part of a software
61+engineering course at the Radboud University for students studying Computer
62+Science, Information Science or Artificial Intelligence. Students participate in
63+a real software development project as part of the
64+[Giphouse](http://www.giphouse.nl/).
65+66+This particular project was started on behalf of Ivo B. Rietveld. As a chemist,
67+he was in need of an application to automatically search for information on chemical
68+substances and create a phase diagram. The so-called "Descartes" project was
69+split into two teams, each creating a different application covering part of the
70+functionality. We are team Descartes 2, and as we were responsible for
71+creating a web crawler, we've named our application Fourmi (English: ant).
72+73+The following people were part of the original team:
74+75+- [Jip J. Dekker](http://jip.dekker.li)
76+- Rob ten Berge
77+- Harmen Prins
78+- Bas van Berkel
79+- Nout van Deijck
80+- Michail Kuznetcov
-16
README.rst
···1-We are the team Descartes 2.
2-----------------------------
3-4-Our team members are:
5-6-+ Rob ten Berge
7-8-+ Bas van Berkel
9-10-+ Nout van Deijck
11-12-+ Jip J. Dekker
13-14-+ Michail Kuznetcov
15-16-+ Harmen Prins
+89
fourmi.py
···1+#!/usr/bin/env python
2+"""
3+Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
4+5+Usage:
6+ fourmi
7+ fourmi search <compound>
8+ fourmi [options] search <compound>
9+ fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
10+ fourmi list
11+ fourmi [--include=<sourcename> | --exclude=<sourcename>] list
12+ fourmi -h | --help
13+ fourmi --version
14+15+Options:
16+ --attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
17+ -h --help Show this screen.
18+ --version Show version.
19+ -v Verbose logging output. (Multiple occurrences increase logging level)
20+ --log=<file> Save the log to a file.
21+ -o <file> --output=<file> Output file [default: <compound>.*format*]
22+ -f <format> --format=<format> Output format (supported: csv, json, jsonlines, xml) [default: csv]
23+ --include=<regex> Include only sources that match these regular expressions split by a comma.
24+ --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
25+"""
26+27+from twisted.internet import reactor
28+from scrapy.crawler import Crawler
29+from scrapy import signals, log
30+import docopt
31+32+from FourmiCrawler.spider import FourmiSpider
33+from utils.configurator import Configurator
34+from utils.sourceloader import SourceLoader
35+from GUI import gui
36+37+38+def setup_crawler(compound, settings, source_loader, attributes):
39+ """
40+ This function prepares and starts the crawler, which performs the actual search on the internet.
41+ :param compound: The compound that should be searched for
42+ :param settings: A Scrapy settings object
43+ :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
44+ :param attributes: A list of regular expressions which the attribute names should match.
45+ """
46+ spider = FourmiSpider(compound=compound, selected_attributes=attributes)
47+ spider.add_sources(source_loader.sources)
48+ crawler = Crawler(settings)
49+ crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
50+ crawler.configure()
51+ crawler.crawl(spider)
52+ crawler.start()
53+54+55+def search(docopt_arguments, source_loader):
56+ """
57+ The function that facilitates the search for a specific compound.
58+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
59+ :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
60+ """
61+ conf = Configurator()
62+ conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
63+ conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
64+ setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
65+ source_loader, docopt_arguments["--attributes"].split(','))
66+ if conf.scrapy_settings.getbool("LOG_ENABLED"):
67+ log.start(conf.scrapy_settings.get("LOG_FILE"),
68+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
69+ reactor.run()
70+71+72+# The start for the Fourmi Command Line interface.
73+if __name__ == '__main__':
74+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
75+ loader = SourceLoader()
76+77+ if arguments["--include"]:
78+ loader.include(arguments["--include"].split(','))
79+ elif arguments["--exclude"]:
80+ loader.exclude(arguments["--exclude"].split(','))
81+82+ if arguments["search"]:
83+ search(arguments, loader)
84+ elif arguments["list"]:
85+ print "-== Available Sources ==-"
86+ print str(loader)
87+ else:
88+ gui_window = gui.GUI(search, sourceloader=SourceLoader())
89+ gui_window.run()
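For reference, a minimal sketch (not part of the diff) of how docopt parses a typical invocation against the usage string above; it assumes the snippet runs inside fourmi.py so that __doc__ is the module docstring shown, and the expected values follow from the [default: ...] annotations in the Options block.

```python
# Minimal sketch (illustrative): parsing "fourmi search Methane" with the docstring above.
import docopt

args = docopt.docopt(__doc__, argv=["search", "Methane"], version="Fourmi - V0.6.0")
# Expected, given the usage patterns and option defaults above:
#   args["search"]      -> True
#   args["<compound>"]  -> "Methane"
#   args["--format"]    -> "csv"                  (default)
#   args["--output"]    -> "<compound>.*format*"  (default; expanded by Configurator.set_output)
```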
+19
sources.cfg.sample
···0000000000000000000
···1+[DEFAULT]
2+reliability = Unknown
3+4+#For each source listed in FourmiCrawler/sources there should be a section
5+#named exactly as the filename in here. If not present, the DEFAULT value is
6+#used for reliability of that source.
7+8+[ChemSpider]
9+reliability = High
10+#token=Paste your ChemSpider API token here and remove the leading '#'
11+12+[NIST]
13+reliability = High
14+15+[WikipediaParser]
16+reliability = Medium
17+18+[PubChem]
19+reliability = High
+101
utils/configurator.py
···1+import ConfigParser
2+import os
3+import shutil
4+5+from scrapy.utils.project import get_project_settings
6+7+8+class Configurator:
9+ """
10+ A helper class for the Fourmi applications. This class is used to process the settings as set
11+ by one of the Fourmi interfaces (CLI or GUI).
12+ """
13+14+ def __init__(self):
15+ self.scrapy_settings = get_project_settings()
16+17+ def set_output(self, filename, fileformat, compound):
18+ """
19+ This function manipulates the Scrapy output file settings that normally would be set in the settings file.
20+ In the Fourmi project these are command line arguments.
21+ :param filename: The filename of the file where the output will be put.
22+ :param fileformat: The format in which the output will be.
23+ """
24+25+ if filename != '<compound>.*format*':
26+ self.scrapy_settings.overrides["FEED_URI"] = filename
27+ elif fileformat == "jsonlines":
28+ self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
29+ elif fileformat is not None:
30+ self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
31+32+ if fileformat is not None:
33+ self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
34+35+ def set_logging(self, logfile=None, verbose=0):
36+ """
37+ This function changes the default settings of Scrapy's logging functionality
38+ using the settings given by the CLI.
39+ :param logfile: The location where the logfile will be saved.
40+ :param verbose: An integer value to switch between log levels.
41+ """
42+ if verbose != 0:
43+ self.scrapy_settings.overrides["LOG_ENABLED"] = True
44+ else:
45+ self.scrapy_settings.overrides["LOG_ENABLED"] = False
46+47+ if verbose == 1:
48+ self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
49+ elif verbose == 2:
50+ self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
51+ else:
52+ self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
53+54+ if verbose > 1:
55+ self.scrapy_settings.overrides["LOG_STDOUT"] = False
56+ else:
57+ self.scrapy_settings.overrides["LOG_STDOUT"] = True
58+59+ if logfile is not None:
60+ self.scrapy_settings.overrides["LOG_FILE"] = logfile
61+ else:
62+ self.scrapy_settings.overrides["LOG_FILE"] = None
63+64+ @staticmethod
65+ def read_sourceconfiguration():
66+ """
67+ This function reads sources.cfg in the main folder for configuration
68+ variables for the sources.
69+ :return: a ConfigParser object of sources.cfg
70+ """
71+ current_dir = os.path.dirname(os.path.abspath(__file__))
72+ config_path = current_dir + '/../sources.cfg'
73+ # [TODO]: location of sources.cfg should be softcoded eventually
74+ if not os.path.isfile(config_path):
75+ try:
76+ shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
77+ except IOError:
78+ print "WARNING: Source configuration couldn't be found and couldn't be created."
79+ config = ConfigParser.ConfigParser()
80+ config.read(config_path)
81+ return config
82+83+ @staticmethod
84+ def get_section(config, sourcename):
85+ """
86+ This function reads the config section named by sourcename and
87+ tests whether the reliability variable is set; if not, it is set to an empty string.
88+ The DEFAULT section is returned if the named config section does not exist.
89+ :param config: a ConfigParser object
90+ :param sourcename: the name of the section to be read
91+ :return: a dictionary of the section in the config named by sourcename
92+ """
93+ section = dict()
94+ if config.has_section(sourcename):
95+ section = dict(config.items(sourcename))
96+ elif config.defaults():
97+ section = config.defaults()
98+ if 'reliability' not in section:
99+ print 'WARNING: Reliability not set for %s' % sourcename
100+ section['reliability'] = ''
101+ return section
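A minimal usage sketch (not part of the diff) of the two static helpers above; "SomeNewSource" is a hypothetical source name used only to show the fallback, and the reliability values assume a sources.cfg that matches the sample earlier in this diff.

```python
# Minimal sketch (illustrative): how the Configurator helpers resolve source settings.
from utils.configurator import Configurator

config = Configurator.read_sourceconfiguration()

chemspider = Configurator.get_section(config, "ChemSpider")
print chemspider["reliability"]   # "High" with the sample sources.cfg

missing = Configurator.get_section(config, "SomeNewSource")   # hypothetical, unlisted source
print missing["reliability"]      # falls back to the [DEFAULT] value, "Unknown"
```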
+64
utils/sourceloader.py
···1+import inspect
2+import os
3+import re
4+5+from FourmiCrawler.sources.source import Source
6+from utils.configurator import Configurator
7+8+9+class SourceLoader:
10+ sources = []
11+12+ def __init__(self, rel_dir="../FourmiCrawler/sources"):
13+ """
14+ The initiation of a SourceLoader selects and indexes a directory for usable sources.
15+ It also loads a configuration file for the sources and passes the arguments in
16+ the named section to each source.
17+ :param rel_dir: A relative path to a directory.
18+ """
19+ path = os.path.dirname(os.path.abspath(__file__))
20+ path += "/" + rel_dir
21+ known_parser = set()
22+23+ config = Configurator.read_sourceconfiguration()
24+25+ for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
26+ mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
27+ classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
28+ for cls in classes:
29+ if issubclass(cls, Source) and cls not in known_parser:
30+ sourcecfg = Configurator.get_section(config, cls.__name__)
31+ self.sources.append(cls(sourcecfg))
32+ known_parser.add(cls)
33+34+ def include(self, source_names):
35+ """
36+ This function keeps only the sources whose class names match the given regular expressions; all others are excluded.
37+ :param source_names: A list of regular expressions (strings)
38+ """
39+ new = set()
40+ for name in source_names:
41+ new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
42+ self.sources = list(new)
43+44+ def exclude(self, source_names):
45+ """
46+ This function excludes all sources that match the given regular expressions.
47+ :param source_names: A list of regular expressions (strings)
48+ """
49+ exclude = []
50+ for name in source_names:
51+ exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
52+ self.sources = [src for src in self.sources if src not in exclude]
53+54+ def __str__(self):
55+ """
56+ This function returns a string with all sources currently available in the SourceLoader.
57+ :return: a string with all available sources.
58+ """
59+ string = ""
60+ for src in self.sources:
61+ string += "Source: " + src.__class__.__name__
62+ string += " - "
63+ string += "URI: " + src.website + "\n"
64+ return string
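Finally, a minimal sketch (not part of the diff) of the include/exclude filtering, mirroring how fourmi.py drives the loader; the source names follow sources.cfg.sample, and the patterns are matched against source class names with re.match.

```python
# Minimal sketch (illustrative): filtering sources as the CLI's --include option does.
from utils.sourceloader import SourceLoader

loader = SourceLoader()
loader.include(["ChemSpider", "NIST"])   # keep only sources whose class names match these patterns
print str(loader)                         # e.g. "Source: ChemSpider - URI: http://..."
```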