A web scraper built to search for specific information on a given compound (and its pseudonyms)
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

These regular expressions were all wrong

+17 -14
+4 -3
FourmiCrawler/sources/ChemSpider.py
··· 1 + import re 2 + 1 3 from scrapy import log 2 4 from scrapy.http import Request 3 5 from scrapy.selector import Selector ··· 5 7 from source import Source 6 8 from FourmiCrawler.items import Result 7 9 8 - import re 9 10 10 11 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. 11 12 ··· 18 19 somewhere. 19 20 """ 20 21 21 - website = 'http://www.chemspider.com/*' 22 + website = 'http://www\.chemspider\.com/.*' 22 23 23 24 search = 'Search.asmx/SimpleSearch?query=%s&token=' 24 25 structure = 'Chemical-Structure.%s.html' ··· 292 293 """ 293 294 if compound in self.ignore_list or self.cfg['token'] == '': 294 295 return None 295 - searchurl = self.website[:-1] + self.search % compound 296 + searchurl = self.website[:-2] + self.search % compound 296 297 log.msg('chemspider compound', level=log.DEBUG) 297 298 return Request(url=searchurl, callback=self.parse_searchrequest)
+2 -2
FourmiCrawler/sources/NIST.py
··· 18 18 This plugin manages searching for a chemical on the NIST website 19 19 and parsing the resulting page if the chemical exists on NIST. 20 20 """ 21 - website = "http://webbook.nist.gov/*" 21 + website = "http://webbook\.nist\.gov/.*" 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 ··· 329 329 """ 330 330 if compound not in self.ignore_list: 331 331 self.ignore_list.update(compound) 332 - return Request(url=self.website[:-1] + self.search % compound, 332 + return Request(url=self.website[:-2] + self.search % compound, 333 333 callback=self.parse)
+7 -5
FourmiCrawler/sources/PubChem.py
··· 1 + import re 2 + 1 3 from scrapy.http import Request 2 4 from scrapy import log 3 - from source import Source 4 5 from scrapy.selector import Selector 6 + 7 + from source import Source 5 8 from FourmiCrawler.items import Result 6 - import re 7 9 8 10 9 11 class PubChem(Source): ··· 14 16 """ 15 17 16 18 #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 17 - website = 'https://*.ncbi.nlm.nih.gov/*' 18 - website_www = 'https://www.ncbi.nlm.nih.gov/*' 19 - website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' 19 + website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*' 20 + website_www = 'https://www.ncbi.nlm.nih.gov/.*' 21 + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*' 20 22 search = 'pccompound?term=%s' 21 23 data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' 22 24
+2 -2
FourmiCrawler/sources/WikipediaParser.py
··· 15 15 It also returns requests with other external sources which contain information on parsed subject. 16 16 """ 17 17 18 - website = "http://en.wikipedia.org/wiki/*" 18 + website = "http://en\.wikipedia\.org/wiki/.*" 19 19 __spider = None 20 20 searched_compounds = [] 21 21 ··· 123 123 return items 124 124 125 125 def new_compound_request(self, compound): 126 - return Request(url=self.website[:-1] + compound, callback=self.parse) 126 + return Request(url=self.website[:-2] + compound, callback=self.parse) 127 127 128 128 @staticmethod 129 129 def clean_items(items):
+2 -2
FourmiCrawler/sources/source.py
··· 3 3 4 4 5 5 class Source: 6 - website = "http://something/*" # Regex of URI's the source is able to parse 6 + website = "http://something/.*" # Regex of URI's the source is able to parse 7 7 _spider = None 8 8 9 9 def __init__(self, config=None): ··· 30 30 :param compound: A compound name. 31 31 :return: A new Scrapy Request 32 32 """ 33 - # return Request(url=self.website[:-1] + compound, callback=self.parse) 33 + # return Request(url=self.website[:-2] + compound, callback=self.parse) 34 34 pass 35 35 36 36 def set_spider(self, spider):