···11+import re
22+13from scrapy import log
24from scrapy.http import Request
35from scrapy.selector import Selector
···57from source import Source
68from FourmiCrawler.items import Result
7988-import re
9101011# [TODO] - Maybe clean up usage of '.extract()[0]', because it can raise an IndexError when nothing is extracted.
1112···1819 somewhere.
1920 """
20212121- website = 'http://www.chemspider.com/*'
2222+ website = 'http://www\.chemspider\.com/.*'
22232324 search = 'Search.asmx/SimpleSearch?query=%s&token='
2425 structure = 'Chemical-Structure.%s.html'
···292293 """
293294 if compound in self.ignore_list or self.cfg['token'] == '':
294295 return None
295295- searchurl = self.website[:-1] + self.search % compound
296296+ searchurl = self.website[:-2] + self.search % compound
296297 log.msg('chemspider compound', level=log.DEBUG)
297298 return Request(url=searchurl, callback=self.parse_searchrequest)
+2-2
FourmiCrawler/sources/NIST.py
···1818 This plugin manages searching for a chemical on the NIST website
1919 and parsing the resulting page if the chemical exists on NIST.
2020 """
2121- website = "http://webbook.nist.gov/*"
2121+ website = "http://webbook\.nist\.gov/.*"
22222323 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
2424···329329 """
330330 if compound not in self.ignore_list:
331331 self.ignore_list.update(compound)
332332- return Request(url=self.website[:-1] + self.search % compound,
332332+ return Request(url=self.website[:-2] + self.search % compound,
333333 callback=self.parse)
+7-5
FourmiCrawler/sources/PubChem.py
···11+import re
22+13from scrapy.http import Request
24from scrapy import log
33-from source import Source
45from scrapy.selector import Selector
66+77+from source import Source
58from FourmiCrawler.items import Result
66-import re
79810911class PubChem(Source):
···1416 """
15171618 # PubChem keeps compound names, properties and their values on different HTML pages, so different URLs are used
1717- website = 'https://*.ncbi.nlm.nih.gov/*'
1818- website_www = 'https://www.ncbi.nlm.nih.gov/*'
1919- website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
1919+ website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
2020+ website_www = 'https://www.ncbi.nlm.nih.gov/.*'
2121+ website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
2022 search = 'pccompound?term=%s'
2123 data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
2224
+2-2
FourmiCrawler/sources/WikipediaParser.py
···1515 It also returns requests with other external sources which contain information on parsed subject.
1616 """
17171818- website = "http://en.wikipedia.org/wiki/*"
1818+ website = "http://en\.wikipedia\.org/wiki/.*"
1919 __spider = None
2020 searched_compounds = []
2121···123123 return items
124124125125 def new_compound_request(self, compound):
126126- return Request(url=self.website[:-1] + compound, callback=self.parse)
126126+ return Request(url=self.website[:-2] + compound, callback=self.parse)
127127128128 @staticmethod
129129 def clean_items(items):
+2-2
FourmiCrawler/sources/source.py
···334455class Source:
66- website = "http://something/*" # Regex of URI's the source is able to parse
66+ website = "http://something/.*" # Regex of URI's the source is able to parse
77 _spider = None
8899 def __init__(self, config=None):
···3030 :param compound: A compound name.
3131 :return: A new Scrapy Request
3232 """
3333- # return Request(url=self.website[:-1] + compound, callback=self.parse)
3333+ # return Request(url=self.website[:-2] + compound, callback=self.parse)
3434 pass
35353636 def set_spider(self, spider):