A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'release/v0.2.6'

+98 -1
+97
FourmiCrawler/sources/WikipediaParser.py
from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re


class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties

    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
    It also returns requests with other external sources which contain information on parsed subject.
    """

    website = "http://en.wikipedia.org/wiki/*"
    __spider = None
    searched_compounds = []

    def __init__(self):
        Source.__init__(self)

    def parse(self, response):
        """ Distributes the above described behaviour """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
        if compound in self.searched_compounds:
            return None
        else:
            items = self.parse_infobox(sel)
            self.searched_compounds.append(compound)
            return items

    def parse_infobox(self, sel):
        """ scrape data from infobox on wikipedia. """
        items = []

        # be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
            xpath('normalize-space(string())')
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
            item = Result({
                'attribute': prop_name.extract().encode('utf-8'),
                'value': prop_values[i].extract().encode('utf-8'),
                'source': "Wikipedia",
                'reliability': "",
                'conditions': ""
            })
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
        item_list = self.clean_items(items)

        identifiers = self.get_identifiers(sel)

        # add extra sources to scrape from as requests
        for i, identifier in enumerate(identifiers):
            request = None
            # discard internal wikipedia links
            if re.match('//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
            # fix links starting with '//www.'
            elif re.match('/{2}', identifier):
                identifier = re.sub("/{2}", "http://", identifier)
                request = Request(identifier)
            else:
                request = Request(identifier)
            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(request)

        return item_list

    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """ clean up properties using regex, makes it possible to split the values from the units """
        for item in items:
            value = item['value']
            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
            if m:
                item['value'] = m.group(1) + " K"
            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """ find external links, named 'Identifiers' to different sources. """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links
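For context, a small sanity check of the two clean-up regexes used in clean_items above. The sample value strings are invented for illustration (they are not taken from the commit or from any real Wikipedia page); they only mimic the "F; ... K" and "J K ... mol" patterns the parser targets.

import re

# Invented raw infobox values, shaped like the patterns clean_items expects
samples = ["-42 F; 232.0 K", "75.3 J K-1 mol-1 (solid)"]
for value in samples:
    m = re.search('F;\s(\d+[\.,]?\d*)', value)        # Kelvin reading that follows a Fahrenheit value
    if m:
        value = m.group(1) + " K"
    m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)   # J/K/mol style quantities
    if m:
        value = m.group(1) + " J/K/mol"
    print(value)
# Prints "232.0 K" and "75.3 J/K/mol"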
+1 -1
fourmi.py
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.5')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6')
     loader = SourceLoader()

     if arguments["--include"]:
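The only change to fourmi.py is the version string passed to docopt. A minimal standalone sketch (not Fourmi's actual usage docstring) of how that argument behaves: when --version is declared in the usage text, docopt prints the supplied version string and exits.

"""Usage: demo.py [--version]

Options:
  --version  Show the program version and exit.
"""
import docopt

if __name__ == '__main__':
    # Passing --version on the command line makes docopt print
    # the version string given here and exit immediately.
    arguments = docopt.docopt(__doc__, version='Demo - V0.2.6')
    print(arguments)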