A web scraper built to search for specific information on a given compound (and its pseudonyms)
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties.

    This parser reads Wikipedia infoboxes (including the bordered chembox) to obtain
    properties and their values. It also returns requests for the external sources
    that contain further information on the parsed subject.
    """

    website = "http://en.wikipedia.org/wiki/*"
    __spider = None
    searched_compounds = []

    def __init__(self):
        Source.__init__(self)

    def parse(self, response):
        """ Parse the main page: scrape the infobox unless the compound was searched before. """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use the main page
        if compound in self.searched_compounds:
            return None
        items = self.parse_infobox(sel)
        self.searched_compounds.append(compound)
        return items

    def parse_infobox(self, sel):
        """ Scrape data from the infobox on Wikipedia. """
        items = []

        # scrape the chembox (Wikipedia template); its cells alternate between a
        # property name and a value, hence the even/odd split below
        td_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]') \
            .xpath('normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for i, prop_name in enumerate(prop_names):
            item = Result({
                'attribute': prop_name.extract().encode('utf-8'),
                'value': prop_values[i].extract().encode('utf-8'),
                'source': "Wikipedia",
                'reliability': "Unknown",
                'conditions': ""
            })
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)

        # scrape the drugbox (Wikipedia template); each row holds the property
        # name in a <th> and its value in a <td>
        drug_rows = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('drugbox rows: %s' % drug_rows, level=log.DEBUG)
        for tablerow in drug_rows:
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and \
                    tablerow.xpath('./td').xpath('normalize-space(string())'):
                item = Result({
                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'source': "Wikipedia",
                    'reliability': "Unknown",
                    'conditions': ""
                })
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)

        items = [item for item in items if item['value'] != '']  # remove items with an empty value
        item_list = self.clean_items(items)

        identifiers = self.get_identifiers(sel)

        # add the external sources found under 'Identifiers' as new requests
        for identifier in identifiers:
            # discard internal Wikipedia links; they are not something to scrape
            if re.match(r'//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier,
                        level=log.WARNING)
                continue
            # expand protocol-relative links (starting with '//') to full URLs
            if re.match('/{2}', identifier):
                identifier = re.sub('^/{2}', 'http://', identifier)
            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(Request(identifier))

        return item_list

    def new_compound_request(self, compound):
        # strip the trailing '*' wildcard from the website pattern to build the URL
        return Request(url=self.website[:-1] + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """ Clean up property values using regexes, so values can be split from their units. """
        for item in items:
            value = item['value']
            m = re.search(r'F;\s(\d+[\.,]?\d*)', value)  # keep the numerical Kelvin value (listed after Fahrenheit)
            if m:
                item['value'] = m.group(1) + " K"
            m = re.match(r'(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # normalize J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """ Find the external links in the 'Identifiers' section that point to other sources. """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links
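For reference, clean_items can be exercised on its own: the method only reads and writes the
'value' key, so plain dicts can stand in for Result items in a quick sketch. The sample strings
below are made up to mimic Wikipedia's formatting; they are not taken from a real page.

# Quick sketch: exercising clean_items with stand-in dicts
# (Result only needs dict-style access here; sample values are made up).
samples = [
    {'value': '-182.5 C (-296.4 F; 90.7 K)'},  # melting point row: Kelvin follows the Fahrenheit value
    {'value': '186.25 J K-1 mol-1'},           # molar entropy row
]
cleaned = WikipediaParser.clean_items(samples)
print(cleaned[0]['value'])  # "90.7 K"
print(cleaned[1]['value'])  # "186.25 J/K/mol"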