A web scraper built to search for specific information on a given compound (and its pseudonyms)
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties.

    This parser reads Wikipedia infoboxes (including the bordered chembox) to obtain
    properties and their values. It also returns requests for the external sources
    that contain further information on the parsed subject.
    """

    website = "http://en.wikipedia.org/wiki/*"
    __spider = None
    searched_compounds = []

    def __init__(self):
        Source.__init__(self)

    def parse(self, response):
        """ Parse the main page: scrape the infobox unless the compound was searched before. """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use the main page
        if compound in self.searched_compounds:
            return None
        items = self.parse_infobox(sel)
        self.searched_compounds.append(compound)
        return items

    def parse_infobox(self, sel):
        """ Scrape data from the infobox on Wikipedia. """
        items = []

        # scrape the chembox (Wikipedia template); its cells alternate between a
        # property name and a value, hence the even/odd split below
        td_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]') \
            .xpath('normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for i, prop_name in enumerate(prop_names):
            item = Result({
                'attribute': prop_name.extract().encode('utf-8'),
                'value': prop_values[i].extract().encode('utf-8'),
                'source': "Wikipedia",
                'reliability': "Unknown",
                'conditions': ""
            })
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)

        # scrape the drugbox (Wikipedia template); each row holds the property
        # name in a <th> and its value in a <td>
        drug_rows = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('drugbox rows: %s' % drug_rows, level=log.DEBUG)
        for tablerow in drug_rows:
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and \
                    tablerow.xpath('./td').xpath('normalize-space(string())'):
                item = Result({
                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'source': "Wikipedia",
                    'reliability': "Unknown",
                    'conditions': ""
                })
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)

        items = [item for item in items if item['value'] != '']  # remove items with an empty value
        item_list = self.clean_items(items)

        identifiers = self.get_identifiers(sel)

        # add the external sources found under 'Identifiers' as new requests
        for identifier in identifiers:
            # discard internal Wikipedia links; they are not something to scrape
            if re.match(r'//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier,
                        level=log.WARNING)
                continue
            # expand protocol-relative links (starting with '//') to full URLs
            if re.match('/{2}', identifier):
                identifier = re.sub('^/{2}', 'http://', identifier)
            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(Request(identifier))

        return item_list

    def new_compound_request(self, compound):
        # strip the trailing '*' wildcard from the website pattern to build the URL
        return Request(url=self.website[:-1] + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """ Clean up property values using regexes, so values can be split from their units. """
        for item in items:
            value = item['value']
            m = re.search(r'F;\s(\d+[\.,]?\d*)', value)  # keep the numerical Kelvin value (listed after Fahrenheit)
            if m:
                item['value'] = m.group(1) + " K"
            m = re.match(r'(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # normalize J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """ Find the external links in the 'Identifiers' section that point to other sources. """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links
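For reference, clean_items can be exercised on its own: the method only reads and writes the
'value' key, so plain dicts can stand in for Result items in a quick sketch. The sample strings
below are made up to mimic Wikipedia's formatting; they are not taken from a real page.

# Quick sketch: exercising clean_items with stand-in dicts
# (Result only needs dict-style access here; sample values are made up).
samples = [
    {'value': '-182.5 C (-296.4 F; 90.7 K)'},  # melting point row: Kelvin follows the Fahrenheit value
    {'value': '186.25 J K-1 mol-1'},           # molar entropy row
]
cleaned = WikipediaParser.clean_items(samples)
print(cleaned[0]['value'])  # "90.7 K"
print(cleaned[1]['value'])  # "186.25 J/K/mol"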