A web scraper built to search for specific information on a given compound (and its synonyms)
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class WikipediaParser(Source):
12 """ Wikipedia scraper for chemical properties
13
14 This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
15 It also returns requests with other external sources which contain information on parsed subject.
16 """
17
18 website = "http://en.wikipedia.org/wiki/*"
19 __spider = None
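    # class-level cache of compounds that have already been parsed, so each page is only scraped once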
    searched_compounds = []

    def __init__(self):
        Source.__init__(self)

    def parse(self, response):
        """ Scrape the infobox of the page, unless the compound has already been searched. """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use the main page
        if compound in self.searched_compounds:
            return None
        else:
            items = self.parse_infobox(sel)
            self.searched_compounds.append(compound)
            return items

    def parse_infobox(self, sel):
        """ Scrape data from the infobox on a Wikipedia page. """
        items = []

        # be sure to get the chembox (Wikipedia template)
        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
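        # chembox cells alternate between property names and values, so pair them by slicing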
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
            item = Result({
                'attribute': prop_name.extract().encode('utf-8'),
                'value': prop_values[i].extract().encode('utf-8'),
                'source': "Wikipedia",
                'reliability': "Unknown",
                'conditions': ""
            })
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)

        # scrape the drugbox (Wikipedia template)
        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('drugbox rows: %s' % tr_list2, level=log.DEBUG)
        for tablerow in tr_list2:
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            # only keep rows that have both a header (the property name) and a data cell (its value)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                    'normalize-space(string())'):
                item = Result({
                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'source': "Wikipedia",
                    'reliability': "Unknown",
                    'conditions': ""
                })
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)

        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
        item_list = self.clean_items(items)

        identifiers = self.get_identifiers(sel)

        # add the extra sources to scrape from as requests
        for identifier in identifiers:
            request = None
            # discard internal Wikipedia links
            if re.match(r'//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
            # fix protocol-relative links starting with '//'
            elif re.match('/{2}', identifier):
                identifier = re.sub('/{2}', 'http://', identifier)
                request = Request(identifier)
            else:
                request = Request(identifier)
            if request is not None:  # skip discarded (internal) links instead of appending None
                log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
                item_list.append(request)

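        # note: Scrapy allows a parse callback to return a mixed list of items and new requests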
        return item_list

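    # `website` ends with a '*' wildcard; strip it and append the compound name to build the URL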
    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """ Clean up the properties using regexes; makes it possible to split the values from the units. """
        for item in items:
            value = item['value']
            m = re.search(r'F;\s(\d+[\.,]?\d*)', value)  # clean up the numerical Kelvin value (after F)
            if m:
                item['value'] = m.group(1) + " K"
            m = re.match(r'(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """ Find the external links in the 'Identifiers' section that point to other sources. """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links
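
# Minimal usage sketch (illustrative; assumes the usual Fourmi setup in which a
# Scrapy spider dispatches responses to the registered Source parsers):
#
#   parser = WikipediaParser()
#   request = parser.new_compound_request('Methane')
#   # -> a Request for http://en.wikipedia.org/wiki/Methane whose callback,
#   #    parser.parse, returns Result items plus follow-up Requests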