# Config file for automatic testing at travis-ci.org

language: python
python: 2.7

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
 - pip install Scrapy docopt

# command to run tests, e.g. python setup.py test
script:
 - nosetests tests

notifications:
  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
Fourmi.py (-31)
#!/usr/bin/env python
"""
Fourmi - An internet webcrawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spiders.Fourmispider import FourmiSpider
from scrapy.utils.project import get_project_settings


def setup_crawler(searchable):
    # [TODO] - Initiate all parsers for the different websites and get
    # allowed URLs.
    spider = FourmiSpider(compound=searchable)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    setup_crawler("Methane")
    log.start()
    reactor.run()

start()
FourmiCrawler/items.py (+1, -3)

- # Define here the models for your scraped items
- #
- # See documentation in:
+ # For more information on item definitions, see the Scrapy documentation in:
  # http://doc.scrapy.org/en/latest/topics/items.html

  from scrapy.item import Item, Field
FourmiCrawler/parsers/__init__.py (binary file, not shown)
FourmiCrawler/parsers/parser.py (-9)

from scrapy import log


class Parser:
    website = "http://localhost/*"

    def parse(self, reponse):
        log.msg("The parse function of the empty parser was used.", level=log.Warning)
        pass
FourmiCrawler/pipelines.py (+43, -7)
- # Define your item pipelines here
- #
- # Don't forget to add your pipeline to the ITEM_PIPELINES setting
- # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ # For more information on item pipelines, see the Scrapy documentation in:
+ # http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ import re
+
  from scrapy.exceptions import DropItem


- class FourmiPipeline(object):
+ class RemoveNonePipeline(object):
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def process_item(item, spider):
+         """
+         Processes the items so that None values are replaced by empty strings.
+         :param item: The incoming item
+         :param spider: The spider which scraped the item
+         :return: The item with None values replaced by empty strings
+         """
+         for key in item:
+             if item[key] is None:
+                 item[key] = ""
+         return item
+
+
+ class DuplicatePipeline(object):

      def __init__(self):
          self.known_values = set()

  ...
          :param spider: The spider which scraped the item
          :return: :raise DropItem: Returns the item if unique or drops it if it is already known
          """
-         value = item['attribute'], item['value']
+         value = (item['attribute'], item['value'], item['conditions'])
          if value in self.known_values:
-             raise DropItem("Duplicate item found: %s" % item)
+             raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
          else:
              self.known_values.add(value)
          return item
+
+
+ class AttributeSelectionPipeline(object):
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def process_item(item, spider):
+         """
+         The items are processed using the selected attribute list available in the spider;
+         items that don't match the selected attributes are dropped.
+         :param item: The incoming item
+         :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
+         :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
+         """
+         if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
+             return item
+         else:
+             raise DropItem("Attribute not selected by user: %s" % item)
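As an aside from the changeset: a minimal sketch of how the new pipeline chain behaves, using a plain dict in place of a Result item and made-up attribute values; only the classes defined above are assumed.

    from scrapy.exceptions import DropItem
    from FourmiCrawler.pipelines import RemoveNonePipeline, DuplicatePipeline

    remove_none = RemoveNonePipeline()
    duplicates = DuplicatePipeline()

    # A hypothetical scraped item; None values are normalized to empty strings first.
    item = {'attribute': 'Melting point', 'value': '90.7 K', 'conditions': None}
    item = remove_none.process_item(item, spider=None)
    item = duplicates.process_item(item, spider=None)  # first occurrence passes through

    try:
        duplicates.process_item(dict(item), spider=None)  # same (attribute, value, conditions)
    except DropItem as error:
        print error  # "Duplicate item found: ..."

With the priorities set in the settings below, Scrapy runs these pipelines in ascending order, so items pass through RemoveNonePipeline (100), then AttributeSelectionPipeline (200), then DuplicatePipeline (300).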
FourmiCrawler/settings.py (+7, -2)
  ...
  # For simplicity, this file contains only the most important settings by
  # default. All the other settings are documented here:
  #
- # http://doc.scrapy.org/en/latest/topics/settings.html
+ # http://doc.scrapy.org/en/latest/topics/settings.html
  #

  BOT_NAME = 'FourmiCrawler'
  ...
  SPIDER_MODULES = ['FourmiCrawler']
  NEWSPIDER_MODULE = 'FourmiCrawler'
  ITEM_PIPELINES = {
-     'FourmiCrawler.pipelines.FourmiPipeline': 100
+     "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
+     'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+     'FourmiCrawler.pipelines.DuplicatePipeline': 300,
  }
+ FEED_URI = 'results.json'
+ FEED_FORMAT = 'jsonlines'

  # Crawl responsibly by identifying yourself (and your website) on the
  # user-agent
FourmiCrawler/sources/ChemSpider.py (+231)
···11+import re
22+33+from scrapy import log
44+from scrapy.http import Request
55+from scrapy.selector import Selector
66+77+from source import Source
88+from FourmiCrawler.items import Result
99+1010+1111+# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties

    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    def __init__(self):
        Source.__init__(self)

    website = 'http://www.chemspider.com/*'
2727+2828+ # [TODO] - Save and access token of specific user.
2929+ search = ('Search.asmx/SimpleSearch?query=%s&token='
3030+ '052bfd06-5ce4-43d6-bf12-89eabefd2338')
3131+ structure = 'Chemical-Structure.%s.html'
3232+ extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
3333+ '052bfd06-5ce4-43d6-bf12-89eabefd2338')
3434+3535+ ignore_list = []
3636+3737+ def parse(self, response):
3838+ sel = Selector(response)
3939+ requests = []
4040+ requests_synonyms = self.parse_synonyms(sel)
4141+ requests.extend(requests_synonyms)
4242+ requests_properties = self.parse_properties(sel)
4343+ requests.extend(requests_properties)
4444+4545+ return requests
4646+4747+ @staticmethod
4848+ def parse_properties(sel):
4949+ """scrape Experimental Data and Predicted ACD/Labs tabs"""
5050+ properties = []
5151+5252+ # Predicted - ACD/Labs tab
5353+ td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
5454+ 'normalize-space(string())')
5555+ prop_names = td_list[::2]
5656+ prop_values = td_list[1::2]
5757+ for (prop_name, prop_value) in zip(prop_names, prop_values):
5858+ # [:-1] is to remove the colon at the end, [TODO] - test for colon
5959+ prop_name = prop_name.extract().encode('utf-8')[:-1]
6060+ prop_value = prop_value.extract().encode('utf-8')
6161+ prop_conditions = ''
6262+6363+ # Test for properties without values, with one hardcoded exception
6464+ if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
6565+ continue
6666+6767+ # Match for condition in parentheses
6868+ m = re.match(r'(.*) \((.*)\)', prop_name)
6969+ if m:
7070+ prop_name = m.group(1)
7171+ prop_conditions = m.group(2)
            # Match for condition in value separated by an 'at'
7474+ m = re.match(r'(.*) at (.*)', prop_value)
7575+ if m:
7676+ prop_value = m.group(1)
7777+ prop_conditions = m.group(2)
7878+7979+ new_prop = Result({
8080+ 'attribute': prop_name,
8181+ 'value': prop_value,
8282+ 'source': 'ChemSpider Predicted - ACD/Labs Tab',
8383+ 'reliability': 'Unknown',
8484+ 'conditions': prop_conditions
8585+ })
8686+ properties.append(new_prop)
8787+ log.msg('CS prop: |%s| |%s| |%s|' %
8888+ (new_prop['attribute'], new_prop['value'], new_prop['source']),
8989+ level=log.DEBUG)
9090+9191+ # Experimental Data Tab, Physico-chemical properties in particular
9292+ scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
9393+ 'Properties"]//li/table/tr/td')
9494+ if not scraped_list:
9595+ return properties
9696+ # Format is: property name followed by a list of values
9797+ property_name = scraped_list.pop(0).xpath(
9898+ 'span/text()').extract()[0].rstrip()
9999+ for line in scraped_list:
100100+ if line.xpath('span/text()'):
101101+ property_name = line.xpath('span/text()').extract()[0].rstrip()
102102+ else:
103103+ new_prop = Result({
104104+ 'attribute': property_name[:-1],
105105+ 'value': line.xpath('text()').extract()[0].rstrip(),
106106+ 'source': line.xpath(
107107+ 'strong/text()').extract()[0].rstrip(),
108108+ 'reliability': 'Unknown',
109109+ 'conditions': ''
110110+ })
111111+ properties.append(new_prop)
112112+ log.msg('CS prop: |%s| |%s| |%s|' %
113113+ (new_prop['attribute'], new_prop['value'],
114114+ new_prop['source']), level=log.DEBUG)
115115+116116+ return properties
117117+118118+ def parse_synonyms(self, sel):
119119+ """Scrape list of Names and Identifiers"""
120120+ requests = []
121121+ synonyms = []
122122+123123+ # Exact type for this is unknown, but equivalent to Validated by Expert
124124+ for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
125125+ name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
126126+ synonyms.append(self.new_synonym(syn, name, 'expert'))
127127+ # These synonyms are labeled by ChemSpider as "Validated by Experts"
128128+ for syn in sel.xpath('//p[@class="syn"][strong]'):
129129+ name = syn.xpath('strong/text()').extract()[0]
130130+ synonyms.append(self.new_synonym(syn, name, 'expert'))
131131+ # These synonyms are labeled by ChemSpider as "Validated by Users"
132132+ for syn in sel.xpath(
133133+ '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
134134+ name = syn.xpath(
135135+ 'span[@class="synonym_confirmed"]/text()').extract()[0]
136136+ synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
138138+ for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
139139+ name = syn.xpath('span[@class=""]/text()').extract()[0]
140140+ synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))
141141+142142+ # [TODO] - confirm if English User-Validated synonyms are OK too
143143+ for syn in synonyms:
144144+ if syn['category'] == 'expert' and syn['language'] == 'English':
145145+ log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
146146+ self._spider.get_synonym_requests(syn['name'])
147147+148148+ return requests
149149+150150+ def new_synonym(self, sel, name, category):
151151+ """Scrape for a single synonym at a given HTML tag"""
152152+ self.ignore_list.append(name)
153153+ language = sel.xpath('span[@class="synonym_language"]/text()')
154154+ if language:
155155+ # The [1:-1] is to remove brackets around the language name
156156+ language = language.extract()[0][1:-1]
157157+ else:
158158+ # If language is not given, English is assumed, [TODO] - confirm
159159+ language = 'English'
160160+ log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
161161+ level=log.DEBUG)
162162+ references = []
163163+ # A synonym can have multiple references, each optionally with link
164164+ for ref in sel.xpath('span[@class="synonym_ref"]'):
165165+ refname = ref.xpath('normalize-space(string())')
166166+ references.append({
167167+ 'name': refname.extract()[0][1:-1],
168168+ 'URI': ''
169169+ })
170170+ for ref in sel.xpath('a[@class="synonym_ref"]'):
171171+ references.append({
172172+ 'name': ref.xpath('@title').extract()[0],
173173+ 'URI': ref.xpath('@href').extract()[0]
174174+ })
175175+ for ref in references:
176176+ log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
177177+ level=log.DEBUG)
178178+ synonym = {
179179+ 'name': name,
180180+ 'category': category,
181181+ 'language': language,
182182+ 'references': references
183183+ }
184184+ return synonym
185185+186186+ @staticmethod
187187+ def parse_extendedinfo(response):
188188+ """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
189189+ sel = Selector(response)
190190+ properties = []
191191+ names = sel.xpath('*').xpath('name()').extract()
192192+ values = sel.xpath('*').xpath('text()').extract()
193193+ for (name, value) in zip(names, values):
194194+ result = Result({
195195+ 'attribute': name,
196196+ 'value': value, # These values have no unit!
197197+ 'source': 'ChemSpider ExtendedCompoundInfo',
198198+ 'reliability': 'Unknown',
199199+ 'conditions': ''
200200+ })
201201+ if result['value']:
202202+ properties.append(result)
203203+ return properties
204204+205205+ def parse_searchrequest(self, response):
206206+ """Parse the initial response of the ChemSpider Search API """
207207+ sel = Selector(response)
208208+ log.msg('chemspider parse_searchrequest', level=log.DEBUG)
209209+ sel.register_namespace('cs', 'http://www.chemspider.com/')
210210+ csids = sel.xpath('.//cs:int/text()').extract()
211211+ if len(csids) == 0:
212212+ log.msg('ChemSpider found nothing', level=log.ERROR)
213213+ return
214214+ elif len(csids) > 1:
215215+ log.msg('ChemSpider found multiple substances, taking first '
216216+ 'element', level=log.DEBUG)
217217+ csid = csids[0]
218218+ structure_url = self.website[:-1] + self.structure % csid
219219+ extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
220220+ log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
221221+ return [Request(url=structure_url,
222222+ callback=self.parse),
223223+ Request(url=extendedinfo_url,
224224+ callback=self.parse_extendedinfo)]
225225+226226+ def new_compound_request(self, compound):
227227+ if compound in self.ignore_list: # [TODO] - add regular expression
228228+ return None
229229+ searchurl = self.website[:-1] + self.search % compound
230230+ log.msg('chemspider compound', level=log.DEBUG)
231231+ return Request(url=searchurl, callback=self.parse_searchrequest)
FourmiCrawler/sources/NIST.py (+276)
···11+import re
22+33+from scrapy import log
44+from scrapy.http import Request
55+from scrapy.selector import Selector
66+77+from source import Source
88+from FourmiCrawler.items import Result
99+1010+1111+# [TODO]: values can be '128.', perhaps remove the dot in that case?
1212+# [TODO]: properties have references and comments which do not exist in the
1313+# Result item, but should be included eventually.
1414+1515+class NIST(Source):
1616+ """NIST Scraper plugin
1717+1818+ This plugin manages searching for a chemical on the NIST website
1919+ and parsing the resulting page if the chemical exists on NIST.
2020+ """
2121+ website = "http://webbook.nist.gov/*"
2222+2323+ search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
2424+2525+ ignore_list = set()
2626+2727+ def __init__(self):
2828+ Source.__init__(self)
2929+3030+ def parse(self, response):
3131+ sel = Selector(response)
3232+3333+ title = sel.xpath('head/title/text()').extract()[0]
3434+ if title == 'Name Not Found':
3535+ log.msg('NIST: Chemical not found!', level=log.ERROR)
3636+ return
3737+ if title not in self.ignore_list:
3838+ self.ignore_list.update(title)
3939+ log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
4040+ self._spider.get_synonym_requests(title)
4141+4242+ requests = []
4343+4444+ requests.extend(self.parse_generic_info(sel))
4545+4646+ symbol_table = {}
4747+ tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
4848+ for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
4949+ symbol = ''.join(symbol_td.xpath('node()').extract())
5050+ name = name_td.xpath('text()').extract()[0]
5151+ symbol_table[symbol] = name
5252+ log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
5353+ level=log.DEBUG)
5454+5555+ for table in sel.xpath('//table[@class="data"]'):
5656+ summary = table.xpath('@summary').extract()[0]
5757+ if summary == 'One dimensional data':
                log.msg('NIST table: Aggregate data', level=log.DEBUG)
5959+ requests.extend(
6060+ self.parse_aggregate_data(table, symbol_table))
6161+ elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
6262+ log.msg('NIST table; Enthalpy/entropy of phase transition',
6363+ level=log.DEBUG)
6464+ requests.extend(self.parse_transition_data(table, summary))
6565+ elif table.xpath('tr[1]/td'):
6666+ log.msg('NIST table: Horizontal table', level=log.DEBUG)
6767+ elif summary == 'Antoine Equation Parameters':
6868+ log.msg('NIST table: Antoine Equation Parameters',
6969+ level=log.DEBUG)
7070+ requests.extend(self.parse_antoine_data(table, summary))
7171+ elif len(table.xpath('tr[1]/th')) == 5:
7272+ log.msg('NIST table: generic 5 columns', level=log.DEBUG)
7373+ # Symbol (unit) Temperature (K) Method Reference Comment
7474+ requests.extend(self.parse_generic_data(table, summary))
7575+ elif len(table.xpath('tr[1]/th')) == 4:
7676+ log.msg('NIST table: generic 4 columns', level=log.DEBUG)
7777+ # Symbol (unit) Temperature (K) Reference Comment
7878+ requests.extend(self.parse_generic_data(table, summary))
7979+ else:
8080+ log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
8181+ continue # Assume unsupported
8282+ return requests
8383+8484+ def parse_generic_info(self, sel):
8585+ """Parses: synonyms, chemical formula, molecular weight, InChI,
8686+ InChiKey, CAS number
8787+ """
8888+ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
8989+ li = ul.xpath('li')
9090+9191+ raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
9292+ for synonym in raw_synonyms[0].strip().split(';\n'):
9393+ log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
9494+ self.ignore_list.update(synonym)
9595+ self._spider.get_synonym_requests(synonym)
9696+9797+ data = {}
9898+9999+ raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
100100+ data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
101101+102102+ raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
103103+ data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
104104+105105+ raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
106106+ data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
107107+108108+ raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
109109+ '/tt/text()')
110110+ data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
111111+112112+ raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
113113+ data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
114114+115115+ requests = []
116116+ for key, value in data.iteritems():
117117+ result = Result({
118118+ 'attribute': key,
119119+ 'value': value,
120120+ 'source': 'NIST',
121121+ 'reliability': 'Unknown',
122122+ 'conditions': ''
123123+ })
124124+ requests.append(result)
125125+126126+ return requests
127127+128128+ def parse_aggregate_data(self, table, symbol_table):
129129+ """Parses the table(s) which contain possible links to individual
130130+ data points
131131+ """
132132+ results = []
133133+ for tr in table.xpath('tr[td]'):
134134+ extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
135135+ '/a/@href').extract()
136136+ if extra_data_url:
137137+ request = Request(url=self.website[:-1] + extra_data_url[0],
138138+ callback=self.parse_individual_datapoints)
139139+ results.append(request)
140140+ continue
141141+ data = []
142142+ for td in tr.xpath('td'):
143143+ data.append(''.join(td.xpath('node()').extract()))
144144+145145+ name = symbol_table[data[0]]
146146+ condition = ''
147147+148148+ m = re.match(r'(.*) at (.*)', name)
149149+ if m:
150150+ name = m.group(1)
151151+ condition = m.group(2)
152152+153153+ result = Result({
154154+ 'attribute': name,
155155+ 'value': data[1] + ' ' + data[2],
156156+ 'source': 'NIST',
157157+ 'reliability': 'Unknown',
158158+ 'conditions': condition
159159+ })
160160+ log.msg('NIST: |%s|' % data, level=log.DEBUG)
161161+ results.append(result)
162162+ return results
163163+164164+ @staticmethod
165165+ def parse_transition_data(table, summary):
166166+ """Parses the table containing properties regarding phase changes"""
167167+ results = []
168168+169169+ tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
170170+ m = re.search(r'\((.*)\)', tr_unit)
171171+ unit = '!'
172172+ if m:
173173+ unit = m.group(1)
174174+175175+ for tr in table.xpath('tr[td]'):
176176+ tds = tr.xpath('td/text()').extract()
177177+ result = Result({
178178+ 'attribute': summary,
179179+ 'value': tds[0] + ' ' + unit,
180180+ 'source': 'NIST',
181181+ 'reliability': 'Unknown',
182182+ 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
183183+ })
184184+ results.append(result)
185185+186186+ return results
187187+188188+ @staticmethod
189189+ def parse_generic_data(table, summary):
190190+ """Parses the common tables of 4 and 5 rows. Assumes they are of the
191191+ form:
192192+ Symbol (unit)|Temperature (K)|Method|Reference|Comment
193193+ Symbol (unit)|Temperature (K)|Reference|Comment
194194+ """
195195+ results = []
196196+197197+ tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
198198+ m = re.search(r'\((.*)\)', tr_unit)
199199+ unit = '!'
200200+ if m:
201201+ unit = m.group(1)
202202+203203+ for tr in table.xpath('tr[td]'):
204204+ tds = tr.xpath('td/text()').extract()
205205+ result = Result({
206206+ 'attribute': summary,
207207+ 'value': tds[0] + ' ' + unit,
208208+ 'source': 'NIST',
209209+ 'reliability': 'Unknown',
210210+ 'conditions': '%s K' % tds[1]
211211+ })
212212+ results.append(result)
213213+ return results
214214+215215+ @staticmethod
216216+ def parse_antoine_data(table, summary):
217217+ """Parse table containing parameters for the Antione equation"""
218218+ results = []
219219+220220+ for tr in table.xpath('tr[td]'):
221221+ tds = tr.xpath('td/text()').extract()
222222+ result = Result({
223223+ 'attribute': summary,
224224+ 'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
225225+ 'source': 'NIST',
226226+ 'reliability': 'Unknown',
227227+ 'conditions': '%s K' % tds[0]
228228+ })
229229+ results.append(result)
230230+231231+ return results
232232+233233+ @staticmethod
234234+ def parse_individual_datapoints(response):
235235+ """Parses the page linked from aggregate data"""
236236+ sel = Selector(response)
237237+ table = sel.xpath('//table[@class="data"]')[0]
238238+239239+ results = []
240240+241241+ name = table.xpath('@summary').extract()[0]
242242+ condition = ''
243243+ m = re.match(r'(.*) at (.*)', name)
244244+ if m:
245245+ name = m.group(1)
246246+ condition = m.group(2)
247247+248248+ tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
249249+ m = re.search(r'\((.*)\)', tr_unit)
250250+ unit = '!'
251251+ if m:
252252+ unit = m.group(1)
253253+254254+ for tr in table.xpath('tr[td]'):
255255+ tds = tr.xpath('td/text()').extract()
256256+ uncertainty = ''
257257+ m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
258258+ if m:
259259+ uncertainty = '+- %s ' % m.group(1)
260260+ # [TODO]: get the plusminus sign working in here
261261+ result = Result({
262262+ 'attribute': name,
263263+ 'value': '%s %s%s' % (tds[0], uncertainty, unit),
264264+ 'source': 'NIST',
265265+ 'reliability': 'Unknown',
266266+ 'conditions': condition
267267+ })
268268+ results.append(result)
269269+270270+ return results
271271+272272+ def new_compound_request(self, compound):
273273+ if compound not in self.ignore_list:
274274+ self.ignore_list.update(compound)
275275+ return Request(url=self.website[:-1] + self.search % compound,
276276+ callback=self.parse)
FourmiCrawler/sources/WikipediaParser.py (+119)
···11+import re
22+33+from scrapy.http import Request
44+from scrapy import log
55+from scrapy.selector import Selector
66+77+from source import Source
88+from FourmiCrawler.items import Result
class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties

    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
    It also returns requests for other external sources which contain information on the parsed subject.
    """
1717+1818+ website = "http://en.wikipedia.org/wiki/*"
1919+ __spider = None
2020+ searched_compounds = []
2121+2222+ def __init__(self):
2323+ Source.__init__(self)
2424+2525+ def parse(self, response):
2626+ """ Distributes the above described behaviour """
2727+ log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
2828+ sel = Selector(response)
2929+ compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
3030+ if compound in self.searched_compounds:
3131+ return None
3232+ else:
3333+ items = self.parse_infobox(sel)
3434+ self.searched_compounds.append(compound)
3535+ return items
3636+3737+ def parse_infobox(self, sel):
3838+ """ scrape data from infobox on wikipedia. """
3939+ items = []
4040+4141+ # be sure to get chembox (wikipedia template)
4242+ tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
4343+ xpath('normalize-space(string())')
4444+ prop_names = tr_list[::2]
4545+ prop_values = tr_list[1::2]
4646+ for i, prop_name in enumerate(prop_names):
4747+ item = Result({
4848+ 'attribute': prop_name.extract().encode('utf-8'),
4949+ 'value': prop_values[i].extract().encode('utf-8'),
5050+ 'source': "Wikipedia",
5151+ 'reliability': "Unknown",
5252+ 'conditions': ""
5353+ })
5454+ items.append(item)
5555+ log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
5656+5757+ #scrape the drugbox (wikipedia template)
5858+ tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
5959+ log.msg('dit: %s' % tr_list2, level=log.DEBUG)
6060+ for tablerow in tr_list2:
6161+ log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
6262+ if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
6363+ 'normalize-space(string())'):
6464+ item = Result({
6565+ 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
6666+ 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
6767+ 'source': "Wikipedia",
6868+ 'reliability': "Unknown",
6969+ 'conditions': ""
7070+ })
7171+ items.append(item)
7272+ log.msg(
7373+ 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
7474+ level=log.DEBUG)
7575+7676+ items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
7777+ item_list = self.clean_items(items)
7878+7979+ identifiers = self.get_identifiers(sel)
8080+8181+ #add extra sources to scrape from as requests
8282+ for i, identifier in enumerate(identifiers):
8383+ request = None
8484+ #discard internal wikipedia links
8585+ if re.match('//en\.wikipedia', identifier):
8686+ log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
8787+ #fix links starting with '//www.'
8888+ elif re.match('/{2}', identifier):
8989+ identifier = re.sub("/{2}", "http://", identifier)
9090+ request = Request(identifier)
9191+ else:
9292+ request = Request(identifier)
9393+ log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
9494+ item_list.append(request)
9595+9696+ return item_list
9797+9898+ def new_compound_request(self, compound):
9999+ return Request(url=self.website[:-1] + compound, callback=self.parse)
100100+101101+ @staticmethod
102102+ def clean_items(items):
103103+ """ clean up properties using regex, makes it possible to split the values from the units """
104104+ for item in items:
105105+ value = item['value']
106106+ m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
107107+ if m:
108108+ item['value'] = m.group(1) + " K"
109109+ m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values
110110+ if m:
111111+ item['value'] = m.group(1) + " J/K/mol"
112112+ return items
113113+114114+ @staticmethod
115115+ def get_identifiers(sel):
116116+ """ find external links, named 'Identifiers' to different sources. """
117117+ links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
118118+ '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
119119+ return links
FourmiCrawler/sources/__init__.py (binary file, not shown)
FourmiCrawler/sources/source.py (+38)
from scrapy import log
# from scrapy.http import Request


class Source:
    website = "http://something/*"  # Regex of URIs the source is able to parse
    _spider = None

    def __init__(self):
        """
        Initiation of a new Source
        """
        pass

    def parse(self, response):
        """
        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
        :param response: A Scrapy Response object
        :return: A list of Result items and new Scrapy Requests
        """
        log.msg("The parse function of the empty source was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
        """
        This function should return a Scrapy Request for the given compound request.
        :param compound: A compound name.
        :return: A new Scrapy Request
        """
        # return Request(url=self.website[:-1] + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
        """
        A function to save the associated spider.
        :param spider: A FourmiSpider object
        """
        self._spider = spider
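To illustrate the interface defined above (a sketch, not part of the changeset), a new source plugin only has to override the website regex and the two methods; the class name, URL, and attribute strings below are hypothetical:

    from scrapy.http import Request
    from FourmiCrawler.sources.source import Source
    from FourmiCrawler.items import Result

    class ExampleSource(Source):
        website = "http://example.org/*"  # regex of the URIs this source can handle

        def parse(self, response):
            # Return scraped Result items (and/or follow-up Requests) for a matching page.
            return [Result({'attribute': 'example attribute',
                            'value': 'example value',
                            'source': 'ExampleSource',
                            'reliability': 'Unknown',
                            'conditions': ''})]

        def new_compound_request(self, compound):
            # Build the Request that fetches this source's page for the given compound.
            return Request(url=self.website[:-1] + compound, callback=self.parse)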
FourmiCrawler/spider.py (+68, -7)
···11+import re
22+13from scrapy.spider import Spider
44+from scrapy import log
253647class FourmiSpider(Spider):
88+ """
99+ A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
1010+ """
511 name = "FourmiSpider"
1212+ _sources = []
1313+ synonyms = set()
61477- def __init__(self, compound=None, *args, **kwargs):
1515+ def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
1616+ """
1717+ Initiation of the Spider
1818+ :param compound: compound that will be searched.
1919+ :param selected_attributes: A list of regular expressions that the attributes should match.
2020+ """
821 super(FourmiSpider, self).__init__(*args, **kwargs)
2222+ self.synonyms.add(compound)
2323+ self.selected_attributes = selected_attributes
9241010- def parse(self, reponse):
1111- # [TODO] - This function should delegate it's functionality to other
1212- # parsers.
1313- pass
2525+ def parse(self, response):
2626+ """
2727+ The function that is called when a response to a request is available. This function distributes this to a
2828+ source which should be able to handle parsing the data.
2929+ :param response: A Scrapy Response object that should be parsed
3030+ :return: A list of Result items and new Request to be handled by the scrapy core.
3131+ """
3232+ for source in self._sources:
3333+ if re.match(source.website, response.url):
3434+ log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
3535+ return source.parse(response)
3636+ return None
3737+3838+ def get_synonym_requests(self, compound):
3939+ """
4040+ A function that generates new Scrapy Request for each source given a new synonym of a compound.
4141+ :param compound: A compound name
4242+ :return: A list of Scrapy Request objects
4343+ """
4444+ requests = []
4545+ if compound not in self.synonyms:
4646+ self.synonyms.add(compound)
4747+ for parser in self._sources:
4848+ parser_requests = parser.new_compound_request(compound)
4949+ if parser_requests is not None:
5050+ requests.append(parser_requests)
5151+ return requests
5252+5353+ def start_requests(self):
5454+ """
        The function called by Scrapy for its first Requests
5656+ :return: A list of Scrapy Request generated from the known synonyms using the available sources.
5757+ """
5858+ requests = []
5959+ for synonym in self.synonyms:
6060+ requests.extend(self.get_synonym_requests(synonym))
6161+ return requests
6262+6363+ def add_sources(self, sources):
6464+ """
        A function to add new Source objects to the list of available sources.
6666+ :param sources: A list of Source Objects.
6767+ """
6868+ for parser in sources:
6969+ self.add_source(parser)
14701515- def add_parser(self, parser):
1616- self.parsers.add(parser)
7171+ def add_source(self, source):
7272+ """
        A function to add a new Source object to the list of available sources.
7474+ :param source: A Source Object
7575+ """
7676+ self._sources.append(source)
7777+ source.set_spider(self)
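For illustration (a sketch that mirrors what fourmi.py below does, not an addition to the changeset), wiring the spider up by hand looks roughly like this:

    from FourmiCrawler.spider import FourmiSpider
    from FourmiCrawler.sources.NIST import NIST
    from FourmiCrawler.sources.WikipediaParser import WikipediaParser

    spider = FourmiSpider(compound="Methane", selected_attributes=[".*"])
    spider.add_sources([NIST(), WikipediaParser()])  # each source gets set_spider(spider)
    requests = spider.start_requests()               # one Request per source for "Methane"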
LICENSE (+21)
The MIT License (MIT)

Copyright (c) 2014 Ivo B. Rietveld

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md (+85)
# Fourmi

**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)

**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)

Fourmi is a web scraper for chemical substances. The program is designed to be
used as a search engine to search multiple chemical databases for a specific
substance. The program will produce all available attributes of the substance
and conditions associated with the attributes. Fourmi also attempts to estimate
the reliability of each data point to assist the user in deciding which data
should be used.

The Fourmi project is an open source project licensed under the MIT license. Feel
free to contribute!

Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
web scraping framework for Python. Most of the functionality of this project can
be traced to this framework. Should the documentation for this application fall
short, we suggest you take a close look at the
[Scrapy architecture](http://doc.scrapy.org/en/latest/topics/architecture.html)
and the [Scrapy documentation](http://doc.scrapy.org/en/latest/index.html).

### Installing

If you're installing Fourmi, please take a look at our [installation guide](...)
on our wiki. When you've installed the application, make sure to check our
[usage guide](...).
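For a quick impression of the command line interface (defined in `fourmi.py` further down in this changeset), typical invocations look like `fourmi search Methane`, `fourmi --format=csv search Methane`, or `fourmi list` to show the available data sources; the compound name here is only an example.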
### Using the Source

To use the Fourmi source code, multiple dependencies are required. Take a look at
the [wiki page](...) on using the application source code for a step-by-step
installation guide.

When developing for the Fourmi project, keep in mind that code readability is a
must. To maintain the readability, code should conform to the
[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
code. More information about the different structures and principles of the
Fourmi application can be found on our [wiki](...).

### To Do

The Fourmi project has the following goals for the near future:

__Main goals:__

- Improve our documentation and guides. (Assignee: Dekker)
- Build a graphical user interface (GUI) as an alternative to the command line
interface (CLI). (Assignee: Harmen)
- Compile the source into a Windows executable. (Assignee: Bas)
- Create a configuration file to hold logins and API keys.
- Determine the reliability of our data points.
- Create a module to gather data from NIST. (Assignee: Rob)
- Create a module to gather data from PubChem. (Assignee: Nout)

__Side goals:__

- Clean and unify data.
- Extensive reliability analysis using statistical tests.
- Test data with Descartes 1.

### Project Origin

The Fourmi project was started in February of 2014 as part of a software
engineering course at the Radboud University for students studying Computer
Science, Information Science or Artificial Intelligence. Students participate in
a real software development project as part of the
[Giphouse](http://www.giphouse.nl/).

This particular project was started on behalf of Ivo B. Rietveld. As a chemist,
he was in need of an application to automatically search for information on
chemical substances and create a phase diagram. The so-called "Descartes" project
was split into two teams, each creating a different application that covers part
of the functionality. We are team Descartes 2, and as we were responsible for
creating a web crawler, we've named our application Fourmi (English: Ant).

The following people were part of the original team:

- [Jip J. Dekker](http://jip.dekker.li)
- Rob ten Berge
- Harmen Prins
- Bas van Berkel
- Nout van Deijck
- Michail Kuznetcov
README.rst (-16)
We are the team Descartes 2.
----------------------------

Our team members are:

+ Rob ten Berge

+ Bas van Berkel

+ Nout van Deijck

+ Jip J. Dekker

+ Michail Kuznetcov

+ Harmen Prins
fourmi.py (+117)
#!/usr/bin/env python
"""
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
44+55+Usage:
66+ fourmi search <compound>
77+ fourmi [options] search <compound>
88+ fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
99+ fourmi list
1010+ fourmi [--include=<sourcename> | --exclude=<sourcename>] list
1111+ fourmi -h | --help
1212+ fourmi --version
1313+1414+Options:
    --attributes=<regex>           Include only attributes that match these regular expressions, split by a comma. [default: .*]
1616+ -h --help Show this screen.
1717+ --version Show version.
1818+ --verbose Verbose logging output.
    --log=<file>                   Save the log to a file.
2020+ -o <file> --output=<file> Output file [default: result.*format*]
2121+ -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
2222+ --include=<regex> Include only sources that match these regular expressions split by a comma.
2323+ --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
2424+"""
2525+2626+from twisted.internet import reactor
2727+from scrapy.crawler import Crawler
2828+from scrapy import log, signals
2929+from scrapy.utils.project import get_project_settings
3030+import docopt
3131+3232+from FourmiCrawler.spider import FourmiSpider
3333+from sourceloader import SourceLoader
3434+3535+3636+def setup_crawler(compound, settings, source_loader, attributes):
3737+ """
    This function prepares and starts the crawler that performs the actual search on the internet.
3939+ :param compound: The compound which should be searched
4040+ :param settings: A scrapy settings object
4141+ :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
4242+ :param attributes: A list of regular expressions which the attribute names should match.
4343+ """
4444+ spider = FourmiSpider(compound=compound, selected_attributes=attributes)
4545+ spider.add_sources(source_loader.sources)
4646+ crawler = Crawler(settings)
4747+ crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
4848+ crawler.configure()
4949+ crawler.crawl(spider)
5050+ crawler.start()
5151+5252+5353+def scrapy_settings_manipulation(docopt_arguments):
5454+ """
5555+ This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
5656+ project these are command line arguments.
5757+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
5858+ """
5959+ settings = get_project_settings()
6060+6161+ if docopt_arguments["--output"] != 'result.*format*':
6262+ settings.overrides["FEED_URI"] = docopt_arguments["--output"]
6363+ elif docopt_arguments["--format"] == "jsonlines":
6464+ settings.overrides["FEED_URI"] = "results.json"
6565+ elif docopt_arguments["--format"] is not None:
6666+ settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
6767+6868+ if docopt_arguments["--format"] is not None:
6969+ settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
7070+7171+ return settings
7272+7373+7474+def start_log(docopt_arguments):
7575+ """
7676+ This function starts the logging functionality of Scrapy using the settings given by the CLI.
7777+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
7878+ """
7979+ if docopt_arguments["--log"] is not None:
8080+ if docopt_arguments["--verbose"]:
8181+ log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
8282+ else:
8383+ log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
8484+ else:
8585+ if docopt_arguments["--verbose"]:
8686+ log.start(logstdout=False, loglevel=log.DEBUG)
8787+ else:
8888+ log.start(logstdout=True, loglevel=log.WARNING)
8989+9090+9191+def search(docopt_arguments, source_loader):
9292+ """
9393+ The function that facilitates the search for a specific compound.
9494+ :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
9595+ :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
9696+ """
9797+ start_log(docopt_arguments)
9898+ settings = scrapy_settings_manipulation(docopt_arguments)
9999+ setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
100100+ reactor.run()
101101+102102+103103+# The start for the Fourmi Command Line interface.
104104+if __name__ == '__main__':
105105+ arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
106106+ loader = SourceLoader()
107107+108108+ if arguments["--include"]:
109109+ loader.include(arguments["--include"].split(','))
110110+ elif arguments["--exclude"]:
111111+ loader.exclude(arguments["--exclude"].split(','))
112112+113113+ if arguments["search"]:
114114+ search(arguments, loader)
115115+ elif arguments["list"]:
116116+ print "-== Available Sources ==-"
117117+ print str(loader)
setup.py (+18)
import sys
from cx_Freeze import setup, Executable

# After running the setup file (python setup.py build) the scrapy/VERSION file has to be manually put into the
# library.zip, and the FourmiCrawler folder has to be copied to both the library and the exe.win32-2.7 folder. After
# putting the files in the library, the library has to be zipped again to replace the old library.
# Dependencies are automatically detected, but it might need fine tuning.
build_exe_options = {"packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"], "excludes": []}

# GUI applications require a different base on Windows (the default is for a
# console application).
base = None

setup(name="Scrapy",
      version="0.1",
      description="My GUI application!",
      options={"build_exe": build_exe_options},
      executables=[Executable("fourmi.py", base=base)])
sourceloader.py (+60)
import inspect
import sys
import os
import re

from FourmiCrawler.sources.source import Source


class SourceLoader:
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):

        if hasattr(sys, 'frozen'):
            path = os.path.dirname(sys.executable)
        else:
            path = os.path.dirname(os.path.abspath(__file__))

        path += "/" + rel_dir
        known_parser = set()

        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    # known_parser.add(cls)

    def include(self, source_names):
        """
        This function excludes all sources that don't match the given regular expressions.
        :param source_names: A list of regular expressions (strings)
        """
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        :param source_names: A list of regular expressions (strings)
        """
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources.
        """
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
            string += " - "
            string += "URI: " + src.website + "\n"
        return string
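A short sketch of how the loader is meant to be used, mirroring the --include/--exclude handling in fourmi.py (the regular expression here is only an example):

    from sourceloader import SourceLoader

    loader = SourceLoader()         # instantiates every Source subclass found in FourmiCrawler/sources
    loader.exclude(['ChemSpider'])  # drop sources whose class name matches this regex
    print str(loader)               # remaining sources, e.g. "Source: NIST - URI: http://webbook.nist.gov/*"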