.travis.yml
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+    - pip install Scrapy docopt
+
+# command to run tests, e.g. python setup.py test
+script:
+    - nosetests tests
+
+notifications:
+    slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
-31
Fourmi.py
-#!/usr/bin/env python
-"""
-Fourmi - An internet webcrawler searching for information on chemical
-compounds. [todo] - Add some more useful text here.
-"""
-
-from twisted.internet import reactor
-from scrapy.crawler import Crawler
-from scrapy import log, signals
-from FourmiCrawler.spiders.Fourmispider import FourmiSpider
-from scrapy.utils.project import get_project_settings
-
-
-def setup_crawler(searchable):
-    # [TODO] - Initiate all parsers for the different websites and get
-    # allowed URLs.
-    spider = FourmiSpider(compound=searchable)
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
-
-
-def start():
-    setup_crawler("Methane")
-    log.start()
-    reactor.run()
-
-start()
+1-3
FourmiCrawler/items.py
-# Define here the models for your scraped items
-#
-# See documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html

 from scrapy.item import Item, Field
···
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html

 from scrapy.item import Item, Field
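The unchanged Result definition itself is not shown in this hunk. For orientation, a minimal sketch of the item the rest of the patch appears to rely on; the five fields are inferred from how Result is constructed by the Wikipedia source and consumed by the pipelines further down, so the real definition may differ:

    from scrapy.item import Item, Field


    class Result(Item):
        # fields used throughout this patch: every scraped property becomes one Result
        attribute = Field()    # e.g. "Boiling point"
        value = Field()        # e.g. "373 K"
        source = Field()       # e.g. "Wikipedia"
        reliability = Field()
        conditions = Field()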
FourmiCrawler/parsers/__init__.py
This is a binary file and will not be displayed.
-9
FourmiCrawler/parsers/parser.py
-from scrapy import log
-
-
-class Parser:
-    website = "http://localhost/*"
-
-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.Warning)
-        pass
+43-7
FourmiCrawler/pipelines.py
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 from scrapy.exceptions import DropItem


-class FourmiPipeline(object):

     def __init__(self):
         self.known_values = set()
···
         :param spider: The spider which scraped the spider
         :return: :raise DropItem: Returns the item if unique or drops them if it's already known
         """
-        value = item['attribute'], item['value']
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)
         else:
             self.known_values.add(value)
             return item
···
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import re
+
 from scrapy.exceptions import DropItem


+class RemoveNonePipeline(object):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def process_item(item, spider):
+        """
+        Processing the items so None values are replaced by empty strings
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: The item with None values replaced by empty strings
+        """
+        for key in item:
+            if item[key] is None:
+                item[key] = ""
+        return item


+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()
···
         :param spider: The spider which scraped the spider
         :return: :raise DropItem: Returns the item if unique or drops them if it's already known
         """
+        value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
             return item


+class AttributeSelectionPipeline(object):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def process_item(item, spider):
+        """
+        The items are processed using the selected attribute list available in the spider;
+        items whose attribute does not match the selection are dropped.
+        :param item: The incoming item
+        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
+        :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
+        """
+        if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
+            return item
+        else:
+            raise DropItem("Attribute not selected by user: %s" % item)
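As a quick illustration of how these three pipelines chain together (not part of the patch, and assuming DuplicatePipeline keeps the process_item(self, item, spider) signature of the old FourmiPipeline), a minimal sketch with a made-up item and a stand-in spider object:

    from FourmiCrawler.pipelines import RemoveNonePipeline, AttributeSelectionPipeline, DuplicatePipeline


    class DummySpider(object):
        # stand-in for FourmiSpider; only the attribute the pipelines actually read
        selected_attributes = [".*"]


    spider = DummySpider()
    item = {'attribute': 'Boiling point', 'value': '373 K', 'source': 'Wikipedia',
            'reliability': 'Unknown', 'conditions': None}

    item = RemoveNonePipeline.process_item(item, spider)          # 'conditions' becomes ""
    item = AttributeSelectionPipeline.process_item(item, spider)  # kept, ".*" matches every attribute name
    item = DuplicatePipeline().process_item(item, spider)         # kept the first time it is seen
    print item

Feeding the same attribute/value/conditions combination through DuplicatePipeline a second time would raise DropItem.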
+7-2
FourmiCrawler/settings.py
···
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-# http://doc.scrapy.org/en/latest/topics/settings.html
 #

 BOT_NAME = 'FourmiCrawler'
···
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.FourmiPipeline': 100
 }

 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
···
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #

 BOT_NAME = 'FourmiCrawler'
···
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
+    "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
 }
+FEED_URI = 'results.json'
+FEED_FORMAT = 'jsonlines'

 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
+import re
+
+from scrapy.http import Request
+from scrapy import log
+from scrapy.selector import Selector
+
+from source import Source
+from FourmiCrawler.items import Result
+
+
+class WikipediaParser(Source):
+    """ Wikipedia scraper for chemical properties
+
+    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
+    It also returns requests for other external sources which contain information on the parsed subject.
+    """
+
+    website = "http://en.wikipedia.org/wiki/*"
+    __spider = None
+    searched_compounds = []
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, response):
+        """ Distributes the above described behaviour """
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+        sel = Selector(response)
+        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use the main page
+        if compound in self.searched_compounds:
+            return None
+        else:
+            items = self.parse_infobox(sel)
+            self.searched_compounds.append(compound)
+            return items
+
+    def parse_infobox(self, sel):
+        """ Scrape data from the infobox on Wikipedia. """
+        items = []
+
+        # be sure to get the chembox (wikipedia template)
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
+            xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': "Wikipedia",
+                'reliability': "Unknown",
+                'conditions': ""
+            })
+            items.append(item)
+            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
+        # scrape the drugbox (wikipedia template)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('this: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = Result({
+                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'source': "Wikipedia",
+                    'reliability': "Unknown",
+                    'conditions': ""
+                })
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
+
+        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
+        item_list = self.clean_items(items)
+
+        identifiers = self.get_identifiers(sel)
+
+        # add extra sources to scrape from as requests
+        for i, identifier in enumerate(identifiers):
+            request = None
+            # discard internal wikipedia links
+            if re.match('//en\.wikipedia', identifier):
+                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
+            # fix links starting with '//www.'
+            elif re.match('/{2}', identifier):
+                identifier = re.sub("/{2}", "http://", identifier)
+                request = Request(identifier)
+            else:
+                request = Request(identifier)
+            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
+            item_list.append(request)
+
+        return item_list
+
+    def new_compound_request(self, compound):
+        return Request(url=self.website[:-1] + compound, callback=self.parse)
+
+    @staticmethod
+    def clean_items(items):
+        """ Clean up properties using regex; this makes it possible to split the values from the units. """
+        for item in items:
+            value = item['value']
+            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
+            if m:
+                item['value'] = m.group(1) + " K"
+            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
+            if m:
+                item['value'] = m.group(1) + " J/K/mol"
+        return items
+
+    @staticmethod
+    def get_identifiers(sel):
+        """ Find external links, named 'Identifiers', to different sources. """
+        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
+                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
+        return links
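The two regular expressions in clean_items() are easiest to see on concrete values. A minimal sketch (the sample strings are invented, and clean_value is a hypothetical helper that applies the same two rules to a single string):

    import re


    def clean_value(value):
        m = re.search('F;\s(\d+[\.,]?\d*)', value)  # numerical Kelvin value after the Fahrenheit part
        if m:
            return m.group(1) + " K"
        m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # "J K-1 mol-1" style heat capacities
        if m:
            return m.group(1) + " J/K/mol"
        return value


    print clean_value("-182.5 C, -296.4 F; 90.7 K")  # -> "90.7 K"
    print clean_value("35.69 J K-1 mol-1")           # -> "35.69 J/K/mol"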
FourmiCrawler/sources/__init__.py
This is a binary file and will not be displayed.
+38
FourmiCrawler/sources/source.py
+from scrapy import log
+# from scrapy.http import Request


+class Source:
+    website = "http://something/*"  # Regex of the URIs the source is able to parse
+    _spider = None

+    def __init__(self):
+        """
+        Initiation of a new Source
+        """
+        pass

+    def parse(self, response):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param response: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
+        pass

+    def new_compound_request(self, compound):
+        """
+        This function should return a Scrapy Request for the given compound name.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        pass

+    def set_spider(self, spider):
+        """
+        A function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
+        self._spider = spider
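Source is the template every scraper in FourmiCrawler/sources is expected to follow. A minimal sketch of a new source written against this interface (ExampleSource and its URL are made up; the only real contract is the one shown above):

    from scrapy.http import Request
    from scrapy import log

    from FourmiCrawler.sources.source import Source


    class ExampleSource(Source):
        website = "http://database.example.com/*"  # regex of the URIs this source can parse

        def parse(self, response):
            log.msg("ExampleSource is parsing %s" % response.url, level=log.DEBUG)
            return []  # would normally return Result items and/or new Requests

        def new_compound_request(self, compound):
            # same convention as the other sources: strip the trailing '*' wildcard
            return Request(url=self.website[:-1] + compound, callback=self.parse)

Dropped into FourmiCrawler/sources, a file like this is picked up automatically by the SourceLoader at the bottom of this patch.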
+68-7
FourmiCrawler/spider.py
 from scrapy.spider import Spider


 class FourmiSpider(Spider):
     name = "FourmiSpider"

-    def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)

-    def parse(self, reponse):
-        # [TODO] - This function should delegate it's functionality to other
-        # parsers.
-        pass

-    def add_parser(self, parser):
-        self.parsers.add(parser)
···
+import re

 from scrapy.spider import Spider
+from scrapy import log


 class FourmiSpider(Spider):
+    """
+    A spider written for the Fourmi project which calls upon all available sources to request and scrape data.
+    """
     name = "FourmiSpider"
+    _sources = []
+    synonyms = set()

+    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: The compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
         super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.synonyms.add(compound)
+        self.selected_attributes = selected_attributes

+    def parse(self, response):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        source which should be able to handle parsing the data.
+        :param response: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Requests to be handled by the Scrapy core.
+        """
+        for source in self._sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
+        return None

+    def get_synonym_requests(self, compound):
+        """
+        A function that generates new Scrapy Requests for each source, given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
+        requests = []
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+        for parser in self._sources:
+            parser_requests = parser.new_compound_request(compound)
+            if parser_requests is not None:
+                requests.append(parser_requests)
+        return requests

+    def start_requests(self):
+        """
+        The function called by Scrapy for its first Requests
+        :return: A list of Scrapy Requests generated from the known synonyms using the available sources.
+        """
+        requests = []
+        for synonym in self.synonyms:
+            requests.extend(self.get_synonym_requests(synonym))
+        return requests

+    def add_sources(self, sources):
+        """
+        A function to add new Source objects to the list of available sources.
+        :param sources: A list of Source objects.
+        """
+        for parser in sources:
+            self.add_source(parser)

+    def add_source(self, source):
+        """
+        A function to add a new Source object to the list of available sources.
+        :param source: A Source object
+        """
+        self._sources.append(source)
+        source.set_spider(self)
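Outside of Scrapy, the spider's API reads as follows; a minimal sketch of wiring it up by hand, mirroring what setup_crawler() in fourmi.py does further down (the compound name is just an example):

    from FourmiCrawler.spider import FourmiSpider
    from sourceloader import SourceLoader

    spider = FourmiSpider(compound="Methane", selected_attributes=[".*"])
    spider.add_sources(SourceLoader().sources)
    print spider.start_requests()  # the Requests Scrapy would start the crawl with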
+21
LICENSE
+The MIT License (MIT)

+Copyright (c) 2014 Ivo B. Rietveld

+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:

+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
+# Fourmi

+**Master branch**: [build status](https://travis-ci.org/Recondor/Fourmi)

+**Developing branch**: [build status](https://travis-ci.org/Recondor/Fourmi)

+Fourmi is a web scraper for chemical substances. The program is designed to be
+used as a search engine to search multiple chemical databases for a specific
+substance. The program will produce all available attributes of the substance
+and the conditions associated with those attributes. Fourmi also attempts to
+estimate the reliability of each data point to assist the user in deciding
+which data should be used.

+The Fourmi project is an open source project licensed under the MIT license.
+Feel free to contribute!

+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
+web scraping framework for Python. Most of the functionality of this project can
+be traced to this framework. Should the documentation for this application fall
+short, we suggest you take a close look at the
+[Scrapy architecture](http://doc.scrapy.org/en/latest/topics/architecture.html)
+and the [Scrapy documentation](http://doc.scrapy.org/en/latest/index.html).

+### Installing

+If you're installing Fourmi, please take a look at our [installation guide](...)
+on our wiki. When you've installed the application, make sure to check our
+[usage guide](...).

+### Using the Source

+To use the Fourmi source code, multiple dependencies are required. Take a look at
+the [wiki page](...) on using the application source code for a step-by-step
+installation guide.

+When developing for the Fourmi project, keep in mind that code readability is a
+must. To maintain readability, code should conform to the
+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
+code. More information about the different structures and principles of the
+Fourmi application can be found on our [wiki](...).

+### To Do

+The Fourmi project has the following goals for the near future:

+__Main goals:__

+- Improve our documentation and guides. (Assignee: Dekker)
+- Build a graphical user interface (GUI) as an alternative to the command line
+interface (CLI). (Assignee: Harmen)
+- Compile the source into a Windows executable. (Assignee: Bas)
+- Create a configuration file to hold logins and API keys.
+- Determine the reliability of our data points.
+- Create a module to gather data from NIST. (Assignee: Rob)
+- Create a module to gather data from PubChem. (Assignee: Nout)

+__Side goals:__

+- Clean and unify data.
+- Extensive reliability analysis using statistical tests.
+- Test data with Descartes 1.

+### Project Origin

+The Fourmi project was started in February of 2014 as part of a software
+engineering course at the Radboud University for students studying Computer
+Science, Information Science or Artificial Intelligence. Students participate in
+a real software development project as part of the
+[Giphouse](http://www.giphouse.nl/).

+This particular project was started on behalf of Ivo B. Rietveld. As a chemist,
+he was in need of an application to automatically search for information on
+chemical substances and create a phase diagram. The so-called "Descartes" project
+was split into two teams, each creating a different application that covers part
+of the functionality. We are team Descartes 2 and, as we were responsible for
+creating a web crawler, we've named our application Fourmi (English: ant).

+The following people were part of the original team:

+- [Jip J. Dekker](http://jip.dekker.li)
+- Rob ten Berge
+- Harmen Prins
+- Bas van Berkel
+- Nout van Deijck
+- Michail Kuznetcov
-16
README.rst
-We are the team Descartes 2.
-----------------------------
-
-Our team members are:
-
-+ Rob ten Berge
-
-+ Bas van Berkel
-
-+ Nout van Deijck
-
-+ Jip J. Dekker
-
-+ Michail Kuznetcov
-
-+ Harmen Prins
fourmi.py
+#!/usr/bin/env python
+"""
+Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
+
+Usage:
+    fourmi search <compound>
+    fourmi [options] search <compound>
+    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
+    fourmi list
+    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
+    fourmi -h | --help
+    fourmi --version
+
+Options:
+    --attributes=<regex>           Include only attributes that match these regular expressions, split by a comma. [default: .*]
+    -h --help                      Show this screen.
+    --version                      Show version.
+    --verbose                      Verbose logging output.
+    --log=<file>                   Save the log to a file.
+    -o <file> --output=<file>      Output file [default: result.*format*]
+    -f <format> --format=<format>  Output format (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    --include=<regex>              Include only sources that match these regular expressions, split by a comma.
+    --exclude=<regex>              Exclude sources that match these regular expressions, split by a comma.
+"""
+
+from twisted.internet import reactor
+from scrapy.crawler import Crawler
+from scrapy import log, signals
+from scrapy.utils.project import get_project_settings
+import docopt
+
+from FourmiCrawler.spider import FourmiSpider
+from sourceloader import SourceLoader
+
+
+def setup_crawler(compound, settings, source_loader, attributes):
+    """
+    This function prepares and starts the crawler, which starts the actual search on the internet.
+    :param compound: The compound which should be searched
+    :param settings: A Scrapy settings object
+    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
+    :param attributes: A list of regular expressions which the attribute names should match.
+    """
+    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
+    spider.add_sources(source_loader.sources)
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def scrapy_settings_manipulation(docopt_arguments):
+    """
+    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
+    project these are command line arguments.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
+    settings = get_project_settings()
+
+    if docopt_arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
+    elif docopt_arguments["--format"] == "jsonlines":
+        settings.overrides["FEED_URI"] = "results.json"
+    elif docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
+
+    if docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
+
+    return settings
+
+
+def start_log(docopt_arguments):
+    """
+    This function starts the logging functionality of Scrapy using the settings given by the CLI.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
+    if docopt_arguments["--log"] is not None:
+        if docopt_arguments["--verbose"]:
+            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
+        else:
+            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
+    else:
+        if docopt_arguments["--verbose"]:
+            log.start(logstdout=False, loglevel=log.DEBUG)
+        else:
+            log.start(logstdout=True, loglevel=log.WARNING)
+
+
+def search(docopt_arguments, source_loader):
+    """
+    The function that facilitates the search for a specific compound.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
+    """
+    start_log(docopt_arguments)
+    settings = scrapy_settings_manipulation(docopt_arguments)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    reactor.run()
+
+
+# The start of the Fourmi command line interface.
+if __name__ == '__main__':
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
+    loader = SourceLoader()
+
+    if arguments["--include"]:
+        loader.include(arguments["--include"].split(','))
+    elif arguments["--exclude"]:
+        loader.exclude(arguments["--exclude"].split(','))
+
+    if arguments["search"]:
+        search(arguments, loader)
+    elif arguments["list"]:
+        print "-== Available Sources ==-"
+        print str(loader)
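For reference, this is roughly the dictionary docopt hands to the functions above for a call like "fourmi --verbose -f csv search Methane" (keys come from the usage string; the values are illustrative):

    arguments = {
        "search": True, "list": False,
        "<compound>": "Methane",
        "--attributes": ".*",                  # default
        "--include": None, "--exclude": None,
        "--format": "csv",
        "--output": "result.*format*",         # default
        "--log": None, "--verbose": True,
        "--help": False, "--version": False,
    }

With these values, scrapy_settings_manipulation() sets FEED_URI to "results.csv" and FEED_FORMAT to "csv", and start_log() enables DEBUG-level logging because of --verbose.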
+18
setup.py
+import sys
+from cx_Freeze import setup, Executable
+
+# After running the setup file (python setup.py build), the scrapy/VERSION file has to be manually put into the
+# library.zip; the FourmiCrawler folder also has to be copied to both the library and the exe.win32-2.7 folder. After
+# putting the files in the library, the library has to be zipped again to replace the old one.
+# Dependencies are automatically detected, but it might need fine tuning.
+build_exe_options = {"packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"], "excludes": []}
+
+# GUI applications require a different base on Windows (the default is for a
+# console application).
+base = None
+
+setup(name="Scrapy",
+      version="0.1",
+      description="My GUI application!",
+      options={"build_exe": build_exe_options},
+      executables=[Executable("fourmi.py", base=base)])
sourceloader.py
+import inspect
+import sys
+import os
+import re
+
+from FourmiCrawler.sources.source import Source
+
+
+class SourceLoader:
+    sources = []
+
+    def __init__(self, rel_dir="FourmiCrawler/sources"):
+        if hasattr(sys, 'frozen'):
+            path = os.path.dirname(sys.executable)
+        else:
+            path = os.path.dirname(os.path.abspath(__file__))
+
+        path += "/" + rel_dir
+        known_parser = set()
+
+        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+            mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py])
+            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+            for cls in classes:
+                if issubclass(cls, Source) and cls not in known_parser:
+                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    # known_parser.add(cls)
+
+    def include(self, source_names):
+        """
+        This function excludes all sources that don't match the given regular expressions.
+        :param source_names: A list of regular expressions (strings)
+        """
+        new = set()
+        for name in source_names:
+            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
+        self.sources = list(new)
+
+    def exclude(self, source_names):
+        """
+        This function excludes all sources that match the given regular expressions.
+        :param source_names: A list of regular expressions (strings)
+        """
+        exclude = []
+        for name in source_names:
+            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
+        self.sources = [src for src in self.sources if src not in exclude]
+
+    def __str__(self):
+        """
+        This function returns a string with all sources currently available in the SourceLoader.
+        :return: a string with all available sources.
+        """
+        string = ""
+        for src in self.sources:
+            string += "Source: " + src.__class__.__name__
+            string += " - "
+            string += "URI: " + src.website + "\n"
+        return string
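A minimal sketch of the loader in use (the "Wikipedia.*" pattern is an example and assumes a source class whose name starts with Wikipedia, such as the WikipediaParser above):

    from sourceloader import SourceLoader

    loader = SourceLoader()           # instantiates every Source subclass found in FourmiCrawler/sources
    print str(loader)                 # what "fourmi list" prints

    loader.include(["Wikipedia.*"])   # keep only sources whose class name matches
    # loader.exclude(["Wikipedia.*"]) # ...or drop those instead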