A web scraper built to search for specific information on a given compound (and its pseudonyms)
#!/usr/bin/env python
"""
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).

Usage:
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
    fourmi list
    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
    fourmi -h | --help
    fourmi --version

Options:
    --attributes=<regex>                Include only attributes that match these regular expressions split by a comma. [default: .*]
    -h --help                           Show this screen.
    --version                           Show version.
    --verbose                           Verbose logging output.
    --log=<file>                        Save log to a file.
    -o <file> --output=<file>           Output file [default: result.*format*]
    -f <format> --format=<format>       Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
    --include=<regex>                   Include only sources that match these regular expressions split by a comma.
    --exclude=<regex>                   Exclude the sources that match these regular expressions split by a comma.
"""
25
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
import docopt

from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader
34
35
def setup_crawler(compound, settings, source_loader, attributes):
    """
    Prepare and launch the crawler that performs the actual online search.
    :param compound: The compound which should be searched.
    :param settings: A Scrapy settings object.
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    fourmi_spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    fourmi_spider.add_sources(source_loader.sources)

    crawler = Crawler(settings)
    # Stop the Twisted reactor once the spider finishes, so reactor.run() returns.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(fourmi_spider)
    crawler.start()
51
52
def scrapy_settings_manipulation(docopt_arguments):
    """
    Apply CLI arguments to the Scrapy settings that would normally be set in the
    settings file; in the Fourmi project these are command line arguments.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :return: The manipulated Scrapy settings object.
    """
    settings = get_project_settings()
    output_file = docopt_arguments["--output"]
    output_format = docopt_arguments["--format"]

    # An explicit -o/--output wins; otherwise derive the file name from the format.
    if output_file != 'result.*format*':
        settings.overrides["FEED_URI"] = output_file
    elif output_format == "jsonlines":
        # jsonlines output still gets a plain .json extension.
        settings.overrides["FEED_URI"] = "results.json"
    elif output_format is not None:
        settings.overrides["FEED_URI"] = "results." + output_format

    if output_format is not None:
        settings.overrides["FEED_FORMAT"] = output_format

    return settings
72
73
def start_log(docopt_arguments):
    """
    Start the logging functionality of Scrapy using the settings given by the CLI.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    """
    verbose = docopt_arguments["--verbose"]
    # Verbose runs log at DEBUG without capturing stdout; otherwise only warnings
    # are logged and stdout is routed through the logger.
    level = log.DEBUG if verbose else log.WARNING
    capture_stdout = not verbose

    log_file = docopt_arguments["--log"]
    if log_file is not None:
        log.start(logfile=log_file, logstdout=capture_stdout, loglevel=level)
    else:
        log.start(logstdout=capture_stdout, loglevel=level)
89
90
def search(docopt_arguments, source_loader):
    """
    Facilitate the search for a specific compound and block until it finishes.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    start_log(docopt_arguments)
    attribute_patterns = docopt_arguments["--attributes"].split(',')
    setup_crawler(docopt_arguments["<compound>"], scrapy_settings_manipulation(docopt_arguments),
                  source_loader, attribute_patterns)
    reactor.run()  # Blocks until the spider_closed signal stops the reactor.
101
102
103# The start for the Fourmi Command Line interface.
104if __name__ == '__main__':
105 arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
106 loader = SourceLoader()
107
108 if arguments["--include"]:
109 loader.include(arguments["--include"].split(','))
110 elif arguments["--exclude"]:
111 loader.exclude(arguments["--exclude"].split(','))
112
113 if arguments["search"]:
114 search(arguments, loader)
115 elif arguments["list"]:
116 print "-== Available Sources ==-"
117 print str(loader)