A web scraper built to search specific information for a given compound (and its pseudonyms)
#!/usr/bin/env python
"""
Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).

Usage:
    fourmi
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
    fourmi list
    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
    fourmi -h | --help
    fourmi --version

Options:
    --attributes=<regex>           Include only attributes that match these regular expressions, split by a comma. [default: .*]
    -h --help                      Show this screen.
    --version                      Show version.
    -v                             Verbose logging output. (Multiple occurrences increase the logging level.)
    --log=<file>                   Save the log to a file.
    -o <file> --output=<file>      Output file. [default: <compound>.*format*]
    -f <format> --format=<format>  Output format (supported: csv, json, jsonlines, xml). [default: csv]
    --include=<regex>              Include only sources that match these regular expressions, split by a comma.
    --exclude=<regex>              Exclude sources that match these regular expressions, split by a comma.
"""
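
# A sketch of the dictionary docopt builds from the usage pattern above (the
# values shown are illustrative, not the output of a real run):
#
#   docopt.docopt(__doc__, argv=['--format=json', 'search', 'methane'])
#   => {'search': True, 'list': False, '<compound>': 'methane',
#       '--format': 'json', '--attributes': '.*', '-v': 0, ...}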

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals, log
import docopt

from FourmiCrawler.spider import FourmiSpider
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
from GUI import gui


def setup_crawler(compound, settings, source_loader, attributes):
    """
    This function prepares and starts the crawler, which runs the actual search on the internet.
    :param compound: The compound that should be searched for.
    :param settings: A Scrapy settings object.
    :param source_loader: A fully functional SourceLoader object containing only the sources that should be used.
    :param attributes: A list of regular expressions that the attribute names should match.
    """
    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    spider.add_sources(source_loader.sources)
    crawler = Crawler(settings)
    # Stop the Twisted reactor as soon as the spider has finished crawling.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
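
# Example (hypothetical, not part of the CLI): setup_crawler() can be wired up
# directly. This sketch assumes Scrapy's get_project_settings() yields a
# suitable settings object and that all sources and attributes should be used:
#
#   from scrapy.utils.project import get_project_settings
#   setup_crawler("methane", get_project_settings(), SourceLoader(), ['.*'])
#   reactor.run()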


def search(docopt_arguments, source_loader):
    """
    The function that facilitates the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initialized SourceLoader object pointed at the directory containing the sources.
    """
    conf = Configurator()
    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                  source_loader, docopt_arguments["--attributes"].split(','))
    if conf.scrapy_settings.getbool("LOG_ENABLED"):
        log.start(conf.scrapy_settings.get("LOG_FILE"),
                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
    reactor.run()
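
# A sketch (not exercised by the CLI itself): search() can also be driven
# programmatically by passing a dict with the same keys docopt would produce:
#
#   search({'--log': None, '-v': 0, '--output': None, '--format': 'csv',
#           '--attributes': '.*', '<compound>': 'benzene'}, SourceLoader())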


# The start for the Fourmi command line interface.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
    loader = SourceLoader()

    if arguments["--include"]:
        loader.include(arguments["--include"].split(','))
    elif arguments["--exclude"]:
        loader.exclude(arguments["--exclude"].split(','))

    if arguments["search"]:
        search(arguments, loader)
    elif arguments["list"]:
        print "-== Available Sources ==-"
        print str(loader)
    else:
        gui_window = gui.GUI(search, sourceloader=SourceLoader())
        gui_window.run()
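
# Example invocations (assuming the script is available on the PATH as `fourmi`;
# the source name ChemSpider is illustrative):
#   fourmi search methane
#   fourmi --format=json -o methane.json search methane
#   fourmi --include=ChemSpider list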