A web scraper built to search for specific information on a given compound (and its pseudonyms)
#!/usr/bin/env python
"""
Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).

This docstring doubles as the docopt specification: the Usage patterns and the
Options list (including the [default: ...] annotations) are parsed at runtime,
so their structure must not be altered casually.

Usage:
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
    fourmi list
    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
    fourmi -h | --help
    fourmi --version

Options:
    --attributes=<regex>           Include only attributes that match these regular expressions split by a comma. [default: .*]
    -h --help                      Show this screen.
    --version                      Show version.
    --verbose                      Verbose logging output.
    --log=<file>                   Save log to an file.
    -o <file> --output=<file>      Output file [default: result.*format*]
    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
    --include=<regex>              Include only sources that match these regular expressions split by a comma.
    --exclude=<regex>              Exclude the sources that match these regular expressions split by a comma.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
import docopt

from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader


def setup_crawler(compound, settings, source_loader, attributes):
    """
    Prepare and start the crawler, which performs the actual search on the internet.
    :param compound: The compound which should be searched.
    :param settings: A Scrapy settings object.
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    spider.add_sources(source_loader.sources)
    crawler = Crawler(settings)
    # Stop the Twisted reactor when the spider closes, so reactor.run() in
    # search() returns and the process can exit.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def scrapy_settings_manipulation(docopt_arguments):
    """
    Manipulate the Scrapy settings that normally would be set in the settings file.
    In the Fourmi project these are command line arguments.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :return: A Scrapy settings object with FEED_URI/FEED_FORMAT overridden.
    """
    settings = get_project_settings()

    # 'result.*format*' is the literal docopt default for --output; any other
    # value means the user supplied an explicit output file.
    if docopt_arguments["--output"] != 'result.*format*':
        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
    elif docopt_arguments["--format"] == "jsonlines":
        # jsonlines output still gets a .json extension.
        settings.overrides["FEED_URI"] = "results.json"
    elif docopt_arguments["--format"] is not None:
        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]

    if docopt_arguments["--format"] is not None:
        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]

    return settings


def start_log(docopt_arguments):
    """
    Start the logging functionality of Scrapy using the settings given by the CLI.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    """
    # NOTE(review): logstdout=False on the verbose branches looks inverted
    # relative to the quiet branches (Scrapy's logstdout=True redirects process
    # stdout into the log) — confirm this is the intended behavior.
    if docopt_arguments["--log"] is not None:
        if docopt_arguments["--verbose"]:
            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
        else:
            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
    else:
        if docopt_arguments["--verbose"]:
            log.start(logstdout=False, loglevel=log.DEBUG)
        else:
            log.start(logstdout=True, loglevel=log.WARNING)


def search(docopt_arguments, source_loader):
    """
    Facilitate the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
    # Blocks until the spider_closed signal fires reactor.stop (see setup_crawler).
    reactor.run()


# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
    loader = SourceLoader()

    # --include and --exclude are mutually exclusive (enforced by the usage
    # patterns); include wins if docopt ever passed both through.
    if arguments["--include"]:
        loader.include(arguments["--include"].split(','))
    elif arguments["--exclude"]:
        loader.exclude(arguments["--exclude"].split(','))

    if arguments["search"]:
        search(arguments, loader)
    elif arguments["list"]:
        print("-== Available Sources ==-")
        print(str(loader))