A web scraper built to search for specific information on a given compound (and its pseudonyms)
#!/usr/bin/env python
"""
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).

Usage:
    fourmi
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
    fourmi list
    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
    fourmi -h | --help
    fourmi --version

Options:
    --attributes=<regex>           Include only attributes that match these regular expressions, split by a comma. [default: .*]
    -h --help                      Show this screen.
    --version                      Show version.
    -v                             Verbose logging output. (Multiple occurrences increase the logging level.)
    --log=<file>                   Save the log to a file.
    -o <file> --output=<file>      Output file [default: <compound>.*format*]
    -f <format> --format=<format>  Output format (supported: csv, json, jsonlines, xml) [default: csv]
    --include=<regex>              Include only sources that match these regular expressions, split by a comma.
    --exclude=<regex>              Exclude sources that match these regular expressions, split by a comma.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals, log
import docopt

from FourmiCrawler.spider import FourmiSpider
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
from GUI import gui


def setup_crawler(compound, settings, source_loader, attributes):
    """
    This function prepares and starts the crawler, which performs the actual search on the internet.
    :param compound: The compound which should be searched
    :param settings: A scrapy settings object
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    spider.add_sources(source_loader.sources)
    crawler = Crawler(settings)
    # Stop the reactor once the spider finishes, so the process exits cleanly.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def search(docopt_arguments, source_loader):
    """
    The function that facilitates the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    conf = Configurator()
    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                  source_loader, docopt_arguments["--attributes"].split(','))
    if conf.scrapy_settings.getbool("LOG_ENABLED"):
        log.start(conf.scrapy_settings.get("LOG_FILE"),
                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
    reactor.run()
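
# For reference, `search` expects the dictionary shape that docopt builds from
# the usage string above. A hand-built equivalent would look roughly like the
# following (an illustrative sketch; the values shown are the documented
# defaults, and "caffeine" is a made-up example compound):
#
#   example_arguments = {
#       "<compound>": "caffeine",
#       "--log": None,
#       "-v": 0,
#       "--output": "<compound>.*format*",
#       "--format": "csv",
#       "--attributes": ".*",
#   }
#   search(example_arguments, SourceLoader())
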
# The start of the Fourmi command-line interface.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
    loader = SourceLoader()

    # Narrow the loaded sources before dispatching to the requested mode.
    if arguments["--include"]:
        loader.include(arguments["--include"].split(','))
    elif arguments["--exclude"]:
        loader.exclude(arguments["--exclude"].split(','))

    if arguments["search"]:
        search(arguments, loader)
    elif arguments["list"]:
        print "-== Available Sources ==-"
        print str(loader)
    else:
        gui_window = gui.GUI(search, sourceloader=SourceLoader())
        gui_window.run()
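
# A minimal sketch of driving the same pipeline programmatically, bypassing
# docopt entirely (assumptions: "caffeine" is an example compound, and the
# Configurator defaults behave as in the CLI path above; logging setup is
# skipped here):
#
#   loader = SourceLoader()
#   conf = Configurator()
#   conf.set_output("<compound>.*format*", "csv", "caffeine")
#   setup_crawler("caffeine", conf.scrapy_settings, loader, [".*"])
#   reactor.run()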