A web scraper built to search for specific information on a given compound (and its pseudonyms)
import ConfigParser
import os
import shutil

from scrapy.utils.project import get_project_settings


class Configurator:
    """
    A helper class for Fourmi. This class is used to process the settings as set
    from one of the Fourmi applications.
    """

    def __init__(self):
        self.scrapy_settings = get_project_settings()

    def set_output(self, filename, fileformat, compound):
        """
        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
        In the Fourmi project these are command line arguments.
        :param filename: The filename of the file where the output will be put.
        :param fileformat: The format in which the output will be.
        :param compound: The compound name, used to build the filename when the default pattern is kept.
        """

        if filename != '<compound>.*format*':
            self.scrapy_settings.overrides["FEED_URI"] = filename
        elif fileformat == "jsonlines":
            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
        elif fileformat is not None:
            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat

        if fileformat is not None:
            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat

    def set_logging(self, logfile=None, verbose=0):
        """
        This function changes the default settings of Scrapy's logging functionality
        using the settings given by the CLI.
        :param logfile: The location where the logfile will be saved.
        :param verbose: An integer value to switch between log levels.
        """
        if verbose != 0:
            self.scrapy_settings.overrides["LOG_ENABLED"] = True
        else:
            self.scrapy_settings.overrides["LOG_ENABLED"] = False

        if verbose == 1:
            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
        elif verbose == 2:
            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
        else:
            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"

        if verbose > 1:
            self.scrapy_settings.overrides["LOG_STDOUT"] = False
        else:
            self.scrapy_settings.overrides["LOG_STDOUT"] = True

        if logfile is not None:
            self.scrapy_settings.overrides["LOG_FILE"] = logfile
        else:
            self.scrapy_settings.overrides["LOG_FILE"] = None

    @staticmethod
    def read_sourceconfiguration():
        """
        This function reads sources.cfg in the main folder for configuration
        variables for sources.
        :return: a ConfigParser object of sources.cfg
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = current_dir + '/../sources.cfg'
        # [TODO]: location of sources.cfg should be softcoded eventually
        if not os.path.isfile(config_path):
            try:
                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
            except IOError:
                print "WARNING: Source configuration couldn't be found and couldn't be created."
        config = ConfigParser.ConfigParser()
        config.read(config_path)
        return config

    @staticmethod
    def get_section(config, sourcename):
        """
        This function reads the config section labeled by the variable sourcename and
        tests whether the reliability variable is set; if not, it is set to an empty string.
        Returns the default section if the labeled config section does not exist.
        :param config: a ConfigParser object
        :param sourcename: the name of the section to be read
        :return: a dictionary of the section in the config labeled in sourcename
        """
        section = dict()
        if config.has_section(sourcename):
            section = dict(config.items(sourcename))
        elif config.defaults():
            section = config.defaults()
        if 'reliability' not in section:
            print 'WARNING: Reliability not set for %s' % sourcename
            section['reliability'] = ''
        return section
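Below is a minimal usage sketch of the Configurator class, assuming it is driven with values the Fourmi CLI would normally supply. The compound name, source section name, and log file path are hypothetical placeholders, not values taken from the project.

# Usage sketch (hypothetical compound, source name and logfile).
configurator = Configurator()

# Enable INFO-level logging and write it to a hypothetical log file.
configurator.set_logging(logfile="fourmi.log", verbose=2)

# Keeping the default '<compound>.*format*' pattern derives the output
# filename from the compound, here "methane.csv" with FEED_FORMAT "csv".
configurator.set_output(filename='<compound>.*format*', fileformat="csv",
                        compound="methane")

# Read sources.cfg and fetch the settings for one (assumed) source section.
config = Configurator.read_sourceconfiguration()
section = Configurator.get_section(config, "ChemSpider")
print section['reliability']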