A web scraper built to search specific information for a given compound (and its pseudonyms)
at develop 2.5 kB view raw
import inspect
import os
import re

from FourmiCrawler.sources.source import Source
from utils.configurator import Configurator


class SourceLoader:
    """
    Discovers the Source subclasses found in a directory of Python modules, instantiates
    each of them with its configuration section and keeps the instances in `self.sources`.
    The list can afterwards be narrowed down by class name with `include` and `exclude`.
    """

    def __init__(self, rel_dir="../FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        The source configuration is read through Configurator.read_sourceconfiguration() and
        the section named after each source class is passed to that class's constructor.
        :param rel_dir: A relative path to a directory, relative to the directory of this file.
        """
        # The list is created per instance: a class-level list would be shared by every
        # SourceLoader, so a second loader would start with the first loader's sources.
        self.sources = []

        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_dir)
        # Dotted package name derived from the relative directory,
        # e.g. "../FourmiCrawler/sources" -> "FourmiCrawler.sources".
        package = rel_dir.replace("../", "").replace("/", ".")
        known_parser = set()

        config = Configurator.read_sourceconfiguration()

        # os.listdir gives no ordering guarantee; sorting keeps the source order reproducible.
        module_names = sorted(f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py')
        for py in module_names:
            mod = __import__('.'.join([package, py]), fromlist=[py])
            for _, cls in inspect.getmembers(mod, inspect.isclass):
                # A class imported into several modules shows up more than once;
                # known_parser makes sure it is only instantiated a single time.
                if issubclass(cls, Source) and cls not in known_parser:
                    sourcecfg = Configurator.get_section(config, cls.__name__)
                    self.sources.append(cls(sourcecfg))
                    known_parser.add(cls)

    def include(self, source_names):
        """
        This function excludes all sources that don't match the given regular expressions.
        A source is kept when its class name matches at least one expression; re.match is
        used, so an expression has to match from the start of the class name.
        The relative order of the remaining sources is preserved.
        :param source_names: A list of regular expression (strings)
        """
        self.sources = [src for src in self.sources
                        if any(re.match(name, src.__class__.__name__) for name in source_names)]

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        A source is dropped when its class name matches at least one expression; re.match is
        used, so an expression has to match from the start of the class name.
        :param source_names: A list of regular expression (strings)
        """
        self.sources = [src for src in self.sources
                        if not any(re.match(name, src.__class__.__name__) for name in source_names)]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        Every source contributes one line of the form "Source: <class name> - URI: <website>".
        :return: a string with all available sources.
        """
        return "".join(
            "Source: " + src.__class__.__name__ + " - " + "URI: " + src.website + "\n"
            for src in self.sources
        )