A web scraper built to search for specific information on a given compound (and its pseudonyms)
at 0.4.2 2.3 kB view raw
import importlib
import inspect
import os
import re

from FourmiCrawler.sources.source import Source


class SourceLoader:
    """
    Discovers the Source subclasses living in a package directory, instantiates them and
    offers regular-expression based filtering on their class names.
    """

    # Class-level fallback kept so that ``SourceLoader.sources`` still resolves. It is never
    # mutated: every instance gets its own list in __init__, so loaders do not share state.
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        Every ``.py`` file in the directory (except ``__init__.py``) is imported and each class
        in it that is a subclass of Source is instantiated once.
        :param rel_dir: A relative path to a directory, using "/" as separator. The path is
                        resolved against the directory containing this file and is also used,
                        with "/" replaced by ".", as the package name for importing.
        """
        # Per-instance list: appending to the class attribute would leak sources between
        # loaders and duplicate them on every new SourceLoader().
        self.sources = []

        base_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(base_dir, rel_dir)
        # Strip stray slashes so "pkg/sub/" does not become the invalid module name "pkg.sub.".
        package = rel_dir.strip("/").replace("/", ".")
        known_parser = set()

        # os.listdir returns entries in arbitrary order; sort for a reproducible source order.
        module_names = sorted(f[:-3] for f in os.listdir(path)
                              if f.endswith('.py') and f != '__init__.py')
        for py in module_names:
            mod = importlib.import_module(package + "." + py)
            for _, cls in inspect.getmembers(mod, inspect.isclass):
                # issubclass(Source, Source) is true, so the base class itself is picked up
                # as well whenever a scanned module exposes it. known_parser prevents a class
                # that is visible from several modules from being instantiated twice.
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    known_parser.add(cls)

    @staticmethod
    def _compile_all(source_names):
        """
        Compile every given regular expression once, so it is not re-parsed per source.
        :param source_names: An iterable of regular expressions (strings)
        :return: a list of compiled patterns.
        """
        return [re.compile(name) for name in source_names]

    @staticmethod
    def _matches_any(patterns, src):
        """
        Check whether the class name of a source matches (from its start) any pattern.
        :param patterns: A list of compiled regular expressions
        :param src: A source instance
        :return: True if at least one pattern matches, False otherwise.
        """
        class_name = src.__class__.__name__
        return any(pattern.match(class_name) for pattern in patterns)

    def include(self, source_names):
        """
        This function excludes all sources that don't match the given regular expressions.
        The remaining sources keep their original relative order.
        :param source_names: A list of regular expression (strings)
        """
        patterns = self._compile_all(source_names)
        self.sources = [src for src in self.sources if self._matches_any(patterns, src)]

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        patterns = self._compile_all(source_names)
        self.sources = [src for src in self.sources if not self._matches_any(patterns, src)]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources, one "Source: <name> - URI: <website>" line each.
        """
        return "".join(
            "Source: " + src.__class__.__name__ + " - " + "URI: " + src.website + "\n"
            for src in self.sources
        )