A web scraper built to search for specific information about a given compound (and its pseudonyms)
import inspect
import sys
import os
import re

from FourmiCrawler.sources.source import Source


class SourceLoader:
    """
    Discovers the Source subclasses defined in a directory of Python modules,
    instantiates them and keeps the resulting instances in ``self.sources``.
    """
    # Kept for backward compatibility with code that reads the class
    # attribute; every instance gets its own list in __init__.
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """
        Import every module found in the given directory and instantiate each
        Source subclass exactly once.
        :param rel_dir: A '/'-separated path to the directory with the source
                        modules, relative to this file (or to the executable
                        when running from a frozen build).
        """
        # Use a per-instance list: appending to the class-level list would
        # leak sources between separate SourceLoader instances.
        self.sources = []

        if hasattr(sys, 'frozen'):
            # Frozen builds set sys.frozen; resolve against the executable.
            path = os.path.dirname(sys.executable)
        else:
            path = os.path.dirname(os.path.abspath(__file__))

        path += "/" + rel_dir
        package = rel_dir.replace('/', ".")
        known_parser = set()

        module_names = [f[:-3] for f in os.listdir(path)
                        if f.endswith('.py') and f != '__init__.py']
        for py in module_names:
            mod = __import__('.'.join([package, py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod)
                       if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                # issubclass(Source, Source) is true, so the base class is
                # picked up as well whenever a module exposes it.
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    # Remember the class: it is visible in every module that
                    # imports it and must not be instantiated again.
                    known_parser.add(cls)

    def include(self, source_names):
        """
        This function keeps only the sources whose class name matches at least
        one of the given regular expressions; all other sources are dropped.
        The relative order of the remaining sources is preserved.
        :param source_names: A list of regular expression (strings)
        """
        patterns = [re.compile(name) for name in source_names]
        self.sources = [src for src in self.sources
                        if any(p.match(src.__class__.__name__) for p in patterns)]

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        patterns = [re.compile(name) for name in source_names]
        self.sources = [src for src in self.sources
                        if not any(p.match(src.__class__.__name__) for p in patterns)]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources, one "Source: ... - URI: ..." line per source.
        """
        return "".join(
            "Source: " + src.__class__.__name__ + " - " + "URI: " + src.website + "\n"
            for src in self.sources
        )