A web scraper built to search for specific information about a given compound (and its pseudonyms).
import inspect
import os
import re

from FourmiCrawler.sources.source import Source


class SourceLoader:
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """Discover and instantiate every Source subclass found in rel_dir."""
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()

        # Import every .py module in the sources directory (except __init__.py)
        # and collect one instance of each Source subclass it defines.
        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    known_parser.add(cls)

    def include(self, source_names):
        """Keep only the sources whose class name matches one of the given regular expressions."""
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
        """Drop the sources whose class name matches one of the given regular expressions."""
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
        """Return one "Source: <class> - URI: <website>" line per loaded source."""
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
            string += " - "
            string += "URI: " + src.website + "\n"
        return string
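
A minimal usage sketch (an illustration, not part of the repository): it assumes the class above is saved as sourceloader.py at the project root, next to the FourmiCrawler package, so that the default rel_dir resolves. The arguments to include and exclude are regular expressions matched against the source class names; the patterns below are only examples.

from sourceloader import SourceLoader  # hypothetical import path; adjust to where the class actually lives

loader = SourceLoader()            # instantiates every Source subclass found in FourmiCrawler/sources
loader.exclude(['Wikipedia.*'])    # example pattern: drop sources whose class name matches
# loader.include(['ChemSpider'])   # alternatively, keep only the sources matching a pattern
print(loader)                      # prints one "Source: <name> - URI: <website>" line per remaining source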