sourceloader.py at assignees · dekker.one/Fourmi

dekker.one / Fourmi

A web scraper built to search for specific information about a given compound (and its pseudonyms)

Fourmi / sourceloader.py

at assignees 1.6 kB view raw

 1import inspect
 2import os
 3import re
 4from FourmiCrawler.sources.source import Source
 5
 6
 7class SourceLoader:
 8    sources = []
 9
10    def __init__(self, rel_dir="FourmiCrawler/sources"):
11        path = os.path.dirname(os.path.abspath(__file__))
12        path += "/" + rel_dir
13        known_parser = set()
14
15        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
16            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
17            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
18            for cls in classes:
19                if issubclass(cls, Source) and cls not in known_parser:
20                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
21                    known_parser.add(cls)
22
23    def include(self, source_names):
24        new = set()
25        for name in source_names:
26            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
27        self.sources = list(new)
28
29    def exclude(self, source_names):
30        exclude = []
31        for name in source_names:
32            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
33        self.sources = [src for src in self.sources if src not in exclude]
34
35    def __str__(self):
36        string = ""
37        for src in self.sources:
38            string += "Source: " + src.__class__.__name__
39            string += " - "
40            string += "URI: " + src.website + "\n"
41        return string