A web scraper built to search for specific information about a given compound (and its pseudonyms)
import importlib
import inspect
import os
import re

from FourmiCrawler.sources.source import Source


class SourceLoader:
    """Discover the ``Source`` parsers in a package directory and filter them by name.

    On construction every ``.py`` file (except ``__init__.py``) in the given
    directory is imported, and each class found in those modules that is a
    subclass of ``Source`` is instantiated once. The instances are stored in
    ``self.sources`` and can be narrowed down with ``include`` / ``exclude``.
    """

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """Import every module in ``rel_dir`` and instantiate the parsers found.

        :param rel_dir: Directory holding the source modules, resolved against
            the directory of this file and written with ``/`` separators. It
            is also converted into the dotted package name used for importing.
        """
        # Keep the list on the instance: a class-level list is shared by all
        # loaders, so every new loader would append its parsers to the same
        # list again.
        self.sources = []

        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_dir)
        package = rel_dir.replace("/", ".")
        known_parser = set()

        # os.listdir() yields entries in arbitrary order; sorting makes the
        # order of self.sources reproducible between runs.
        for filename in sorted(os.listdir(path)):
            if not filename.endswith('.py') or filename == '__init__.py':
                continue
            mod = importlib.import_module('.'.join([package, filename[:-3]]))
            for attr in dir(mod):
                cls = getattr(mod, attr)
                if not inspect.isclass(cls):
                    continue
                # The same class can be visible in several modules (through
                # imports), so track which ones were already instantiated.
                # NOTE: issubclass() is also true for Source itself.
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    known_parser.add(cls)

    def include(self, source_names):
        """Keep only the sources whose class name matches one of the patterns.

        The relative order of the remaining sources is preserved.

        :param source_names: Iterable of regular expressions; each is applied
            with ``match`` (anchored at the start) to the class name of every
            source.
        """
        patterns = [re.compile(name) for name in source_names]
        self.sources = [
            src for src in self.sources
            if any(pattern.match(src.__class__.__name__) for pattern in patterns)
        ]

    def exclude(self, source_names):
        """Drop every source whose class name matches one of the patterns.

        :param source_names: Iterable of regular expressions; each is applied
            with ``match`` (anchored at the start) to the class name of every
            source.
        """
        patterns = [re.compile(name) for name in source_names]
        self.sources = [
            src for src in self.sources
            if not any(pattern.match(src.__class__.__name__) for pattern in patterns)
        ]

    def __str__(self):
        """Return one ``Source: <class name> - URI: <website>`` line per source."""
        return "".join(
            "Source: " + src.__class__.__name__ + " - " + "URI: " + src.website + "\n"
            for src in self.sources
        )