A web scraper built to search for specific information about a given compound (and its pseudonyms)
1import inspect
2import os
3import re
4
5from FourmiCrawler.sources.source import Source
6from utils.configurator import Configurator
7
8
class SourceLoader:
    """
    Discovers the Source classes in a directory, instantiates them and lets
    the caller narrow the resulting list down by class name.
    """

    # Class-level default, kept only so that `SourceLoader.sources` still
    # resolves. It is never mutated: every instance binds its own list in
    # __init__, so loaders no longer leak sources into each other.
    sources = []

    def __init__(self, rel_dir="../FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        Also loads a configuration file for Sources and passes the arguments in
        the named section (the section named after the class) to the source.
        :param rel_dir: A relative path to a directory, relative to the directory of this file.
        """
        # Per-instance list; appending to the class-level list would make
        # every new SourceLoader add its sources to all other loaders.
        self.sources = []

        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_dir)
        # The dotted package name is derived from the relative directory,
        # e.g. "../FourmiCrawler/sources" -> "FourmiCrawler.sources".
        package = rel_dir.replace("../", "").replace("/", ".")
        known_parser = set()

        config = Configurator.read_sourceconfiguration()

        # Sorted so that the order of the loaded sources does not depend on
        # the (arbitrary) order in which the OS lists the directory.
        module_names = sorted(f[:-3] for f in os.listdir(path)
                              if f.endswith('.py') and f != '__init__.py')
        for py in module_names:
            mod = __import__('.'.join([package, py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                # A class imported into several modules shows up in each of
                # them; known_parser makes sure it is instantiated only once.
                if issubclass(cls, Source) and cls not in known_parser:
                    sourcecfg = Configurator.get_section(config, cls.__name__)
                    self.sources.append(cls(sourcecfg))
                    known_parser.add(cls)

    def include(self, source_names):
        """
        This function keeps only the sources whose class name matches at least one of the
        given regular expressions; all other sources are dropped. The relative order of the
        remaining sources is preserved.
        :param source_names: A list of regular expression (strings)
        """
        # Materialised once because the patterns are re-used for every source.
        patterns = list(source_names)
        self.sources = [src for src in self.sources
                        if any(re.match(name, src.__class__.__name__) for name in patterns)]

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        # Materialised once because the patterns are re-used for every source.
        patterns = list(source_names)
        self.sources = [src for src in self.sources
                        if not any(re.match(name, src.__class__.__name__) for name in patterns)]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources, one "Source: <class> - URI: <website>" line each.
        """
        return "".join(
            "Source: " + src.__class__.__name__ + " - " + "URI: " + src.website + "\n"
            for src in self.sources
        )