A web scraper built to search specific information for a given compound (and its pseudonyms)
1import inspect
2import sys
3import os
4import re
5
6from FourmiCrawler.sources.source import Source
7
8
class SourceLoader:
    """
    Imports the modules found in a sources directory and keeps one instance of every
    Source class they expose. The instances are stored in ``sources`` and can be
    narrowed down afterwards with include() and exclude().
    """
    # Class-level default kept so that ``SourceLoader.sources`` still resolves.
    # __init__ gives every instance its own list and never appends to this one.
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """
        Import every module in the given directory and instantiate the Source classes found there.
        :param rel_dir: A '/'-separated path to the sources directory, relative to the directory of
        this file (or of the executable when sys.frozen is set). It is also converted into the dotted
        package name that is used to import the modules.
        """
        # Per-instance list: appending to the class attribute would make all loaders
        # share (and keep growing) the same list.
        self.sources = []

        if hasattr(sys, 'frozen'):
            # sys.frozen is present (presumably a bundled executable), so resolve the
            # directory relative to the executable instead of this file.
            base = os.path.dirname(sys.executable)
        else:
            base = os.path.dirname(os.path.abspath(__file__))

        path = os.path.join(base, rel_dir)
        package = rel_dir.replace('/', '.')
        known_parser = set()

        for filename in os.listdir(path):
            if not filename.endswith('.py') or filename == '__init__.py':
                continue
            module_name = filename[:-3]
            mod = __import__('.'.join([package, module_name]), fromlist=[module_name])
            for attr_name in dir(mod):
                cls = getattr(mod, attr_name)
                if not inspect.isclass(cls) or not issubclass(cls, Source):
                    continue
                # A class imported into several modules shows up in each of them;
                # only instantiate it the first time it is seen.
                if cls in known_parser:
                    continue
                known_parser.add(cls)
                self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?

    @staticmethod
    def _matches_any(src, patterns):
        """
        Tell whether the class name of a source matches at least one regular expression.
        :param src: A source instance.
        :param patterns: A list of regular expression (strings)
        :return: True if re.match succeeds for any of the patterns, False otherwise.
        """
        class_name = src.__class__.__name__
        return any(re.match(pattern, class_name) for pattern in patterns)

    def include(self, source_names):
        """
        This function excludes all sources that don't match the given regular expressions.
        The remaining sources keep their original relative order.
        :param source_names: A list of regular expression (strings)
        """
        # Materialise once so a one-shot iterable can be reused for every source.
        patterns = list(source_names)
        self.sources = [src for src in self.sources if self._matches_any(src, patterns)]

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        The remaining sources keep their original relative order.
        :param source_names: A list of regular expression (strings)
        """
        # Materialise once so a one-shot iterable can be reused for every source.
        patterns = list(source_names)
        self.sources = [src for src in self.sources if not self._matches_any(src, patterns)]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources, one "Source: <name> - URI: <website>" line each.
        """
        return "".join(
            "Source: " + src.__class__.__name__ + " - " + "URI: " + src.website + "\n"
            for src in self.sources
        )