A web scraper built to search for specific information on a given compound (and its pseudonyms)


+36 -23
+2 -6
.travis.yml
···
 6  6   # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 7  7   install:
 8  8     - pip install Scrapy docopt
 9     -   - pip install coveralls
10  9
11 10   # command to run tests, e.g. python setup.py test
12 11   script:
13     -   - nosetests --with-coverage --cover-package=FourmiCrawler tests
   12 +   - nosetests tests
14 13
15 14   notifications:
16     -   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
17     -
18     - after_success:
19     -   coveralls --verbose
   15 +   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+3 -3
FourmiCrawler/spider.py
···
35 35               return source.parse(response)
36 36           return None
37 37
38    -     def get_synonym_requests(self, compound, force=False):
   38 +     def get_synonym_requests(self, compound):
39 39           """
40 40           A function that generates new Scrapy Request for each source given a new synonym of a compound.
41 41           :param compound: A compound name
42 42           :return: A list of Scrapy Request objects
43 43           """
44 44           requests = []
45    -         if force or compound not in self.synonyms:
   45 +         if compound not in self.synonyms:
46 46               self.synonyms.add(compound)
47 47               for parser in self._sources:
48 48                   parser_requests = parser.new_compound_request(compound)
···
57 57           """
58 58           requests = []
59 59           for synonym in self.synonyms:
60    -             requests.extend(self.get_synonym_requests(synonym, force=True))
   60 +             requests.extend(self.get_synonym_requests(synonym))
61 61           return requests
62 62
63 63       def add_sources(self, sources):
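With the force flag gone, start_requests routes every stored synonym back through the same membership check in get_synonym_requests; since each synonym was added to self.synonyms when it was first seen, the guard now filters those re-submissions out, which is presumably why the start_requests test further down only asserts a non-None return. A minimal sketch of the set-based deduplication pattern (the class and the string stand-in for a Scrapy Request are illustrative, not Fourmi's actual API):

class DedupSpider(object):
    """Issues at most one batch of requests per distinct compound name."""

    def __init__(self):
        self.synonyms = set()

    def get_synonym_requests(self, compound):
        # Only unseen compounds generate requests; repeats fall through.
        requests = []
        if compound not in self.synonyms:
            self.synonyms.add(compound)
            requests.append("request for %s" % compound)  # stand-in for a Scrapy Request
        return requests


spider = DedupSpider()
assert len(spider.get_synonym_requests("methanol")) == 1
assert len(spider.get_synonym_requests("methanol")) == 0  # deduplicated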
+2 -2
fourmi.py
···
  1     - #!/usr/bin/env python
      1 + # !/usr/bin/env python
  2   2   """
  3   3   Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
  4   4
···
102 102
103 103   # The start for the Fourmi Command Line interface.
104 104   if __name__ == '__main__':
105     -     arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2')
    105 +     arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
106 106       loader = SourceLoader()
107 107
108 108       if arguments["--include"]:
+18
setup.py
···
    1 + import sys
    2 + from cx_Freeze import setup, Executable
    3 +
    4 + # After running the setup file (python setup.py build) the scrapy/VERSION file has to be manually put into the
    5 + # library.zip, also the FourmiCrawler map has to be copied to both the library and the exe.win32-2.7 folder. after
    6 + # putting the files in the library the library has to be zipped and replace the old library.
    7 + # Dependencies are automatically detected, but it might need fine tuning.
    8 + build_exe_options = {"packages": ["os", "scrapy", "lxml", "w3lib", "pkg_resources", "zope.interface", "twisted.internet"], "excludes": []}
    9 +
   10 + # GUI applications require a different base on Windows (the default is for a
   11 + # console application).
   12 + base = None
   13 +
   14 + setup( name = "Scrapy",
   15 +        version = "0.1",
   16 +        description = "My GUI application!",
   17 +        options = {"build_exe": build_exe_options},
   18 +        executables = [Executable("fourmi.py", base=base)])
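The comment block at the top of setup.py describes the post-build fix-up as a manual chore. A hypothetical helper along these lines could script it; BUILD_DIR and the append-to-zip approach (rather than unzipping and rezipping the library by hand) are assumptions for illustration, not part of this commit:

import os
import shutil
import zipfile

import scrapy

# Assumption: the default cx_Freeze output folder for a win32/Python 2.7 build.
BUILD_DIR = os.path.join("build", "exe.win32-2.7")

# Append scrapy/VERSION to library.zip so the frozen Scrapy can read its version.
version_file = os.path.join(os.path.dirname(scrapy.__file__), "VERSION")
with zipfile.ZipFile(os.path.join(BUILD_DIR, "library.zip"), "a") as library:
    library.write(version_file, arcname="scrapy/VERSION")

# Copy the FourmiCrawler package next to the frozen executable so it stays importable.
shutil.copytree("FourmiCrawler", os.path.join(BUILD_DIR, "FourmiCrawler"))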
+10 -8
sourceloader.py
···
 1  1   import inspect
    2 + import sys
 2  3   import os
 3  4   import re
···
 9 10       sources = []
10 11
11 12       def __init__(self, rel_dir="FourmiCrawler/sources"):
12    -         """
13    -         The initiation of a SourceLoader, selects and indexes a directory for usable sources.
14    -         :param rel_dir: A relative path to a directory.
15    -         """
16    -         path = os.path.dirname(os.path.abspath(__file__))
   13 +
   14 +         if hasattr(sys,'frozen'):
   15 +             path = os.path.dirname(sys.executable)
   16 +         else:
   17 +             path = os.path.dirname(os.path.abspath(__file__))
   18 +
17 19           path += "/" + rel_dir
18 20           known_parser = set()
19 21
20 22           for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
21    -             mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
   23 +             mod = __import__('.'.join([rel_dir.replace('/', "."), py]), fromlist=[py])
22 24               classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
23 25               for cls in classes:
24 26                   if issubclass(cls, Source) and cls not in known_parser:
25    -                     self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
26    -                     known_parser.add(cls)
   27 +                     self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
   28 +                     # known_parser.add(cls)
27 29
28 30       def include(self, source_names):
29 31           """
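The new hasattr(sys, 'frozen') branch is the usual way to detect a cx_Freeze build: the frozen executable sets sys.frozen, and __file__ would point inside library.zip rather than at a real directory, so resources have to be resolved relative to sys.executable instead. A self-contained sketch of that pattern, assuming the same frozen-vs-source layout as above:

import os
import sys

def resource_base():
    """Directory to resolve bundled resources against, frozen or not."""
    if hasattr(sys, 'frozen'):
        # cx_Freeze executable: resources sit next to the .exe.
        return os.path.dirname(sys.executable)
    # Running from source: resources sit next to this module.
    return os.path.dirname(os.path.abspath(__file__))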
+1 -4
tests/test_spider.py
···
43 43
44 44           src2 = ChemSpider()
45 45           self.spi.add_source(src2)
46    -         requests = self.spi.start_requests()
47    -         self.assertGreater(len(requests), 0)
48    -         self.assertIsInstance(requests[0], Request)
49    -
   46 +         self.assertIsNotNone(self.spi.start_requests())
50 47
51 48       def test_synonym_requests(self):
52 49           # A test for the synonym request function