A web scraper built to search for specific information on a given compound (and its pseudonyms)
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Added a pipeline to replace None values with empty strings

+19 -2
+16
FourmiCrawler/pipelines.py
# Diff context (FourmiCrawler/pipelines.py): this class is inserted after the
# existing `import re` / `from scrapy.exceptions import DropItem` lines and
# before `class DuplicatePipeline(object):`.
class RemoveNonePipeline(object):
    """Item pipeline that replaces ``None`` field values with empty strings."""

    def process_item(self, item, spider):
        """
        Replace every None value in the item with an empty string.

        The item is modified in place; this pipeline never drops an item.

        :param item: The incoming item
        :param spider: The spider which scraped the item (unused here)
        :return: The same item, with each None value replaced by ""
        """
        # Only existing keys are reassigned, so the mapping does not change
        # size while it is being iterated.
        for key in item:
            if item[key] is None:
                item[key] = ""
        return item
+3 -2
FourmiCrawler/settings.py
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
# Pipeline priorities. Before this change AttributeSelectionPipeline was at
# 100 and DuplicatePipeline at 200; both were shifted so RemoveNonePipeline
# can take the lowest value.
# NOTE(review): per the Scrapy docs, lower values run first - confirm that
# None-replacement is meant to precede attribute selection and de-duplication.
ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.RemoveNonePipeline': 100,
    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
}
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'