Added a pipeline to replace None values with empty strings

dekker.one / Fourmi

fork

A web scraper built to search for specific information on a given compound (and its pseudonyms)

fork

+19 -2

2 changed files

expand all

FourmiCrawler

pipelines.py

settings.py

+16

FourmiCrawler/pipelines.py

··· 5 5 import re 6 6 from scrapy.exceptions import DropItem 7 7 8 + class RemoveNonePipeline(object): 9 + 10 + def __init__(self): 11 + self.known_values = set() 12 + 13 + def process_item(self, item, spider): 14 + """ 15 + Processing the items so None values are replaced by empty strings 16 + :param item: The incoming item 17 + :param spider: The spider which scraped the spider 18 + :return: :raise DropItem: Returns the item if unique or drops them if it's already known 19 + """ 20 + for key in item: 21 + if item[key] is None: 22 + item[key] = "" 23 + return item 8 24 9 25 class DuplicatePipeline(object): 10 26

+3 -2

FourmiCrawler/settings.py

··· 11 11 SPIDER_MODULES = ['FourmiCrawler'] 12 12 NEWSPIDER_MODULE = 'FourmiCrawler' 13 13 ITEM_PIPELINES = { 14 - 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, 15 - 'FourmiCrawler.pipelines.DuplicatePipeline': 200, 14 + "FourmiCrawler.pipelines.RemoveNonePipeline": 100, 15 + 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200, 16 + 'FourmiCrawler.pipelines.DuplicatePipeline': 300, 16 17 } 17 18 FEED_URI = 'results.json' 18 19 FEED_FORMAT = 'jsonlines'

Configure Feed

Configure Feed