#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
class FourmiPipeline(object):
    """Item pipeline that drops exact duplicate results.

    A result is identified by its (attribute, value) pair; the set of
    pairs seen so far is kept in memory for the lifetime of the crawl.
    """

    def __init__(self):
        # (attribute, value) tuples already emitted during this crawl.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process the items so exact doubles are dropped.

        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: :raise DropItem: Returns the item if unique or drops it if it's already known
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
# --- Scrapy/settings.py ---
SPIDER_MODULES = ['Scrapy.spiders']
NEWSPIDER_MODULE = 'Scrapy.spiders'
# Register the duplicate-dropping pipeline; 100 is its execution order
# (lower values run earlier among enabled pipelines).
ITEM_PIPELINES = {
    'Scrapy.pipelines.FourmiPipeline': 100
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'