fmartingr
/
shelfzilla
Archived
1
0
Fork 0
This repository has been archived on 2021-06-29. You can view files and clone it, but cannot push or open issues or pull requests.
shelfzilla/crawler/crawler/pipelines.py

48 lines
1.1 KiB
Python

# -*- coding: utf-8 -*-
from scrapy.exceptions import DropItem
class DuplicatesPipeline(object):
def __init__(self):
self.seen = set()
def process_item(self, item, spider):
if item['uuid'] in self.seen:
raise DropItem("Duplicate found: %s" % item)
else:
self.seen.add(item['uuid'])
return item
class CheckLanguagePipeline(object):
def process_item(self, item, spider):
if u'CATALÁN' in item['uuid'].upper():
raise DropItem("Not supported language: %s" % item)
else:
return item
class CleanFieldsPipeline(object):
fields = ('isbn', 'price', )
def clean_isbn(self, item):
isbn = item['isbn'].replace('-', '')
if len(isbn) < 13:
isbn = isbn[0:10]
return isbn
def clean_price(self, item):
price = float(item['price'][:-1].replace(',', '.'))
return price
def process_item(self, item, spider):
for field in self.fields:
if field in item:
item[field] = getattr(self, 'clean_%s' % field)(item)
return item