From 41d074b102a47edd935733353ff44983e54c5b90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felipe=20Marti=CC=81n?=
Date: Tue, 7 Oct 2014 19:22:21 +0200
Subject: [PATCH] Crawlers [norma editorial] Base

---
 crawler/crawler/__init__.py               |   0
 crawler/crawler/items.py                  |  32 +++++++
 crawler/crawler/pipelines.py              |  47 +++++++++
 crawler/crawler/settings.py               |  38 ++++++++
 crawler/crawler/spiders/__init__.py       |   4 +
 crawler/crawler/spiders/normaeditorial.py | 111 ++++++++++++++++++++++
 crawler/scrapy.cfg                        |  11 +++
 crawler/scrapy.log                        |   0
 8 files changed, 243 insertions(+)
 create mode 100644 crawler/crawler/__init__.py
 create mode 100644 crawler/crawler/items.py
 create mode 100644 crawler/crawler/pipelines.py
 create mode 100644 crawler/crawler/settings.py
 create mode 100644 crawler/crawler/spiders/__init__.py
 create mode 100644 crawler/crawler/spiders/normaeditorial.py
 create mode 100644 crawler/scrapy.cfg
 create mode 100644 crawler/scrapy.log

diff --git a/crawler/crawler/__init__.py b/crawler/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py
new file mode 100644
index 0000000..6b1a97a
--- /dev/null
+++ b/crawler/crawler/items.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class VolumeItem(scrapy.Item):
+    uuid = scrapy.Field()
+    url = scrapy.Field()
+    publisher_name = scrapy.Field()
+    series_name = scrapy.Field()
+    number = scrapy.Field()
+    name = scrapy.Field()
+    cover = scrapy.Field()
+    format = scrapy.Field()
+    size = scrapy.Field()
+    pages = scrapy.Field()
+    color = scrapy.Field()
+
+    isbn = scrapy.Field()
+    isbn_10 = scrapy.Field()
+    isbn_13 = scrapy.Field()
+
+    price = scrapy.Field()
+
+    hide = scrapy.Field()
+    is_pack = scrapy.Field()
+    not_released = scrapy.Field()
diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py
new file mode 100644
index 0000000..2f5f9bc
--- /dev/null
+++ b/crawler/crawler/pipelines.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from scrapy.exceptions import DropItem
+
+
+class DuplicatesPipeline(object):
+    def __init__(self):
+        self.seen = set()
+
+    def process_item(self, item, spider):
+        if item['uuid'] in self.seen:
+            raise DropItem("Duplicate found: %s" % item)
+        else:
+            self.seen.add(item['uuid'])
+            return item
+
+
+class CheckLanguagePipeline(object):
+    def process_item(self, item, spider):
+        if u'CATALÁN' in item['uuid'].upper():
+            raise DropItem("Unsupported language: %s" % item)
+        else:
+            return item
+
+
+class CleanFieldsPipeline(object):
+    fields = ('isbn', 'price', )
+
+    def clean_isbn(self, item):
+        isbn = item['isbn'].replace('-', '')
+
+        if len(isbn) < 13:
+            isbn = isbn[0:10]
+
+        return isbn
+
+    def clean_price(self, item):
+        price = float(item['price'][:-1].replace(',', '.'))
+
+        return price
+
+
+    def process_item(self, item, spider):
+        for field in self.fields:
+            if field in item:
+                item[field] = getattr(self, 'clean_%s' % field)(item)
+
+        return item
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
new file mode 100644
index 0000000..8f32894
--- /dev/null
+++ b/crawler/crawler/settings.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for crawler project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#

+BOT_NAME = 'crawler'
+
+SPIDER_MODULES = ['crawler.spiders']
+NEWSPIDER_MODULE = 'crawler.spiders'
+
+ITEM_PIPELINES = {
+    # Drop duplicate items
+    # Drop items in languages other than Spanish
+    # Clean fields
+    'crawler.pipelines.DuplicatesPipeline': 100,
+    'crawler.pipelines.CheckLanguagePipeline': 200,
+    'crawler.pipelines.CleanFieldsPipeline': 300,
+    # ...
+    'scrapycouchdb.CouchDBPipeline': 1000,
+}
+
+
+
+COUCHDB_SERVER = 'http://127.0.0.1:5984/'
+COUCHDB_DB = 'norma'
+COUCHDB_UNIQ_KEY = 'uuid'
+COUCHDB_IGNORE_FIELDS = []
+
+LOG_LEVEL = 'ERROR'
+LOG_FILE = 'scrapy.log'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# USER_AGENT = 'Test Crawler (+http://www.yourdomain.com)'
diff --git a/crawler/crawler/spiders/__init__.py b/crawler/crawler/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/crawler/crawler/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/crawler/crawler/spiders/normaeditorial.py b/crawler/crawler/spiders/normaeditorial.py
new file mode 100644
index 0000000..bd65956
--- /dev/null
+++ b/crawler/crawler/spiders/normaeditorial.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors import LinkExtractor
+from crawler.items import VolumeItem
+
+
+def join_strings(string_list, convert_to=None):
+    def convert(value):
+        if convert_to:
+            return convert_to(value)
+        return value
+    return convert("".join(string_list))
+
+
+class NormaeditorialSpider(CrawlSpider):
+    name = "normaeditorial"
+    allowed_domains = ["normaeditorial.com", "www.normaeditorial.com"]
+    start_urls = (
+        'http://www.normaeditorial.com/catalogo.asp?T/5/0/0/Catalogo_Manga',
+    )
+    rules = (
+        # Catalog details
+        # Series: /catalogo.asp?S/3203/0/0/fullmetal_alchemist_kanzenban
+        # Page: /catalogo.asp?T/5/0/0,D/
+        Rule(LinkExtractor(
+            allow=(
+                r'\/catalogo\.asp\?(\w)\/5\/\d\/\d\/[\w\d\_]+',
+                r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+\/',
+                r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+',
+            ),
+            allow_domains=allowed_domains,
+            canonicalize=False
+            )
+        ),
+        # Next releases
+        Rule(LinkExtractor(
+            allow=(r"\/blogmanga\/blog\/\?page_id\=275",),
+            canonicalize=False
+            ),
+            callback="parse_next_releases"
+        ),
+        # Volume details
+        # /ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08
+        Rule(LinkExtractor(
+            allow=(
+                r'\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)',
+            ),
+            allow_domains=allowed_domains,
+            canonicalize=False
+            ),
+            callback='parse_volume'
+        ),
+    )
+
+    _publisher_name = 'Norma Editorial'
+
+    def parse_start_page(self, response):
+        pass
+
+    def parse_volume(self, response):
+        item = VolumeItem()
+
+        item['url'] = response.url
+        item['publisher_name'] = self._publisher_name
+        item['series_name'] = response.xpath(
+            '//div[@id="basic_info"]/h2[contains(., "Serie")]/a/text()'
+        ).extract()[0].strip()
+
+        pairs = (
+            ('size', 'Tama'),
+            ('color', 'Color'),
+            ('isbn', 'ISBN'),
+            ('price', 'PVP'),
+        )
+
+        not_released = response.xpath('//div[@id="basic_info"]/h2/img[@alt="proximamente"]')
+
+        if len(not_released) > 0:
+            item['not_released'] = True
+
+        for k, v in pairs:
+            try:
+                item[k] = response.xpath(
+                    '//div[@id="basic_info"]/h3[contains(., "{}")]/span/text()'.format(v)
+                ).extract()[0].strip()
+            except IndexError:
+                pass
+
+        big_name = response.xpath('//div[@id="basic_info"]/h1/text()')\
+            .extract()[0].strip()
+
+        try:
+            item['number'] = int(big_name.replace(item['series_name'], '').strip())
+        except ValueError:
+            item['name'] = big_name.replace(item['series_name'], '').strip()
+
+        item['cover'] = response.xpath('//div[@id="SUB_centro_IZ_FICHA_MENU_im"]/img/@src').extract()[0]
+
+        # Other
+        if 'name' in item and 'pack' in item['name'].lower():
+            item['hide'] = True
+            item['is_pack'] = True
+
+        # TEST
+        item['uuid'] = big_name
+
+        yield item
+
+    def parse_next_releases(self, response):
+        pass
diff --git a/crawler/scrapy.cfg b/crawler/scrapy.cfg
new file mode 100644
index 0000000..4a399c6
--- /dev/null
+++ b/crawler/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = crawler.settings
+
+[deploy]
+# url = http://localhost:6800/
+project = crawler
diff --git a/crawler/scrapy.log b/crawler/scrapy.log
new file mode 100644
index 0000000..e69de29