Crawlers [norma editorial] Base
This commit is contained in:
parent
9e404aeb9b
commit
41d074b102
|
@ -0,0 +1,32 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class VolumeItem(scrapy.Item):
    """A single catalogued volume (one published book) scraped from the site."""

    # Unique key for the item; DuplicatesPipeline and CouchDB use it to
    # detect duplicates (see COUCHDB_UNIQ_KEY in the settings).
    uuid = scrapy.Field()
    # URL of the page the volume was scraped from.
    url = scrapy.Field()
    publisher_name = scrapy.Field()
    series_name = scrapy.Field()
    # Position within the series, when the page title ends in an integer.
    number = scrapy.Field()
    # Volume's own title, used when no number could be parsed from the title.
    name = scrapy.Field()
    # URL of the cover image.
    cover = scrapy.Field()
    format = scrapy.Field()
    size = scrapy.Field()
    pages = scrapy.Field()
    color = scrapy.Field()

    # ISBN with dashes stripped by CleanFieldsPipeline.
    isbn = scrapy.Field()
    isbn_10 = scrapy.Field()
    isbn_13 = scrapy.Field()

    # Price parsed to a float (e.g. '8,50€' -> 8.5) by CleanFieldsPipeline.
    price = scrapy.Field()

    # hide / is_pack are set together by the spider for 'pack' titles;
    # not_released is set when the page carries a 'proximamente' badge.
    hide = scrapy.Field()
    is_pack = scrapy.Field()
    not_released = scrapy.Field()
|
@ -0,0 +1,47 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from scrapy.exceptions import DropItem
|
||||||
|
|
||||||
|
|
||||||
|
class DuplicatesPipeline(object):
    """Drop any item whose ``uuid`` was already seen during this crawl."""

    def __init__(self):
        # uuids of every item that has passed through so far
        self.seen = set()

    def process_item(self, item, spider):
        """Let the item through once; raise DropItem on a repeated uuid."""
        uuid = item['uuid']
        if uuid not in self.seen:
            self.seen.add(uuid)
            return item
        raise DropItem("Duplicate found: %s" % item)
||||||
|
|
||||||
|
class CheckLanguagePipeline(object):
    """Drop items whose title marks them as Catalan editions."""

    def process_item(self, item, spider):
        """Return the item unless its uuid (the page title) says 'CATALÁN'."""
        if u'CATALÁN' not in item['uuid'].upper():
            return item
        raise DropItem("Not supported language: %s" % item)
|
||||||
|
|
||||||
|
class CleanFieldsPipeline(object):
    """Normalise selected raw string fields before storage."""

    # Every name listed here must have a matching clean_<name> method.
    fields = ('isbn', 'price', )

    def clean_isbn(self, item):
        """Strip dashes; anything shorter than ISBN-13 is cut to ISBN-10."""
        digits = item['isbn'].replace('-', '')
        return digits if len(digits) >= 13 else digits[:10]

    def clean_price(self, item):
        """Drop the trailing currency symbol and parse '8,50'-style floats."""
        raw = item['price'][:-1]
        return float(raw.replace(',', '.'))

    def process_item(self, item, spider):
        """Apply each registered cleaner whose field is present on the item."""
        for name in self.fields:
            if name not in item:
                continue
            cleaner = getattr(self, 'clean_%s' % name)
            item[name] = cleaner(item)
        return item
|
@ -0,0 +1,38 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Scrapy settings for crawler project
|
||||||
|
#
|
||||||
|
# For simplicity, this file contains only the most important settings by
|
||||||
|
# default. All the other settings are documented here:
|
||||||
|
#
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/settings.html
|
||||||
|
#
|
||||||
|
|
||||||
|
# Identifier the bot uses for itself.
BOT_NAME = 'crawler'

# Where Scrapy looks for existing spiders / creates new ones.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

# Item pipelines, executed in ascending order of their value.
ITEM_PIPELINES = {
    # Drop duplicate items (same uuid).
    'crawler.pipelines.DuplicatesPipeline': 100,
    # Drop items in a language other than Spanish.
    'crawler.pipelines.CheckLanguagePipeline': 200,
    # Normalise the isbn/price fields.
    'crawler.pipelines.CleanFieldsPipeline': 300,
    # Finally, persist items to CouchDB (third-party scrapycouchdb pipeline).
    'scrapycouchdb.CouchDBPipeline': 1000,
}


# CouchDB storage settings read by scrapycouchdb.CouchDBPipeline.
COUCHDB_SERVER = 'http://127.0.0.1:5984/'
COUCHDB_DB = 'norma'
# Documents are keyed on the item's 'uuid' field.
COUCHDB_UNIQ_KEY = 'uuid'
COUCHDB_IGNORE_FIELDS = []

# Only errors are written to the log file.
LOG_LEVEL = 'ERROR'
LOG_FILE = 'scrapy.log'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Test Crawler (+http://www.yourdomain.com)'
|
@ -0,0 +1,4 @@
|
||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
|
@ -0,0 +1,111 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import scrapy
|
||||||
|
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||||
|
from scrapy.contrib.linkextractors import LinkExtractor
|
||||||
|
from crawler.items import VolumeItem
|
||||||
|
|
||||||
|
|
||||||
|
def join_strings(string_list, convert_to=None):
    """Concatenate *string_list*; optionally pass the result to *convert_to*.

    E.g. ``join_strings(['1', '2'], int)`` returns ``12``.
    """
    joined = "".join(string_list)
    if convert_to:
        return convert_to(joined)
    return joined
||||||
|
|
||||||
|
|
||||||
|
class NormaeditorialSpider(CrawlSpider):
    """Crawl Norma Editorial's manga catalogue and yield VolumeItem objects."""

    name = "normaeditorial"
    allowed_domains = ["normaeditorial.com", "www.normaeditorial.com"]
    start_urls = (
        'http://www.normaeditorial.com/catalogo.asp?T/5/0/0/Catalogo_Manga',
    )
    rules = (
        # Catalog details
        # Series: /catalogo.asp?S/3203/0/0/fullmetal_alchemist_kanzenban
        # Page: /catalogo.asp?T/5/0/0,D/
        Rule(LinkExtractor(
            allow=(
                '\/catalogo\.asp\?(\w)\/5\/\d\/\d\/[\w\d\_]+',
                '\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+\/',
                '\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        )
        ),
        # Next releases
        Rule(LinkExtractor(
            allow=("\/blogmanga\/blog\/\?page_id\=275",),
            canonicalize=False
        ),
            callback="parse_next_releases"
        ),
        # Volume details
        # /ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08
        Rule(LinkExtractor(
            allow=(
                '\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        ),
            callback='parse_volume'
        ),
    )

    _publisher_name = 'Norma Editorial'

    def parse_start_page(self, response):
        # Placeholder: the CrawlSpider rules above drive the crawl.
        pass

    def parse_volume(self, response):
        """Extract a single VolumeItem from a /ficha.asp product page.

        Raises IndexError if the series name or cover image is missing,
        which aborts the item for that page.
        """
        item = VolumeItem()

        item['url'] = response.url
        item['publisher_name'] = self._publisher_name
        item['series_name'] = response.xpath(
            '//div[@id="basic_info"]/h2[contains(., "Serie")]/a/text()'
        ).extract()[0].strip()

        # (item field, label text to match inside the page's <h3> headers);
        # 'Tama' matches "Tamaño" without depending on the ñ encoding.
        pairs = (
            ('size', 'Tama'),
            ('color', 'Color'),
            ('isbn', 'ISBN'),
            ('price', 'PVP'),
        )

        # A "proximamente" badge marks volumes that are not yet released.
        not_released = response.xpath('//div[@id="basic_info"]/h2/img[@alt="proximamente"]')

        if len(not_released) > 0:
            item['not_released'] = True

        for k, v in pairs:
            try:
                item[k] = response.xpath(
                    '//div[@id="basic_info"]/h3[contains(., "{}")]/span/text()'.format(v)
                ).extract()[0].strip()
            except IndexError:
                # Field simply absent on this page; leave it unset.
                pass

        big_name = response.xpath('//div[@id="basic_info"]/h1/text()')\
            .extract()[0].strip()

        # The <h1> is "<series name> <remainder>": a numeric remainder is the
        # volume number, otherwise it is the volume's own name.  Only catch
        # ValueError from int() — the previous bare except also swallowed
        # KeyboardInterrupt/SystemExit and hid real bugs.
        try:
            item['number'] = int(big_name.replace(item['series_name'], '').strip())
        except ValueError:
            item['name'] = big_name.replace(item['series_name'], '').strip()

        item['cover'] = response.xpath('//div[@id="SUB_centro_IZ_FICHA_MENU_im"]/img/@src').extract()[0]

        # Other
        if 'name' in item and 'pack' in item['name'].lower():
            item['hide'] = True
            item['is_pack'] = True

        # TEST: the full page title doubles as a provisional unique id.
        item['uuid'] = big_name

        yield item

    def parse_next_releases(self, request):
        # TODO: parse the blog's "next releases" page.  NOTE(review): the
        # parameter is actually the response despite its name.
        pass
|
@ -0,0 +1,11 @@
|
||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = crawler.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
# url = http://localhost:6800/
|
||||||
|
project = crawler
|
Reference in New Issue