fmartingr/shelfzilla

Crawlers [norma editorial] Base

Felipe Martín 2014-10-07 19:22:21 +02:00
parent 9e404aeb9b
commit 41d074b102
8 changed files with 243 additions and 0 deletions

32
crawler/crawler/items.py Normal file

@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class VolumeItem(scrapy.Item):
    uuid = scrapy.Field()
    url = scrapy.Field()
    publisher_name = scrapy.Field()
    series_name = scrapy.Field()
    number = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    format = scrapy.Field()
    size = scrapy.Field()
    pages = scrapy.Field()
    color = scrapy.Field()
    isbn = scrapy.Field()
    isbn_10 = scrapy.Field()
    isbn_13 = scrapy.Field()
    price = scrapy.Field()
    hide = scrapy.Field()
    is_pack = scrapy.Field()
    not_released = scrapy.Field()
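For orientation, a declared Item behaves like a dict once instantiated; a minimal, illustrative sketch (the values here are assumed, the spider below fills the real ones):

from crawler.items import VolumeItem

volume = VolumeItem()
volume['publisher_name'] = 'Norma Editorial'
volume['series_name'] = 'Fullmetal Alchemist Kanzenban'
volume['number'] = 8
print(dict(volume))  # -> a plain dict with just the assigned fields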

47
crawler/crawler/pipelines.py Normal file

@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['uuid'] in self.seen:
            raise DropItem("Duplicate found: %s" % item)
        else:
            self.seen.add(item['uuid'])
            return item


class CheckLanguagePipeline(object):
    def process_item(self, item, spider):
        if u'CATALÁN' in item['uuid'].upper():
            raise DropItem("Not supported language: %s" % item)
        else:
            return item


class CleanFieldsPipeline(object):
    fields = ('isbn', 'price', )

    def clean_isbn(self, item):
        isbn = item['isbn'].replace('-', '')
        if len(isbn) < 13:
            isbn = isbn[0:10]
        return isbn

    def clean_price(self, item):
        price = float(item['price'][:-1].replace(',', '.'))
        return price

    def process_item(self, item, spider):
        for field in self.fields:
            if field in item:
                item[field] = getattr(self, 'clean_%s' % field)(item)
        return item
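An illustrative check of the two field cleaners above, with assumed input values in the formats the Norma pages use (hyphenated ISBN, price with a trailing currency symbol):

from crawler.pipelines import CleanFieldsPipeline

pipeline = CleanFieldsPipeline()
item = {'isbn': u'978-84-679-1234-5', 'price': u'8,95€'}
print(pipeline.clean_isbn(item))   # hyphens stripped -> '9788467912345'
print(pipeline.clean_price(item))  # currency symbol dropped, comma to dot -> 8.95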

38
crawler/crawler/settings.py Normal file

@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-

# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

ITEM_PIPELINES = {
    # Check for duplicates
    # Drop items whose language is not Spanish
    # Clean fields
    'crawler.pipelines.DuplicatesPipeline': 100,
    'crawler.pipelines.CheckLanguagePipeline': 200,
    'crawler.pipelines.CleanFieldsPipeline': 300,
    # ...
    'scrapycouchdb.CouchDBPipeline': 1000,
}

COUCHDB_SERVER = 'http://127.0.0.1:5984/'
COUCHDB_DB = 'norma'
COUCHDB_UNIQ_KEY = 'uuid'
COUCHDB_IGNORE_FIELDS = []

LOG_LEVEL = 'ERROR'
LOG_FILE = 'scrapy.log'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Test Crawler (+http://www.yourdomain.com)'
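The numbers are priorities: pipelines run in ascending order, so duplicates are dropped first and the CouchDB write happens last. A quick sketch to confirm the order Scrapy resolves, assuming it is run from the project root:

from scrapy.utils.project import get_project_settings

pipelines = get_project_settings().get('ITEM_PIPELINES')
for path, priority in sorted(pipelines.items(), key=lambda kv: kv[1]):
    print("%d %s" % (priority, path))
# 100 crawler.pipelines.DuplicatesPipeline
# 200 crawler.pipelines.CheckLanguagePipeline
# 300 crawler.pipelines.CleanFieldsPipeline
# 1000 scrapycouchdb.CouchDBPipeline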

4
crawler/crawler/spiders/__init__.py Normal file

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

111
crawler/crawler/spiders/normaeditorial.py Normal file

@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

from crawler.items import VolumeItem


def join_strings(string_list, convert_to=None):
    def convert(value):
        if convert_to:
            return convert_to(value)
        return value
    return convert("".join(string_list))


class NormaeditorialSpider(CrawlSpider):
    name = "normaeditorial"
    allowed_domains = ["normaeditorial.com", "www.normaeditorial.com"]
    start_urls = (
        'http://www.normaeditorial.com/catalogo.asp?T/5/0/0/Catalogo_Manga',
    )

    rules = (
        # Catalog details
        # Series: /catalogo.asp?S/3203/0/0/fullmetal_alchemist_kanzenban
        # Page: /catalogo.asp?T/5/0/0,D/
        Rule(LinkExtractor(
            allow=(
                '\/catalogo\.asp\?(\w)\/5\/\d\/\d\/[\w\d\_]+',
                '\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+\/',
                '\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        )),
        # Next releases
        Rule(LinkExtractor(
            allow=("\/blogmanga\/blog\/\?page_id\=275",),
            canonicalize=False
        ),
            callback="parse_next_releases"
        ),
        # Volume details
        # /ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08
        Rule(LinkExtractor(
            allow=(
                '\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        ),
            callback='parse_volume'
        ),
    )

    _publisher_name = 'Norma Editorial'

    def parse_start_page(self, response):
        pass

    def parse_volume(self, response):
        item = VolumeItem()
        item['url'] = response.url
        item['publisher_name'] = self._publisher_name
        item['series_name'] = response.xpath(
            '//div[@id="basic_info"]/h2[contains(., "Serie")]/a/text()'
        ).extract()[0].strip()

        pairs = (
            ('size', 'Tama'),
            ('color', 'Color'),
            ('isbn', 'ISBN'),
            ('price', 'PVP'),
        )

        not_released = response.xpath('//div[@id="basic_info"]/h2/img[@alt="proximamente"]')
        if len(not_released) > 0:
            item['not_released'] = True

        for k, v in pairs:
            try:
                item[k] = response.xpath(
                    '//div[@id="basic_info"]/h3[contains(., "{}")]/span/text()'.format(v)
                ).extract()[0].strip()
            except IndexError:
                pass

        big_name = response.xpath('//div[@id="basic_info"]/h1/text()')\
            .extract()[0].strip()

        # The heading is "<series name> <number>" for numbered volumes; if the
        # remainder is not an integer, treat it as the volume name instead.
        try:
            item['number'] = int(big_name.replace(item['series_name'], '').strip())
        except ValueError:
            item['name'] = big_name.replace(item['series_name'], '').strip()

        item['cover'] = response.xpath('//div[@id="SUB_centro_IZ_FICHA_MENU_im"]/img/@src').extract()[0]

        # Other
        if 'name' in item and 'pack' in item['name'].lower():
            item['hide'] = True
            item['is_pack'] = True

        # TEST
        item['uuid'] = big_name

        yield item

    def parse_next_releases(self, response):
        pass
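The URL patterns in the rules can be sanity-checked against the example paths quoted in the comments; a small, illustrative check of the volume-detail pattern:

import re

pattern = r'\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)'
url = '/ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08'
print(re.search(pattern, url).groups())
# -> ('0', '0', '012770008', '0', 'fullmetal_alchemist_kanzenban_08')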

11
crawler/scrapy.cfg Normal file

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
default = crawler.settings
[deploy]
# url = http://localhost:6800/
project = crawler

0
crawler/scrapy.log Normal file