Crawlers [norma editorial] Base
parent 9e404aeb9b
commit 41d074b102
crawler/items.py
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class VolumeItem(scrapy.Item):
    uuid = scrapy.Field()
    url = scrapy.Field()
    publisher_name = scrapy.Field()
    series_name = scrapy.Field()
    number = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    format = scrapy.Field()
    size = scrapy.Field()
    pages = scrapy.Field()
    color = scrapy.Field()

    isbn = scrapy.Field()
    isbn_10 = scrapy.Field()
    isbn_13 = scrapy.Field()

    price = scrapy.Field()

    hide = scrapy.Field()
    is_pack = scrapy.Field()
    not_released = scrapy.Field()
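
A quick illustration (not part of the commit; the values are made up) of what these declarations buy: a scrapy.Item behaves like a dict, but only declared fields can be set, so typos in field names fail fast.

from crawler.items import VolumeItem

item = VolumeItem()
item['series_name'] = u'Fullmetal Alchemist Kanzenban'
item['number'] = 8

print(item.get('price'))    # None: declared but not yet populated
# item['author'] = u'...'   # raises KeyError: 'author' is not a declared field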

crawler/pipelines.py
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):
    """Drop any item whose uuid has already been seen during this crawl."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['uuid'] in self.seen:
            raise DropItem("Duplicate found: %s" % item)
        else:
            self.seen.add(item['uuid'])
            return item


class CheckLanguagePipeline(object):
    """Drop items in languages other than Spanish (currently Catalan
    editions, detected from the uuid)."""

    def process_item(self, item, spider):
        if u'CATALÁN' in item['uuid'].upper():
            raise DropItem("Not supported language: %s" % item)
        else:
            return item


class CleanFieldsPipeline(object):
    """Normalize raw scraped strings: one clean_<field> method per entry
    in `fields`."""

    fields = ('isbn', 'price', )

    def clean_isbn(self, item):
        # Strip the dashes; anything shorter than an ISBN-13 is treated
        # as an ISBN-10 and truncated to its first ten characters.
        isbn = item['isbn'].replace('-', '')

        if len(isbn) < 13:
            isbn = isbn[0:10]

        return isbn

    def clean_price(self, item):
        # Drop the trailing currency symbol and parse the decimal comma.
        price = float(item['price'][:-1].replace(',', '.'))

        return price

    def process_item(self, item, spider):
        for field in self.fields:
            if field in item:
                item[field] = getattr(self, 'clean_%s' % field)(item)

        return item
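
These pipelines can be sanity-checked in isolation. A minimal sketch (not part of the commit; the dict stands in for an item, its values are made up, and the spider argument is unused by these pipelines):

from crawler.pipelines import DuplicatesPipeline, CleanFieldsPipeline
from scrapy.exceptions import DropItem

dedup = DuplicatesPipeline()
cleaner = CleanFieldsPipeline()

item = {'uuid': 'fullmetal_alchemist_kanzenban_08',
        'isbn': '978-84-679-1353-9',
        'price': u'8,00€'}

item = dedup.process_item(item, spider=None)
item = cleaner.process_item(item, spider=None)
print(item['isbn'])   # '9788467913539'
print(item['price'])  # 8.0

try:
    dedup.process_item(item, spider=None)  # same uuid a second time
except DropItem as exc:
    print('dropped: %s' % exc)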

crawler/settings.py
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-

# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

ITEM_PIPELINES = {
    # Drop duplicates, then drop items in languages other than Spanish,
    # then clean the remaining fields (lower numbers run first)
    'crawler.pipelines.DuplicatesPipeline': 100,
    'crawler.pipelines.CheckLanguagePipeline': 200,
    'crawler.pipelines.CleanFieldsPipeline': 300,
    # ...
    'scrapycouchdb.CouchDBPipeline': 1000,
}


COUCHDB_SERVER = 'http://127.0.0.1:5984/'
COUCHDB_DB = 'norma'
COUCHDB_UNIQ_KEY = 'uuid'
COUCHDB_IGNORE_FIELDS = []

LOG_LEVEL = 'ERROR'
LOG_FILE = 'scrapy.log'

# Crawl responsibly by identifying yourself (and your website) on the user agent
# USER_AGENT = 'Test Crawler (+http://www.yourdomain.com)'
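
The COUCHDB_* keys are consumed by the external scrapycouchdb package, whose code is not part of this commit. Assuming the usual Scrapy convention, a pipeline picks such settings up through a from_crawler classmethod; a hypothetical sketch of that pattern (not scrapycouchdb's actual implementation):

class ExampleStoragePipeline(object):
    # Illustrative only: shows how a pipeline receives the settings above.
    def __init__(self, server, db):
        self.server = server
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            server=crawler.settings.get('COUCHDB_SERVER'),
            db=crawler.settings.get('COUCHDB_DB'),
        )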

crawler/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

crawler/spiders/normaeditorial.py
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from crawler.items import VolumeItem


def join_strings(string_list, convert_to=None):
    def convert(value):
        if convert_to:
            return convert_to(value)
        return value
    return convert("".join(string_list))


class NormaeditorialSpider(CrawlSpider):
    name = "normaeditorial"
    allowed_domains = ["normaeditorial.com", "www.normaeditorial.com"]
    start_urls = (
        'http://www.normaeditorial.com/catalogo.asp?T/5/0/0/Catalogo_Manga',
    )
    rules = (
        # Catalog details
        # Series: /catalogo.asp?S/3203/0/0/fullmetal_alchemist_kanzenban
        # Page: /catalogo.asp?T/5/0/0,D/
        Rule(LinkExtractor(
            allow=(
                r'\/catalogo\.asp\?(\w)\/5\/\d\/\d\/[\w\d\_]+',
                r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+\/',
                r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        )),
        # Next releases
        Rule(LinkExtractor(
            allow=(r'\/blogmanga\/blog\/\?page_id\=275',),
            canonicalize=False
        ), callback="parse_next_releases"),
        # Volume details
        # /ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08
        Rule(LinkExtractor(
            allow=(
                r'\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        ), callback='parse_volume'),
    )

    _publisher_name = 'Norma Editorial'

    def parse_start_page(self, response):
        pass

    def parse_volume(self, response):
        item = VolumeItem()

        item['url'] = response.url
        item['publisher_name'] = self._publisher_name
        item['series_name'] = response.xpath(
            '//div[@id="basic_info"]/h2[contains(., "Serie")]/a/text()'
        ).extract()[0].strip()

        # (field, label) pairs; the label is matched against the <h3>
        # headings of the info box ("Tama" also covers "Tamaño").
        pairs = (
            ('size', 'Tama'),
            ('color', 'Color'),
            ('isbn', 'ISBN'),
            ('price', 'PVP'),
        )

        not_released = response.xpath(
            '//div[@id="basic_info"]/h2/img[@alt="proximamente"]')

        if len(not_released) > 0:
            item['not_released'] = True

        for k, v in pairs:
            try:
                item[k] = response.xpath(
                    '//div[@id="basic_info"]/h3[contains(., "{}")]/span/text()'.format(v)
                ).extract()[0].strip()
            except IndexError:
                pass

        big_name = response.xpath('//div[@id="basic_info"]/h1/text()')\
            .extract()[0].strip()

        # The <h1> is "<series name> <number>" for regular volumes; when the
        # remainder is not a number, keep it as the volume name instead.
        try:
            item['number'] = int(big_name.replace(item['series_name'], '').strip())
        except ValueError:
            item['name'] = big_name.replace(item['series_name'], '').strip()

        item['cover'] = response.xpath(
            '//div[@id="SUB_centro_IZ_FICHA_MENU_im"]/img/@src').extract()[0]

        # Other
        if 'name' in item and 'pack' in item['name'].lower():
            item['hide'] = True
            item['is_pack'] = True

        # TEST
        item['uuid'] = big_name

        yield item

    def parse_next_releases(self, response):
        pass
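
Since the allow= patterns are plain regular expressions, they can be checked against the sample URLs from the rule comments without running a crawl. A standalone sketch (not part of the commit; the sample URLs are taken from the comments above):

import re

patterns = (
    r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+\/',
    r'\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)',
)
samples = (
    '/catalogo.asp?T/5/0/0,D/',
    '/ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08',
)
for url in samples:
    # Both samples should match one of the patterns -> True
    print('%s -> %s' % (url, any(re.search(p, url) for p in patterns)))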

scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = crawler.settings

[deploy]
# url = http://localhost:6800/
project = crawler
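
With this configuration in place, the crawl can be run locally from the project root with scrapy crawl normaeditorial; uncommenting the url line under [deploy] allows pushing the project to a Scrapyd instance, as described in the documentation linked above.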