112 lines
3.4 KiB
Python
112 lines
3.4 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
import scrapy
|
||
|
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||
|
from scrapy.contrib.linkextractors import LinkExtractor
|
||
|
from crawler.items import VolumeItem
|
||
|
|
||
|
|
||
|
def join_strings(string_list, convert_to=None):
    """Concatenate a sequence of strings, optionally converting the result.

    Args:
        string_list: iterable of string fragments to concatenate.
        convert_to: optional callable applied to the joined string
            (e.g. ``int`` to parse a scraped number). When falsy, the
            joined string is returned unchanged.

    Returns:
        The concatenated string, or ``convert_to(joined)`` when a
        converter is supplied.
    """
    joined = "".join(string_list)
    return convert_to(joined) if convert_to else joined
|
||
|
|
||
|
|
||
|
class NormaeditorialSpider(CrawlSpider):
    """Crawl normaeditorial.com's manga catalog and yield volume items.

    Follows catalog/series listing pages under ``catalogo.asp`` and
    extracts one :class:`VolumeItem` per volume detail (``ficha.asp``)
    page. The "next releases" blog page is routed to a separate
    (currently no-op) callback.
    """

    name = "normaeditorial"
    allowed_domains = ["normaeditorial.com", "www.normaeditorial.com"]
    start_urls = (
        'http://www.normaeditorial.com/catalogo.asp?T/5/0/0/Catalogo_Manga',
    )
    # NOTE: patterns are raw strings so the regex escapes (\/, \=, \,)
    # are not misread as (invalid) Python string escapes; the pattern
    # bytes are identical to the previous non-raw literals.
    rules = (
        # Catalog details
        # Series: /catalogo.asp?S/3203/0/0/fullmetal_alchemist_kanzenban
        # Page: /catalogo.asp?T/5/0/0,D/
        Rule(LinkExtractor(
            allow=(
                r'\/catalogo\.asp\?(\w)\/5\/\d\/\d\/[\w\d\_]+',
                r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+\/',
                r'\/catalogo\.asp\?(\w)\/5\/\d\/[\d\,\w]+',
            ),
            allow_domains=allowed_domains,
            # Keep URLs byte-for-byte: the site encodes routing in the
            # query-string path (?T/5/...), which canonicalization mangles.
            canonicalize=False
        )
        ),
        # Next releases
        Rule(LinkExtractor(
            allow=(r'\/blogmanga\/blog\/\?page_id\=275',),
            canonicalize=False
        ),
            callback="parse_next_releases"
        ),
        # Volume details
        # /ficha.asp?0/0/012770008/0/fullmetal_alchemist_kanzenban_08
        Rule(LinkExtractor(
            allow=(
                r'\/ficha\.asp\?(\d+)\/(\d+)\/(\d+)\/(\d+)\/([\w\d\_\.\-]+)',
            ),
            allow_domains=allowed_domains,
            canonicalize=False
        ),
            callback='parse_volume'
        ),
    )

    _publisher_name = 'Norma Editorial'

    def parse_start_page(self, response):
        """Placeholder for start-page handling; intentionally a no-op."""
        pass

    def parse_volume(self, response):
        """Parse a volume detail ("ficha") page into a :class:`VolumeItem`.

        Extracts series name, physical attributes (size, color, ISBN,
        price), release status, volume number or name, and cover URL.
        Yields a single item per page.

        Raises:
            IndexError: if the page lacks the series-name or cover nodes
                (propagated so Scrapy logs the broken page).
        """
        item = VolumeItem()

        item['url'] = response.url
        item['publisher_name'] = self._publisher_name
        item['series_name'] = response.xpath(
            '//div[@id="basic_info"]/h2[contains(., "Serie")]/a/text()'
        ).extract()[0].strip()

        # (item_field, label) pairs; "Tama" matches "Tamaño" without
        # depending on the page's encoding of the ñ.
        pairs = (
            ('size', 'Tama'),
            ('color', 'Color'),
            ('isbn', 'ISBN'),
            ('price', 'PVP'),
        )

        # A "proximamente" badge in the header marks unreleased volumes.
        not_released = response.xpath('//div[@id="basic_info"]/h2/img[@alt="proximamente"]')
        if not_released:
            item['not_released'] = True

        for field, label in pairs:
            try:
                item[field] = response.xpath(
                    '//div[@id="basic_info"]/h3[contains(., "{}")]/span/text()'.format(label)
                ).extract()[0].strip()
            except IndexError:
                # Field simply absent on this page; leave it unset.
                pass

        big_name = response.xpath('//div[@id="basic_info"]/h1/text()')\
            .extract()[0].strip()

        # The <h1> is "<series name> <number>" for numbered volumes;
        # anything non-numeric after stripping the series name is a
        # one-shot/special title instead.
        try:
            item['number'] = int(big_name.replace(item['series_name'], '').strip())
        except ValueError:  # was a bare except: only int() parsing can fail here
            item['name'] = big_name.replace(item['series_name'], '').strip()

        item['cover'] = response.xpath('//div[@id="SUB_centro_IZ_FICHA_MENU_im"]/img/@src').extract()[0]

        # Other: packs are hidden from normal listings.
        if 'name' in item and 'pack' in item['name'].lower():
            item['hide'] = True
            item['is_pack'] = True

        # TEST
        item['uuid'] = big_name

        yield item

    def parse_next_releases(self, response):
        """Placeholder for the next-releases blog page; intentionally a no-op.

        NOTE(review): the parameter was previously named ``request`` —
        Scrapy passes the downloaded ``Response`` positionally, so it is
        renamed to the conventional ``response``.
        """
        pass
|