To do: format, color, price, and not released; almost done
This commit is contained in:
parent
c587eba726
commit
ef2d6f4e2d
|
@ -1,5 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import scrapy
|
||||
import re
|
||||
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
from shelfzilla.items import VolumeItem
|
||||
|
@ -37,6 +38,7 @@ class PaninicomicsSpider(CrawlSpider):
|
|||
),
|
||||
)
|
||||
|
||||
_base_url = 'http://www.paninicomics.es'
|
||||
_publisher_name = 'Panini Comics'
|
||||
|
||||
def parse_start_url(self, response):
|
||||
|
@ -49,8 +51,31 @@ class PaninicomicsSpider(CrawlSpider):
|
|||
|
||||
item['url'] = response.url
|
||||
item['publisher_name'] = self._publisher_name
|
||||
item['isbn_13'] = response.xpath('//*[@class="features"]/text()').extract()
|
||||
#item['pages'] = response.xpath('//*[@id="shop"]/div[2]/div[3]/p[2]/text()').extract()
|
||||
|
||||
## Series name and volume name
|
||||
name_raw = response.xpath('//*[@class="title"]/h3/text()').extract()
|
||||
name = str(name_raw)[3:-2]
|
||||
cleaned_name = str(name).split(' ')[0:-1]
|
||||
item['series_name'] = ' '.join(cleaned_name)
|
||||
list_raw_name = response.xpath('//*[@class="title"]/h4//text()').extract()
|
||||
item['name'] = str(list_raw_name[1]).strip("\n \' [ ]")
|
||||
|
||||
## Tome number
|
||||
item['number'] = str(name).split(' ')[-1]
|
||||
|
||||
## Cover
|
||||
image_link = str(response.xpath('//*[@class="cover"]/img/@src').extract()).strip("\' [ ] ")[2:-1]
|
||||
item['cover'] = self._base_url + image_link
|
||||
|
||||
## ISBN and Pages
|
||||
numbers = response.xpath('//*[@class="features"]/text()').extract()
|
||||
pages = re.findall(r'\d{3}', str(numbers))
|
||||
if len(pages) > 3:
|
||||
item['pages'] = pages[-1]
|
||||
|
||||
else:
|
||||
item['pages'] = pages[0]
|
||||
item['isbn_13'] = str(re.findall(r'\d{13}', str(numbers))).strip("\' [] ")
|
||||
|
||||
print item
|
||||
#response.xpath('//*[@id="shop"]/div[2]/div[3]/h3/text()').extract()
|
||||
|
@ -66,6 +91,3 @@ class PaninicomicsSpider(CrawlSpider):
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
Reference in New Issue