fmartingr
/
shelfzilla
Archived
1
0
Fork 0
This repository has been archived on 2021-06-29. You can view files and clone it, but cannot push or open issues or pull requests.
shelfzilla/utils/crawler_listadomanga/crawler.py

156 lines
5.2 KiB
Python

from StringIO import StringIO
from datCrawl.crawlers import Crawler
from lxml import etree
from datetime import date
from pprint import pprint
class ListadoManga(Crawler):
urls = [
('get_manga', '(?P<url>^http\:\/\/www\.listadomanga\.es\/coleccion\.php(.*)$)'),
('get_links', '(?P<url>^http\:\/\/www\.listadomanga\.es\/lista\.php)'),
]
downloader = 'DefaultDownloader'
base = 'http://www.listadomanga.es'
_info = {
'site_name': 'ListadoManga',
'crawler_key': 'listadomanga',
'language': 'en'
}
_constant = {}
def action_get_links(self, data, **kwargs):
ids = []
document = etree.parse(
StringIO(data),
etree.HTMLParser(encoding='utf-8')
)
root = document.getroot()
for link in root.xpath("//a/@href"):
if "coleccion" in link:
ids.append(int(link.split('=',2)[1]))
return ids
def action_get_manga(self, data, **kwargs):
obj = {}
document = etree.parse(
StringIO(data),
etree.HTMLParser(encoding='utf-8')
)
root = document.getroot()
try:
# Details TD
keys = [
('original', 'original_title'),
('Gui', 'story'),
('Dibujo', 'art'),
('Editorial japonesa', 'japanese_publisher'),
('Editorial espa', 'spanish_publisher'),
('Colecci', 'collection'),
('Formato', 'format'),
('Sentido de lectura', 'read_direction'),
('meros en japo', 'japanese_numbers'),
('meros en espa', 'spanish_numbers'),
('Nota', 'note'),
]
td = root.xpath('//table//td[@class="izq"][contains(.,"original")]')
first = True
td_contains = {}
last = None
for t in td[0].itertext():
text = t.strip()
if t.strip():
if first:
first = False
obj['name'] = text
else:
if ':' in text:
last = text.split(':')[0]
td_contains[last] = u''
else:
td_contains[last] += text
for tdkey, row in td_contains.iteritems():
for key in keys:
if key[0] in tdkey:
obj[key[1]] = row.split('(web oficial)')[0]
# Japanese publisher URL
try:
jap = root.xpath("//td[contains(., 'Editorial jap')]//text()[contains(., 'Editorial jap')]/following::a")[1]
obj['japanese_publisher_url'] = jap.attrib['href']
except:
obj['japanese_publisher_url'] = ''
# Spanish publisher URL
try:
esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
obj['spanish_publisher_url'] = esp.attrib['href']
except:
obj['spanish_publisher_url'] = ''
# Un/Published volumes
obj['published_volumes'] = []
obj['unpublished_volumes'] = []
vols = root.xpath("//table[contains(., 'editados')]/following-sibling::table//td[not(contains(@class, 'separacion'))]//table//td[@class='cen']")
for vol in vols:
published = False
volume = {}
image = vol.find('img')
volume['cover'] = "{}/{}".format(self.base, image.attrib['src'])
text = []
if vol.find('hr') is not None:
for dom in vol.iterchildren():
if dom.tag == 'hr':
break
else:
if dom.text:
text.append(dom.text)
elif dom.tail:
text.append(dom.tail)
else:
text = list(vol.itertext())
if len(text) >= 4:
volume['date'] = text.pop(-1).strip()
price = text.pop(-1)
if 'ratuito' in price:
volume['price'] = 0
else:
volume['price'] = float(price.split(' ')[0].replace(',', '.'))
volume['pages'] = text.pop(-1).strip()
published = True
volume['name'] = ("".join(text)).strip()
if published:
obj['published_volumes'].append(volume)
else:
obj['unpublished_volumes'].append(volume)
# Description
try:
obj['summary'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")[0].strip()
except:
obj['summary'] = ''
except Exception as error:
print("Error with: {}".format(obj['name'].encode('utf-8')))
print("---------- {}".format(error))
obj = 'Error'
# pprint(obj)
return obj