from io import BytesIO
from pprint import pprint

from lxml import etree

from datCrawl.crawlers import Crawler


class ListadoManga(Crawler):
    # NOTE: the name of the (?P<...>) capture group was lost in transit;
    # "url" is an assumption restored so the patterns compile.
    urls = [
        ('get_manga', r'(?P<url>^http://www\.listadomanga\.es/coleccion\.php(.*)$)'),
        ('get_links', r'(?P<url>^http://www\.listadomanga\.es/lista\.php)'),
    ]
    downloader = 'DefaultDownloader'
    base = 'http://www.listadomanga.es'
    _info = {
        'site_name': 'ListadoManga',
        'crawler_key': 'listadomanga',
        'language': 'en',
    }
    _constant = {}

    def action_get_links(self, data, **kwargs):
        """Collect the numeric ids of every collection linked from lista.php."""
        # `data` is assumed to be the raw page bytes handed over by the
        # downloader; lxml decodes them as UTF-8.
        document = etree.parse(
            BytesIO(data), etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        ids = []
        for link in root.xpath("//a/@href"):
            if "coleccion" in link:
                ids.append(int(link.split('=', 2)[1]))
        return ids

    def action_get_manga(self, data, **kwargs):
        """Scrape a coleccion.php page into a dict of metadata and volumes."""
        obj = {}
        document = etree.parse(
            BytesIO(data), etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        try:
            # Details table. The first element of each pair is an accent-free
            # substring of the Spanish row label (sidesteps encoding trouble
            # with characters such as ñ and ó); the second is the output key.
            keys = [
                ('original', 'original_title'),
                ('Gui', 'story'),                         # "Guión"
                ('Dibujo', 'art'),
                ('Editorial japonesa', 'japanese_publisher'),
                ('Editorial espa', 'spanish_publisher'),  # "Editorial española"
                ('Colecci', 'collection'),                # "Colección"
                ('Formato', 'format'),
                ('Sentido de lectura', 'read_direction'),
                ('meros en japo', 'japanese_numbers'),    # "Números en japonés"
                ('meros en espa', 'spanish_numbers'),     # "Números en español"
                ('Nota', 'note'),
            ]
            td = root.xpath('//table//td[@class="izq"][contains(.,"original")]')
            # The first non-empty text node is the title; the rest are
            # "Label: value" rows whose values may span several text nodes.
            first = True
            td_contains = {}
            last = None
            for t in td[0].itertext():
                text = t.strip()
                if text:
                    if first:
                        first = False
                        obj['name'] = text
                    elif ':' in text:
                        last = text.split(':')[0]
                        td_contains[last] = ''
                    elif last is not None:
                        td_contains[last] += text
            for tdkey, row in td_contains.items():
                for key in keys:
                    if key[0] in tdkey:
                        obj[key[1]] = row.split('(web oficial)')[0]
            # Japanese publisher URL
            try:
                jap = root.xpath("//td[contains(., 'Editorial jap')]//text()[contains(., 'Editorial jap')]/following::a")[1]
                obj['japanese_publisher_url'] = jap.attrib['href']
            except (IndexError, KeyError):
                obj['japanese_publisher_url'] = ''
            # Spanish publisher URL
            try:
                esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
                obj['spanish_publisher_url'] = esp.attrib['href']
            except (IndexError, KeyError):
                obj['spanish_publisher_url'] = ''
            # Published and unpublished volumes
            obj['published_volumes'] = []
            obj['unpublished_volumes'] = []
            vols = root.xpath("//table[contains(., 'editados')]/following-sibling::table//td[not(contains(@class, 'separacion'))]//table//td[@class='cen']")
            for vol in vols:
                published = False
                volume = {}
                image = vol.find('img')
                volume['cover'] = "{}/{}".format(self.base, image.attrib['src'])
                text = []
                if vol.find('hr') is not None:
                    # Everything before the <hr> belongs to this volume.
                    for dom in vol.iterchildren():
                        if dom.tag == 'hr':
                            break
                        if dom.text:
                            text.append(dom.text)
                        elif dom.tail:
                            text.append(dom.tail)
                else:
                    text = list(vol.itertext())
                if len(text) >= 4:
                    # Published volumes list name, page count, price and date.
                    volume['date'] = text.pop().strip()
                    price = text.pop()
                    if 'ratuito' in price:  # "Gratuito" / "gratuito"
                        volume['price'] = 0
                    else:
                        volume['price'] = float(price.split(' ')[0].replace(',', '.'))
                    volume['pages'] = text.pop().strip()
                    published = True
                volume['name'] = "".join(text).strip()
                if published:
                    obj['published_volumes'].append(volume)
                else:
                    obj['unpublished_volumes'].append(volume)
            # Description
            try:
                obj['summary'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")[0].strip()
            except IndexError:
                obj['summary'] = ''
        except Exception as error:
            print("Error with: {}".format(obj.get('name', '<unknown>')))
            print("---------- {}".format(error))
            obj = 'Error'
        # pprint(obj)
        return obj
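
# ---------------------------------------------------------------------------
# Usage sketch (not part of the crawler). A minimal, hypothetical driver that
# assumes: (a) datCrawl Crawler subclasses can be instantiated directly and
# their action_* methods called with raw page bytes, bypassing the framework's
# own URL dispatch, and (b) collection pages live at coleccion.php?id=<n>, as
# the id extraction in action_get_links suggests. Adjust to the real datCrawl
# API before relying on it.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from urllib.request import urlopen

    crawler = ListadoManga()

    # Gather every collection id linked from the index page.
    ids = crawler.action_get_links(
        urlopen('http://www.listadomanga.es/lista.php').read()
    )
    print('{} collections found'.format(len(ids)))

    # Scrape the first collection as a smoke test.
    if ids:
        url = 'http://www.listadomanga.es/coleccion.php?id={}'.format(ids[0])
        pprint(crawler.action_get_manga(urlopen(url).read()))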