Updated listadomanga crawler get_manga method

2014-04-10 23:07:31 +02:00 · 2014-04-10 23:07:31 +02:00 · 5a08f09455
parent eaa631b59a
commit 5a08f09455
1 changed files with 99 additions and 43 deletions
--- a/utils/crawler_listadomanga/crawler.py
+++ b/utils/crawler_listadomanga/crawler.py
@ -37,8 +37,6 @@ class ListadoManga(Crawler):
        return ids
    def action_get_manga(self, data, **kwargs):
        obj = {}
@ -46,54 +44,112 @@ class ListadoManga(Crawler):
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        ''' Get Info '''
        try:
-            obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
+            # Details TD
-            obj['dash'] = root.xpath('//td[@class="izq"]/a/text()')[0]
+            keys = [
-            obj['cartoonist'] = root.xpath('//td[@class="izq"]/a/text()')[1]
+                ('original', 'original_title'),
-            obj['src_editorial'] = root.xpath('//td[@class="izq"]/a/text()')[2]
+                ('Gui', 'story'),
-            obj['src_ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[3]
+                ('Dibujo', 'art'),
-            obj['editorial'] = root.xpath('//td[@class="izq"]/a/text()')[4]
+                ('Editorial japonesa', 'japanese_publisher'),
-            obj['ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[5]
+                ('Editorial espa', 'spanish_publisher'),
-            obj['ed_collection'] = root.xpath('//td[@class="izq"]/a/text()')[6]
+                ('Colecci', 'collection'),
-            obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")
+                ('Formato', 'format'),
-            
+                ('Sentido de lectura', 'read_direction'),
-        
+                ('meros en japo', 'japanese_numbers'),
                ('meros en espa', 'spanish_numbers'),
                ('Nota', 'note'),
            ]
            td = root.xpath('//table//td[@class="izq"][contains(.,"original")]')
            first = True
            td_contains = {}
            last = None
            for t in td[0].itertext():
                text = t.strip()
                if t.strip():
                    if first:
                        first = False
                        obj['name'] = text
                    else:
                        if ':' in text:
                            last = text.split(':')[0]
                            td_contains[last] = u''
                        else:
                            td_contains[last] += text
-            ''' Get Image link and info'''
+            for tdkey, row in td_contains.iteritems():
-            # Edited numbers
+                for key in keys:
-            obj['zz_data_sets_published'] = []
+                    if key[0] in tdkey:
-            obj['zz_data_sets_unpublished'] = []
+                        obj[key[1]] = row.split('(web oficial)')[0]
            package = {}
            data = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
            links = root.xpath('/html/body/center/center[1]/table[3]//@src')
-            for element in links:
+            # Japanese publisher URL
-                package['edited_image_link'] = self.base + "/" +element
+            try:
-                package['title'] = data.pop(0)
+                jap = root.xpath("//td[contains(., 'Editorial jap')]//text()[contains(., 'Editorial jap')]/following::a")[1]
-                package['pages'] = data.pop(0)
+                obj['japanese_publisher_url'] = jap.attrib['href']
-                package['price'] = data.pop(0)
+            except:
-                package['date'] = data.pop(0)
+                obj['japanese_publisher_url'] = ''
                obj['zz_data_sets_published'].append(package.copy())
            package = {}
            check = root.xpath('/html/body/center/center[1]/table[4]//text()')
            if u'N\xfameros en preparaci\xf3n:' in check:
                links = root.xpath('/html/body/center/center[1]/table[5]//@src')
                titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
                for element in links:
                    package['no_edited_image_link'] = self.base + element
                    package['title'] = titles.pop(0)
                    obj['zz_data_sets_unpublished'].append(package.copy())
-            return obj
+            # Spanish publisher URL
            try:
                # Same lookup shape as the Japanese publisher block: pick the
                # anchor at Python index 1 among the <a> elements that follow
                # the "Editorial esp" label text.
                # NOTE(review): presumably index 1 is the "(web oficial)" link
                # rather than the publisher-name link -- confirm against a page.
                esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
                # Store under the Spanish key; writing to
                # 'japanese_publisher_url' here would clobber the Japanese
                # publisher URL and leave the Spanish one unset.
                obj['spanish_publisher_url'] = esp.attrib['href']
            except Exception:
                # Best-effort field: a missing anchor (IndexError) or missing
                # href (KeyError) yields an empty URL instead of failing the
                # whole manga record.
                obj['spanish_publisher_url'] = ''
-        except:
+            # Un/Published volumes
-            return "Error"
+            obj['published_volumes'] = []
            obj['unpublished_volumes'] = []
            vols = root.xpath("//table[contains(., 'editados')]/following-sibling::table//td[not(contains(@class, 'separacion'))]//table//td[@class='cen']")
            for vol in vols:
                published = False
                volume = {}
                image = vol.find('img')
                volume['cover'] = "{}/{}".format(self.base, image.attrib['src'])
-        
+                text = []
                if vol.find('hr') is not None:
                    for dom in vol.iterchildren():
                        if dom.tag == 'hr':
                            break
                        else:
                            if dom.text:
                                text.append(dom.text)
                            elif dom.tail:
                                text.append(dom.tail)
                else:
                    text = list(vol.itertext())
                if len(text) >= 4:
                    volume['date'] = text.pop(-1).strip()
                    price = text.pop(-1)
                    if 'ratuito' in price:
                        volume['price'] = 0
                    else:
                        volume['price'] = float(price.split(' ')[0].replace(',', '.'))
                    volume['pages'] = text.pop(-1).strip()
                    published = True
                volume['name'] = ("".join(text)).strip()
                if published:
                    obj['published_volumes'].append(volume)
                else:
                    obj['unpublished_volumes'].append(volume)
            # Description
            try:
                obj['summary'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")[0].strip()
            except:
                obj['summary'] = ''
        except Exception as error:
            # 'name' is only set once the details cell has been parsed, so it
            # may be absent when the failure happened earlier; fall back to a
            # placeholder so the error report itself cannot raise KeyError.
            print("Error with: {}".format(obj.get('name', u'<unknown>').encode('utf-8')))
            print("---------- {}".format(error))
            obj = 'Error'
        # pprint(obj)
        return obj