Updated listadomanga crawler get_manga method
This commit is contained in:
parent
eaa631b59a
commit
5a08f09455
|
@ -37,8 +37,6 @@ class ListadoManga(Crawler):
|
||||||
|
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def action_get_manga(self, data, **kwargs):
|
def action_get_manga(self, data, **kwargs):
|
||||||
obj = {}
|
obj = {}
|
||||||
|
|
||||||
|
@ -46,54 +44,112 @@ class ListadoManga(Crawler):
|
||||||
StringIO(data),
|
StringIO(data),
|
||||||
etree.HTMLParser(encoding='utf-8')
|
etree.HTMLParser(encoding='utf-8')
|
||||||
)
|
)
|
||||||
|
|
||||||
root = document.getroot()
|
root = document.getroot()
|
||||||
|
|
||||||
''' Get Info '''
|
|
||||||
try:
|
try:
|
||||||
obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
|
# Details TD
|
||||||
obj['dash'] = root.xpath('//td[@class="izq"]/a/text()')[0]
|
keys = [
|
||||||
obj['cartoonist'] = root.xpath('//td[@class="izq"]/a/text()')[1]
|
('original', 'original_title'),
|
||||||
obj['src_editorial'] = root.xpath('//td[@class="izq"]/a/text()')[2]
|
('Gui', 'story'),
|
||||||
obj['src_ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[3]
|
('Dibujo', 'art'),
|
||||||
obj['editorial'] = root.xpath('//td[@class="izq"]/a/text()')[4]
|
('Editorial japonesa', 'japanese_publisher'),
|
||||||
obj['ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[5]
|
('Editorial espa', 'spanish_publisher'),
|
||||||
obj['ed_collection'] = root.xpath('//td[@class="izq"]/a/text()')[6]
|
('Colecci', 'collection'),
|
||||||
obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")
|
('Formato', 'format'),
|
||||||
|
('Sentido de lectura', 'read_direction'),
|
||||||
|
('meros en japo', 'japanese_numbers'),
|
||||||
|
('meros en espa', 'spanish_numbers'),
|
||||||
|
('Nota', 'note'),
|
||||||
|
]
|
||||||
|
td = root.xpath('//table//td[@class="izq"][contains(.,"original")]')
|
||||||
|
first = True
|
||||||
|
td_contains = {}
|
||||||
|
last = None
|
||||||
|
for t in td[0].itertext():
|
||||||
|
text = t.strip()
|
||||||
|
if t.strip():
|
||||||
|
if first:
|
||||||
|
first = False
|
||||||
|
obj['name'] = text
|
||||||
|
else:
|
||||||
|
if ':' in text:
|
||||||
|
last = text.split(':')[0]
|
||||||
|
td_contains[last] = u''
|
||||||
|
else:
|
||||||
|
td_contains[last] += text
|
||||||
|
|
||||||
''' Get Image link and info'''
|
for tdkey, row in td_contains.iteritems():
|
||||||
# Edited numbers
|
for key in keys:
|
||||||
obj['zz_data_sets_published'] = []
|
if key[0] in tdkey:
|
||||||
obj['zz_data_sets_unpublished'] = []
|
obj[key[1]] = row.split('(web oficial)')[0]
|
||||||
package = {}
|
|
||||||
data = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
|
|
||||||
links = root.xpath('/html/body/center/center[1]/table[3]//@src')
|
|
||||||
|
|
||||||
for element in links:
|
# Japanese publisher URL
|
||||||
package['edited_image_link'] = self.base + "/" +element
|
try:
|
||||||
package['title'] = data.pop(0)
|
jap = root.xpath("//td[contains(., 'Editorial jap')]//text()[contains(., 'Editorial jap')]/following::a")[1]
|
||||||
package['pages'] = data.pop(0)
|
obj['japanese_publisher_url'] = jap.attrib['href']
|
||||||
package['price'] = data.pop(0)
|
except:
|
||||||
package['date'] = data.pop(0)
|
obj['japanese_publisher_url'] = ''
|
||||||
obj['zz_data_sets_published'].append(package.copy())
|
|
||||||
|
|
||||||
package = {}
|
|
||||||
check = root.xpath('/html/body/center/center[1]/table[4]//text()')
|
|
||||||
if u'N\xfameros en preparaci\xf3n:' in check:
|
|
||||||
links = root.xpath('/html/body/center/center[1]/table[5]//@src')
|
|
||||||
titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
|
|
||||||
for element in links:
|
|
||||||
package['no_edited_image_link'] = self.base + element
|
|
||||||
package['title'] = titles.pop(0)
|
|
||||||
obj['zz_data_sets_unpublished'].append(package.copy())
|
|
||||||
|
|
||||||
return obj
|
# Spanish publisher URL
|
||||||
|
try:
|
||||||
|
esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
|
||||||
|
obj['japanese_publisher_url'] = esp.attrib['href']
|
||||||
|
except:
|
||||||
|
obj['japanese_publisher_url'] = ''
|
||||||
|
|
||||||
except:
|
# Un/Published volumes
|
||||||
return "Error"
|
obj['published_volumes'] = []
|
||||||
|
obj['unpublished_volumes'] = []
|
||||||
|
vols = root.xpath("//table[contains(., 'editados')]/following-sibling::table//td[not(contains(@class, 'separacion'))]//table//td[@class='cen']")
|
||||||
|
for vol in vols:
|
||||||
|
published = False
|
||||||
|
volume = {}
|
||||||
|
image = vol.find('img')
|
||||||
|
volume['cover'] = "{}/{}".format(self.base, image.attrib['src'])
|
||||||
|
|
||||||
|
text = []
|
||||||
|
|
||||||
|
|
||||||
|
if vol.find('hr') is not None:
|
||||||
|
for dom in vol.iterchildren():
|
||||||
|
if dom.tag == 'hr':
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if dom.text:
|
||||||
|
text.append(dom.text)
|
||||||
|
elif dom.tail:
|
||||||
|
text.append(dom.tail)
|
||||||
|
else:
|
||||||
|
text = list(vol.itertext())
|
||||||
|
|
||||||
|
if len(text) >= 4:
|
||||||
|
volume['date'] = text.pop(-1).strip()
|
||||||
|
|
||||||
|
price = text.pop(-1)
|
||||||
|
if 'ratuito' in price:
|
||||||
|
volume['price'] = 0
|
||||||
|
else:
|
||||||
|
volume['price'] = float(price.split(' ')[0].replace(',', '.'))
|
||||||
|
volume['pages'] = text.pop(-1).strip()
|
||||||
|
published = True
|
||||||
|
|
||||||
|
volume['name'] = ("".join(text)).strip()
|
||||||
|
|
||||||
|
if published:
|
||||||
|
obj['published_volumes'].append(volume)
|
||||||
|
else:
|
||||||
|
obj['unpublished_volumes'].append(volume)
|
||||||
|
|
||||||
|
# Description
|
||||||
|
try:
|
||||||
|
obj['summary'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")[0].strip()
|
||||||
|
except:
|
||||||
|
obj['summary'] = ''
|
||||||
|
except Exception as error:
|
||||||
|
print("Error with: {}".format(obj['name'].encode('utf-8')))
|
||||||
|
print("---------- {}".format(error))
|
||||||
|
obj = 'Error'
|
||||||
|
|
||||||
|
# pprint(obj)
|
||||||
|
|
||||||
|
return obj
|
||||||
|
|
Reference in New Issue