Updated listadomanga crawler
This commit is contained in:
parent
ed9bb424e9
commit
7432f212c6
|
@ -9,6 +9,14 @@ datcrawl = datCrawl()
|
||||||
datcrawl.register_downloader(DefaultDownloader)
|
datcrawl.register_downloader(DefaultDownloader)
|
||||||
datcrawl.register_crawler(ListadoManga)
|
datcrawl.register_crawler(ListadoManga)
|
||||||
|
|
||||||
|
# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=60')
|
||||||
|
# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=561')
|
||||||
|
# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=1037')
|
||||||
|
# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=1410')
|
||||||
|
# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=98')
|
||||||
|
|
||||||
|
# exit
|
||||||
|
|
||||||
ids = datcrawl.run("http://www.listadomanga.es/lista.php")
|
ids = datcrawl.run("http://www.listadomanga.es/lista.php")
|
||||||
_list = []
|
_list = []
|
||||||
errors = 0
|
errors = 0
|
||||||
|
@ -25,10 +33,8 @@ f = open('data.json', 'w')
|
||||||
|
|
||||||
|
|
||||||
p = ProgressBar(**custom_options)
|
p = ProgressBar(**custom_options)
|
||||||
print "Crawling process in progress..."
|
print("Crawling process in progress...")
|
||||||
for _id in ids:
|
for _id in ids:
|
||||||
#print("ID: %d" % _id)
|
|
||||||
|
|
||||||
value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
|
value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
|
||||||
if value is "Error":
|
if value is "Error":
|
||||||
errors += 1
|
errors += 1
|
||||||
|
|
|
@ -94,9 +94,9 @@ class ListadoManga(Crawler):
|
||||||
# Spanish publisher URL
|
# Spanish publisher URL
|
||||||
try:
|
try:
|
||||||
esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
|
esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
|
||||||
obj['japanese_publisher_url'] = esp.attrib['href']
|
obj['spanish_publisher_url'] = esp.attrib['href']
|
||||||
except:
|
except:
|
||||||
obj['japanese_publisher_url'] = ''
|
obj['spanish_publisher_url'] = ''
|
||||||
|
|
||||||
# Un/Published volumes
|
# Un/Published volumes
|
||||||
obj['published_volumes'] = []
|
obj['published_volumes'] = []
|
||||||
|
|
Reference in New Issue