fmartingr
/
shelfzilla
Archived
1
0
Fork 0

Updated listadomanga crawler

This commit is contained in:
Felipe Martin 2014-04-23 00:34:17 +02:00
parent ed9bb424e9
commit 7432f212c6
3 changed files with 12 additions and 6 deletions

0
utils/__init__.py Normal file
View File

View File

@@ -9,6 +9,14 @@ datcrawl = datCrawl()
 datcrawl.register_downloader(DefaultDownloader)
 datcrawl.register_crawler(ListadoManga)
+# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=60')
+# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=561')
+# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=1037')
+# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=1410')
+# datcrawl.run('http://www.listadomanga.es/coleccion.php?id=98')
+# exit
 ids = datcrawl.run("http://www.listadomanga.es/lista.php")
 _list = []
 errors = 0
@@ -25,10 +33,8 @@ f = open('data.json', 'w')
 p = ProgressBar(**custom_options)
-print "Crawling process in progress..."
+print("Crawling process in progress...")
 for _id in ids:
-    #print("ID: %d" % _id)
     value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
     if value is "Error":
         errors += 1
@ -47,4 +53,4 @@ print ""
print "Summary:" print "Summary:"
print "--------" print "--------"
print "Success: %d" % success print "Success: %d" % success
print "Errors: %d" % errors print "Errors: %d" % errors

View File

@@ -94,9 +94,9 @@ class ListadoManga(Crawler):
 # Spanish publisher URL
 try:
     esp = root.xpath("//td[contains(., 'Editorial esp')]//text()[contains(., 'Editorial esp')]/following::a")[1]
-    obj['japanese_publisher_url'] = esp.attrib['href']
+    obj['spanish_publisher_url'] = esp.attrib['href']
 except:
-    obj['japanese_publisher_url'] = ''
+    obj['spanish_publisher_url'] = ''
 # Un/Published volumes
 obj['published_volumes'] = []