Browse Source

Added Crawler to utils folder

crawlers/norma
Juan Manuel Parrilla 8 years ago
parent
commit
c3449d4630
  1. 3
      rpm/spec/shelfzilla.spec
  2. 50
      utils/crawler_listadomanga/crawl.py
  3. 99
      utils/crawler_listadomanga/crawler.py
  4. 121
      utils/crawler_listadomanga/progressbar.py
  5. 3
      utils/crawler_listadomanga/requirements.txt

3
rpm/spec/shelfzilla.spec

@ -44,9 +44,6 @@ cp -r %{_gitdir}/*.json $RPM_BUILD_ROOT%{_app_dir}/
cp -r %{_gitdir}/*.py $RPM_BUILD_ROOT%{_app_dir}/
cp -r %{_gitdir}/gruntfile.coffee $RPM_BUILD_ROOT%{_app_dir}/
# -------------------------------------------------------------------------------------------- #
# post-install section:
# -------------------------------------------------------------------------------------------- #

50
utils/crawler_listadomanga/crawl.py

@ -0,0 +1,50 @@
import sys
import json
from datCrawl import datCrawl
from datCrawl.downloaders import DefaultDownloader
from crawler import ListadoManga
from progressbar import ProgressBar
datcrawl = datCrawl()
datcrawl.register_downloader(DefaultDownloader)
datcrawl.register_crawler(ListadoManga)
ids = datcrawl.run("http://www.listadomanga.es/lista.php")
_list = []
errors = 0
success = 0
custom_options = {
'end': len(ids)-1,
'width': 50,
'fill': '#',
'format': '%(progress)s%% [%(fill)s%(blank)s]'
}
f = open('data.json', 'w')
p = ProgressBar(**custom_options)
print "Crawling process in progress..."
for _id in ids:
#print("ID: %d" % _id)
value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
if value is "Error":
errors += 1
else:
success += 1
_list.append(value)
sys.stdout.write("\r %s" % p)
p += 1
sys.stdout.flush()
json.dump(_list,f)
print " <-- Completed!"
f.close()
print ""
print "Summary:"
print "--------"
print "Success: %d" % success
print "Errors: %d" % errors

99
utils/crawler_listadomanga/crawler.py

@ -0,0 +1,99 @@
from StringIO import StringIO
from datCrawl.crawlers import Crawler
from lxml import etree
from datetime import date
from pprint import pprint
class ListadoManga(Crawler):
    """datCrawl crawler for listadomanga.es (a Spanish manga catalogue).

    Exposes two actions, dispatched by the URL patterns in `urls`:
    - get_links: scrape the index page for collection ids.
    - get_manga: scrape a single collection page into a dict.
    """

    # (action, url-regex) pairs; the datCrawl engine routes each matching
    # URL to the corresponding action_* method below.
    urls = [
        ('get_manga', '(?P<url>^http\:\/\/www\.listadomanga\.es\/coleccion\.php(.*)$)'),
        ('get_links', '(?P<url>^http\:\/\/www\.listadomanga\.es\/lista\.php)'),
    ]
    downloader = 'DefaultDownloader'
    base = 'http://www.listadomanga.es'
    _info = {
        'site_name': 'ListadoManga',
        'crawler_key': 'listadomanga',
        'language': 'en'
    }
    _constant = {}

    def action_get_links(self, data, **kwargs):
        """Parse the index page HTML in `data` and return the list of
        integer collection ids found in coleccion.php?id=N links."""
        ids = []
        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        for link in root.xpath("//a/@href"):
            if "coleccion" in link:
                # Links look like "coleccion.php?id=123"; keep the id.
                ids.append(int(link.split('=', 2)[1]))
        return ids

    def action_get_manga(self, data, **kwargs):
        """Parse one collection page in `data` into a dict of metadata,
        published volumes and announced (unpublished) volumes.

        Returns the string "Error" when the page does not match the
        expected layout (callers in crawl.py count these as failures).
        """
        obj = {}
        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        try:
            # Basic info: positional xpath hits are tied to the site layout.
            obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
            obj['dash'] = root.xpath('//td[@class="izq"]/a/text()')[0]
            obj['cartoonist'] = root.xpath('//td[@class="izq"]/a/text()')[1]
            obj['src_editorial'] = root.xpath('//td[@class="izq"]/a/text()')[2]
            obj['src_ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[3]
            obj['editorial'] = root.xpath('//td[@class="izq"]/a/text()')[4]
            obj['ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[5]
            obj['ed_collection'] = root.xpath('//td[@class="izq"]/a/text()')[6]
            obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")
            # Published volumes: the cover <img> count drives how many
            # (title, pages, price, date) tuples we pop from the text cells.
            obj['zz_data_sets_published'] = []
            obj['zz_data_sets_unpublished'] = []
            package = {}
            data = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
            links = root.xpath('/html/body/center/center[1]/table[3]//@src')
            for element in links:
                package['edited_image_link'] = self.base + "/" + element
                package['title'] = data.pop(0)
                package['pages'] = data.pop(0)
                package['price'] = data.pop(0)
                package['date'] = data.pop(0)
                obj['zz_data_sets_published'].append(package.copy())
            package = {}
            # Announced-but-unpublished volumes only exist when the page
            # shows the "Números en preparación:" header.
            check = root.xpath('/html/body/center/center[1]/table[4]//text()')
            if u'N\xfameros en preparaci\xf3n:' in check:
                links = root.xpath('/html/body/center/center[1]/table[5]//@src')
                titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
                for element in links:
                    # BUG FIX: the published branch joins base and the
                    # relative @src with "/"; this branch omitted it and
                    # produced URLs like "...eslisadomanga/...". Keep both
                    # branches consistent.
                    package['no_edited_image_link'] = self.base + "/" + element
                    package['title'] = titles.pop(0)
                    obj['zz_data_sets_unpublished'].append(package.copy())
            return obj
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed. Any layout mismatch
            # (IndexError from xpath hits, pop from empty list) still maps
            # to the "Error" sentinel the caller expects.
            return "Error"

121
utils/crawler_listadomanga/progressbar.py

@ -0,0 +1,121 @@
#!/usr/bin/env python
"""
progressbar.py
A Python module with a ProgressBar class which can be used to represent a
task's progress in the form of a progress bar and it can be formated in a
basic way.
Here is some basic usage with the default options:
>>> from progressbar import ProgressBar
>>> p = ProgressBar()
>>> print p
[>............] 0%
>>> p + 1
>>> print p
[=>...........] 10%
>>> p + 9
>>> print p
[============>] 100%
And here another example with different options:
>>> from progressbar import ProgressBar
>>> custom_options = {
... 'end': 100,
... 'width': 20,
... 'fill': '#',
... 'format': '%(progress)s%% [%(fill)s%(blank)s]'
... }
>>> p = ProgressBar(**custom_options)
>>> print p
0% [....................]
>>> p + 5
>>> print p
5% [#...................]
>>> p + 95
>>> print p
100% [####################]
"""
import sys
import time
class ProgressBar(object):
    """Renderable text progress bar.

    Options:
        start        initial progress value; e.g. start=5 with end=10
                     means the bar begins at 50%
        end          value at which the task counts as complete
        width        number of character cells in the bar
        fill         character drawn for completed cells
        blank        character drawn for remaining cells
        format       %-style template with fill/blank/progress keys
        incremental  kept for API compatibility
    """

    def __init__(self, start=0, end=10, width=12, fill='=', blank='.',
                 format='[%(fill)s>%(blank)s] %(progress)s%%', incremental=True):
        super(ProgressBar, self).__init__()
        self.start = start
        self.end = end
        self.width = width
        self.fill = fill
        self.blank = blank
        self.format = format
        self.incremental = incremental
        # Percentage span covered by a single cell of the bar (float
        # division so narrow bars don't truncate to whole percents).
        self.step = 100 / float(width)
        self.reset()

    def __add__(self, increment):
        # Note: mutates and returns self, so both `p + 1` and `p += 1`
        # advance the bar in place. Progress is capped at 100.
        delta = self._get_progress(increment)
        total = self.progress + delta
        self.progress = total if total < 100 else 100
        return self

    def __str__(self):
        done = int(self.progress / self.step)
        rendering = {
            'fill': done * self.fill,
            'blank': (self.width - done) * self.blank,
            'progress': int(self.progress),
        }
        return self.format % rendering

    __repr__ = __str__

    def _get_progress(self, increment):
        # Map an increment expressed in [0, end] units onto the 0-100 scale.
        return float(increment * 100) / self.end

    def reset(self):
        """Rewind the current progress to the configured start point."""
        self.progress = self._get_progress(self.start)
        return self
class AnimatedProgressBar(ProgressBar):
    """Extends ProgressBar for straightforward use in scripts.

    Accepts an extra keyword argument named `stdout` (by default
    sys.stdout), which may be any file-like object that receives the
    progress output.
    """

    def __init__(self, *args, **kwargs):
        # BUG FIX: `stdout` must be popped out of kwargs *before*
        # delegating to ProgressBar.__init__, which does not accept it.
        # The original forwarded the full kwargs and only read `stdout`
        # afterwards, so any call passing stdout= raised TypeError.
        self.stdout = kwargs.pop('stdout', sys.stdout)
        super(AnimatedProgressBar, self).__init__(*args, **kwargs)

    def show_progress(self):
        """Write the current bar to `self.stdout`: redraw in place with a
        carriage return on a TTY, otherwise emit one line per update."""
        if hasattr(self.stdout, 'isatty') and self.stdout.isatty():
            self.stdout.write('\r')
        else:
            self.stdout.write('\n')
        self.stdout.write(str(self))
        self.stdout.flush()
if __name__ == '__main__':
    # Demo: animate a bar from 0 to 100 in steps of 5, redrawing every
    # tenth of a second. `bar + 5` caps at 100, so the loop terminates.
    bar = AnimatedProgressBar(end=100, width=80)
    while True:
        bar + 5
        bar.show_progress()
        time.sleep(0.1)
        if bar.progress == 100:
            break
    print  # finish with a newline so the shell prompt starts clean

3
utils/crawler_listadomanga/requirements.txt

@ -0,0 +1,3 @@
datCrawl==0.3.0
lxml==3.3.3
wsgiref==0.1.2