Added Crawler to utils folder
This commit is contained in:
parent
9fc00c7235
commit
c3449d4630
|
@ -44,9 +44,6 @@ cp -r %{_gitdir}/*.json $RPM_BUILD_ROOT%{_app_dir}/
|
|||
cp -r %{_gitdir}/*.py $RPM_BUILD_ROOT%{_app_dir}/
|
||||
cp -r %{_gitdir}/gruntfile.coffee $RPM_BUILD_ROOT%{_app_dir}/
|
||||
|
||||
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------------------------- #
|
||||
# post-install section:
|
||||
# -------------------------------------------------------------------------------------------- #
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
import sys
|
||||
import json
|
||||
from datCrawl import datCrawl
|
||||
from datCrawl.downloaders import DefaultDownloader
|
||||
from crawler import ListadoManga
|
||||
from progressbar import ProgressBar
|
||||
|
||||
# Wire up the crawling engine with the downloader and the site crawler.
datcrawl = datCrawl()
datcrawl.register_downloader(DefaultDownloader)
datcrawl.register_crawler(ListadoManga)

# The listing page is crawled first; its result is iterated below as the
# ids of the collection pages to fetch.
ids = datcrawl.run("http://www.listadomanga.es/lista.php")
_list = []
errors = 0
success = 0
custom_options = {
    # One step per id.  Never 0: the bar divides by ``end``, so a listing
    # with no ids must not produce a zero denominator.
    'end': max(len(ids), 1),
    'width': 50,
    'fill': '#',
    'format': '%(progress)s%% [%(fill)s%(blank)s]',
}

p = ProgressBar(**custom_options)
print("Crawling process in progress...")

# ``with`` guarantees data.json is closed even if a crawl step raises.
with open('data.json', 'w') as f:
    for _id in ids:
        value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
        # Failure is signalled by the string "Error"; compare by value,
        # because identity (``is``) of equal strings is not guaranteed.
        if value == "Error":
            errors += 1
        else:
            success += 1
            _list.append(value)

        # Advance first, then draw, so the bar reads 100% after the last id.
        p += 1
        sys.stdout.write("\r %s" % p)
        sys.stdout.flush()

    json.dump(_list, f)

print(" <-- Completed!")
print("")
print("Summary:")
print("--------")
print("Success: %d" % success)
print("Errors: %d" % errors)
|
|
@ -0,0 +1,99 @@
|
|||
from StringIO import StringIO
|
||||
from datCrawl.crawlers import Crawler
|
||||
from lxml import etree
|
||||
from datetime import date
|
||||
from pprint import pprint
|
||||
|
||||
|
||||
class ListadoManga(Crawler):
    """Crawler for www.listadomanga.es.

    ``action_get_links`` extracts collection ids from the listing page and
    ``action_get_manga`` turns one collection page into a dict.
    """

    # Pairs of action name and URL regex; the names match the ``action_*``
    # methods below.  Raw strings keep the backslashes intact for the regex
    # engine without relying on unrecognised string escapes.
    urls = [
        ('get_manga', r'(?P<url>^http\:\/\/www\.listadomanga\.es\/coleccion\.php(.*)$)'),
        ('get_links', r'(?P<url>^http\:\/\/www\.listadomanga\.es\/lista\.php)'),
    ]
    downloader = 'DefaultDownloader'

    base = 'http://www.listadomanga.es'

    _info = {
        'site_name': 'ListadoManga',
        'crawler_key': 'listadomanga',
        'language': 'en'
    }

    _constant = {}

    def _absolute_url(self, path):
        """Return ``path`` appended to ``self.base`` with exactly one slash."""
        return self.base + "/" + path.lstrip("/")

    def action_get_links(self, data, **kwargs):
        """Return the collection ids linked from the listing page.

        ``data`` is the page HTML.  Every ``href`` that contains
        "coleccion" contributes the integer found after its first "=".
        """
        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()

        return [
            int(link.split('=', 2)[1])
            for link in root.xpath("//a/@href")
            if "coleccion" in link
        ]

    def action_get_manga(self, data, **kwargs):
        """Parse one collection page.

        ``data`` is the page HTML.  Returns a dict with the collection's
        header fields, its synopsis text nodes and two lists
        (``zz_data_sets_published`` / ``zz_data_sets_unpublished``) of
        per-volume dicts.  Returns the string "Error" when the page does
        not have the expected layout.
        """
        obj = {}

        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()

        try:
            # Header: the fields are picked by position among the anchors
            # of the "izq" cell, so evaluate each XPath only once.
            anchor_texts = root.xpath('//td[@class="izq"]/a/text()')
            anchor_hrefs = root.xpath('//td[@class="izq"]/a/@href')

            obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
            obj['dash'] = anchor_texts[0]
            obj['cartoonist'] = anchor_texts[1]
            obj['src_editorial'] = anchor_texts[2]
            obj['src_ed_website'] = anchor_hrefs[3]
            obj['editorial'] = anchor_texts[4]
            obj['ed_website'] = anchor_hrefs[5]
            obj['ed_collection'] = anchor_texts[6]
            obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")

            obj['zz_data_sets_published'] = []
            obj['zz_data_sets_unpublished'] = []

            # Published volumes (table 3): for every image, the next four
            # text nodes are consumed in order as title, pages, price, date.
            cells = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
            for src in root.xpath('/html/body/center/center[1]/table[3]//@src'):
                obj['zz_data_sets_published'].append({
                    'edited_image_link': self._absolute_url(src),
                    'title': cells.pop(0),
                    'pages': cells.pop(0),
                    'price': cells.pop(0),
                    'date': cells.pop(0),
                })

            # Volumes in preparation (table 5) are only read when table 4
            # carries the corresponding heading.
            check = root.xpath('/html/body/center/center[1]/table[4]//text()')
            if u'N\xfameros en preparaci\xf3n:' in check:
                titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
                for src in root.xpath('/html/body/center/center[1]/table[5]//@src'):
                    obj['zz_data_sets_unpublished'].append({
                        'no_edited_image_link': self._absolute_url(src),
                        'title': titles.pop(0),
                    })

            return obj

        except Exception:
            # Best effort: a page with an unexpected layout is reported as
            # "Error".  ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            return "Error"
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
progressbar.py
|
||||
|
||||
A Python module with a ProgressBar class which can be used to represent a
|
||||
task's progress in the form of a progress bar and it can be formatted in a
|
||||
basic way.
|
||||
|
||||
Here is some basic usage with the default options:
|
||||
|
||||
>>> from progressbar import ProgressBar
|
||||
>>> p = ProgressBar()
|
||||
>>> print p
|
||||
[>............] 0%
|
||||
>>> p + 1
|
||||
>>> print p
|
||||
[=>...........] 10%
|
||||
>>> p + 9
|
||||
>>> print p
|
||||
[============>] 100%
|
||||
|
||||
And here another example with different options:
|
||||
|
||||
>>> from progressbar import ProgressBar
|
||||
>>> custom_options = {
|
||||
... 'end': 100,
|
||||
... 'width': 20,
|
||||
... 'fill': '#',
|
||||
... 'format': '%(progress)s%% [%(fill)s%(blank)s]'
|
||||
... }
|
||||
>>> p = ProgressBar(**custom_options)
|
||||
>>> print p
|
||||
0% [....................]
|
||||
>>> p + 5
|
||||
>>> print p
|
||||
5% [#...................]
|
||||
>>> p + 95
|
||||
>>> print p
|
||||
100% [####################]
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
class ProgressBar(object):
    """Text progress bar whose state is a percentage from 0 to 100.

    The options are:
        start        State from which the progress starts. For example, if
                     start is 5 and end is 10, the initial progress is 50%.
        end          State at which the progress is complete. An ``end`` of
                     0 is treated as an already complete task.
        width        Number of cells in the bar.
        fill         String used for each completed cell.
        blank        String used for each remaining cell.
        format       %-style template; it may reference ``fill``, ``blank``
                     and ``progress`` (the integer percentage).
        incremental  Stored on the instance; not used by this class.
    """

    def __init__(self, start=0, end=10, width=12, fill='=', blank='.',
                 format='[%(fill)s>%(blank)s] %(progress)s%%',
                 incremental=True):
        super(ProgressBar, self).__init__()

        self.start = start
        self.end = end
        self.width = width
        self.fill = fill
        self.blank = blank
        self.format = format
        self.incremental = incremental
        # Percentage represented by one cell.  No longer used for rendering
        # but kept as a public attribute for code that may read it.
        self.step = 100 / float(width)
        self.reset()

    def __add__(self, increment):
        """Advance the bar by ``increment`` states, capping at 100%.

        The bar is modified in place and returned, so both ``p + 1`` and
        ``p += 1`` advance the same object.
        """
        self.progress = min(100, self.progress + self._get_progress(increment))
        return self

    def __str__(self):
        """Render the bar using ``self.format``."""
        # Multiply before dividing: int(progress / step) can come out one
        # cell short of a full bar when 100 / width is not exactly
        # representable as a float.
        cells = int(self.progress * self.width / 100.0)
        # Keep the bar exactly ``width`` cells wide whatever the progress.
        cells = max(0, min(self.width, cells))
        fill = cells * self.fill
        blank = (self.width - cells) * self.blank
        return self.format % {'fill': fill, 'blank': blank, 'progress': int(self.progress)}

    __repr__ = __str__

    def _get_progress(self, increment):
        """Convert a number of states into a percentage of ``end``."""
        if not self.end:
            # Nothing to do means finished; avoids dividing by zero.
            return 100.0
        return float(increment * 100) / self.end

    def reset(self):
        """Resets the current progress to the start point"""
        self.progress = self._get_progress(self.start)
        return self
|
||||
|
||||
|
||||
class AnimatedProgressBar(ProgressBar):
    """Extends ProgressBar so it can be used straightforwardly in a script.

    Accepts an extra keyword argument named `stdout` (sys.stdout by default);
    it may be any file-like object to which the progress status is written.
    """

    def __init__(self, *args, **kwargs):
        # Take ``stdout`` out of kwargs before delegating: the parent
        # constructor does not accept it and would raise TypeError.
        self.stdout = kwargs.pop('stdout', sys.stdout)
        super(AnimatedProgressBar, self).__init__(*args, **kwargs)

    def show_progress(self):
        """Write the current bar to ``self.stdout`` and flush it."""
        # On a terminal, return to the start of the line so the bar is
        # redrawn in place; otherwise emit each state on its own line.
        if hasattr(self.stdout, 'isatty') and self.stdout.isatty():
            self.stdout.write('\r')
        else:
            self.stdout.write('\n')
        self.stdout.write(str(self))
        self.stdout.flush()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: fill an 80-cell bar in steps of 5, redrawing every 0.1 s.
    p = AnimatedProgressBar(end=100, width=80)

    while True:
        p + 5  # __add__ advances the bar in place
        p.show_progress()
        time.sleep(0.1)
        # ``>=`` rather than ``==`` so the loop cannot miss the end state.
        if p.progress >= 100:
            break
    # End the line the bar was drawn on.  A bare ``print`` only does this
    # on Python 2; writing the newline works on Python 2 and 3.
    sys.stdout.write('\n')
|
|
@ -0,0 +1,3 @@
|
|||
datCrawl==0.3.0
|
||||
lxml==3.3.3
|
||||
wsgiref==0.1.2
|
Reference in New Issue