From c3449d4630dec4cbd3535eb96b62107640774239 Mon Sep 17 00:00:00 2001
From: Juan Manuel Parrilla
Date: Fri, 4 Apr 2014 19:02:25 +0200
Subject: [PATCH] Added Crawler to utils folder

---
Review notes (text between the "---" line above and the first "diff --git"
line is not part of the change):

 * Line structure of the patch restored; it had been flattened, so the
   indentation shown below is reconstructed.
 * crawl.py: compare the crawler result with == instead of "is"; keep the
   progress bar's 'end' above zero (ProgressBar divides by it); write
   data.json through a "with" block after the crawl.
 * crawler.py: restore the named group in the URL patterns ("(?P^" is not a
   valid regular expression); catch Exception instead of a bare except.
 * progressbar.py: pop 'stdout' before calling ProgressBar.__init__, which
   does not accept it; correct the docstring examples and option list.
 * "index" lines dropped for the three files whose content changed, since
   the recorded blob ids no longer match.

 rpm/spec/shelfzilla.spec                    |   3 -
 utils/crawler_listadomanga/crawl.py         |  51 ++++++++
 utils/crawler_listadomanga/crawler.py       | 112 +++++++++++++++
 utils/crawler_listadomanga/progressbar.py   | 129 ++++++++++++++++++
 utils/crawler_listadomanga/requirements.txt |   3 +
 5 files changed, 295 insertions(+), 3 deletions(-)
 create mode 100644 utils/crawler_listadomanga/crawl.py
 create mode 100644 utils/crawler_listadomanga/crawler.py
 create mode 100644 utils/crawler_listadomanga/progressbar.py
 create mode 100644 utils/crawler_listadomanga/requirements.txt

diff --git a/rpm/spec/shelfzilla.spec b/rpm/spec/shelfzilla.spec
index 0953f48..4d4c65b 100644
--- a/rpm/spec/shelfzilla.spec
+++ b/rpm/spec/shelfzilla.spec
@@ -44,9 +44,6 @@
 cp -r %{_gitdir}/*.json $RPM_BUILD_ROOT%{_app_dir}/
 cp -r %{_gitdir}/*.py $RPM_BUILD_ROOT%{_app_dir}/
 cp -r %{_gitdir}/gruntfile.coffee $RPM_BUILD_ROOT%{_app_dir}/
-
-
-
 # -------------------------------------------------------------------------------------------- #
 # post-install section:
 # -------------------------------------------------------------------------------------------- #
diff --git a/utils/crawler_listadomanga/crawl.py b/utils/crawler_listadomanga/crawl.py
new file mode 100644
--- /dev/null
+++ b/utils/crawler_listadomanga/crawl.py
@@ -0,0 +1,51 @@
+import sys
+import json
+from datCrawl import datCrawl
+from datCrawl.downloaders import DefaultDownloader
+from crawler import ListadoManga
+from progressbar import ProgressBar
+
+datcrawl = datCrawl()
+datcrawl.register_downloader(DefaultDownloader)
+datcrawl.register_crawler(ListadoManga)
+
+ids = datcrawl.run("http://www.listadomanga.es/lista.php")
+_list = []
+errors = 0
+success = 0
+custom_options = {
+    # ProgressBar divides by 'end', so never let it be zero.
+    'end': max(len(ids), 1),
+    'width': 50,
+    'fill': '#',
+    'format': '%(progress)s%% [%(fill)s%(blank)s]'
+}
+
+p = ProgressBar(**custom_options)
+print "Crawling process in progress..."
+for _id in ids:
+    value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
+    # The crawler signals failure with the string "Error"; compare by value,
+    # not identity.
+    if value == "Error":
+        errors += 1
+    else:
+        success += 1
+        _list.append(value)
+
+    # Advance first so the bar reaches 100% on the last collection.
+    p += 1
+    sys.stdout.write("\r %s" % p)
+    sys.stdout.flush()
+
+# Open the output only once there is something to write, and let the
+# context manager close it even if serialisation fails.
+with open('data.json', 'w') as f:
+    json.dump(_list, f)
+
+print " <-- Completed!"
+print ""
+print "Summary:"
+print "--------"
+print "Success: %d" % success
+print "Errors: %d" % errors
diff --git a/utils/crawler_listadomanga/crawler.py b/utils/crawler_listadomanga/crawler.py
new file mode 100644
--- /dev/null
+++ b/utils/crawler_listadomanga/crawler.py
@@ -0,0 +1,112 @@
+from StringIO import StringIO
+from datCrawl.crawlers import Crawler
+from lxml import etree
+from datetime import date
+from pprint import pprint
+
+
+class ListadoManga(Crawler):
+    """datCrawl crawler for listadomanga.es.
+
+    The listing page (lista.php) is turned into a list of collection ids
+    and each collection page (coleccion.php) into a dict of its details.
+    """
+
+    # NOTE(review): the patch as received had '(?P^http...' here, which the
+    # re module rejects. '<url>' is the group name shown in the datCrawl
+    # README examples -- confirm against the installed datCrawl version.
+    urls = [
+        ('get_manga', r'(?P<url>^http\:\/\/www\.listadomanga\.es\/coleccion\.php(.*)$)'),
+        ('get_links', r'(?P<url>^http\:\/\/www\.listadomanga\.es\/lista\.php)'),
+    ]
+    downloader = 'DefaultDownloader'
+
+    base = 'http://www.listadomanga.es'
+
+    _info = {
+        'site_name': 'ListadoManga',
+        'crawler_key': 'listadomanga',
+        'language': 'en'
+    }
+
+    _constant = {}
+
+    def action_get_links(self, data, **kwargs):
+        """Return the ids of every collection linked from the listing HTML."""
+        ids = []
+
+        document = etree.parse(
+            StringIO(data),
+            etree.HTMLParser(encoding='utf-8')
+        )
+        root = document.getroot()
+
+        for link in root.xpath("//a/@href"):
+            if "coleccion" in link:
+                ids.append(int(link.split('=', 2)[1]))
+
+        return ids
+
+    def action_get_manga(self, data, **kwargs):
+        """Return a dict describing one collection page, or "Error".
+
+        The string "Error" is returned when anything goes wrong while
+        reading the page, e.g. an expected link or table cell is missing.
+        """
+        obj = {}
+
+        document = etree.parse(
+            StringIO(data),
+            etree.HTMLParser(encoding='utf-8')
+        )
+        root = document.getroot()
+
+        try:
+            # Header: the links inside td.izq are read by position.
+            link_texts = root.xpath('//td[@class="izq"]/a/text()')
+            link_hrefs = root.xpath('//td[@class="izq"]/a/@href')
+            obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
+            obj['dash'] = link_texts[0]
+            obj['cartoonist'] = link_texts[1]
+            obj['src_editorial'] = link_texts[2]
+            obj['src_ed_website'] = link_hrefs[3]
+            obj['editorial'] = link_texts[4]
+            obj['ed_website'] = link_hrefs[5]
+            obj['ed_collection'] = link_texts[6]
+            obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")
+
+            # Published volumes: four text nodes (title, pages, price, date)
+            # are consumed, in document order, for every cover image.
+            obj['zz_data_sets_published'] = []
+            obj['zz_data_sets_unpublished'] = []
+            cells = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
+            links = root.xpath('/html/body/center/center[1]/table[3]//@src')
+
+            for element in links:
+                package = {}
+                package['edited_image_link'] = self.base + "/" + element
+                package['title'] = cells.pop(0)
+                package['pages'] = cells.pop(0)
+                package['price'] = cells.pop(0)
+                package['date'] = cells.pop(0)
+                obj['zz_data_sets_published'].append(package)
+
+            # Volumes still in preparation only carry a cover and a title.
+            check = root.xpath('/html/body/center/center[1]/table[4]//text()')
+            if u'N\xfameros en preparaci\xf3n:' in check:
+                links = root.xpath('/html/body/center/center[1]/table[5]//@src')
+                titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
+                for element in links:
+                    package = {}
+                    # NOTE(review): no "/" separator here, unlike the
+                    # published covers above -- confirm which one is right.
+                    package['no_edited_image_link'] = self.base + element
+                    package['title'] = titles.pop(0)
+                    obj['zz_data_sets_unpublished'].append(package)
+
+            return obj
+
+        except Exception:
+            # Deliberately best-effort, but let KeyboardInterrupt and
+            # SystemExit through so a long crawl can still be stopped.
+            return "Error"
diff --git a/utils/crawler_listadomanga/progressbar.py b/utils/crawler_listadomanga/progressbar.py
new file mode 100644
--- /dev/null
+++ b/utils/crawler_listadomanga/progressbar.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+"""
+progressbar.py
+
+A Python module with a ProgressBar class which can be used to represent a
+task's progress in the form of a progress bar and it can be formatted in a
+basic way.
+
+Here is some basic usage with the default options:
+
+    >>> from progressbar import ProgressBar
+    >>> p = ProgressBar()
+    >>> print p
+    [>............] 0%
+    >>> p += 1
+    >>> print p
+    [=>...........] 10%
+    >>> p += 9
+    >>> print p
+    [============>] 100%
+
+And here another example with different options:
+
+    >>> from progressbar import ProgressBar
+    >>> custom_options = {
+    ...     'end': 100,
+    ...     'width': 20,
+    ...     'fill': '#',
+    ...     'format': '%(progress)s%% [%(fill)s%(blank)s]'
+    ... }
+    >>> p = ProgressBar(**custom_options)
+    >>> print p
+    0% [....................]
+    >>> p += 5
+    >>> print p
+    5% [#...................]
+    >>> p += 95
+    >>> print p
+    100% [####################]
+"""
+import sys
+import time
+
+
+class ProgressBar(object):
+    """ProgressBar class holds the options of the progress bar.
+    The options are:
+        start       State from which start the progress. For example, if start is
+                    5 and the end is 10, the progress of this state is 50%
+        end         State in which the progress has terminated. Must not be 0.
+        width       Number of fill/blank cells the bar is drawn with.
+        fill        String used for each cell of completed progress.
+        blank       String used for each cell of remaining space.
+        format      %-style template; it may use the keys fill, blank and
+                    progress.
+        incremental Stored on the instance; not read anywhere in this module.
+    """
+    def __init__(self, start=0, end=10, width=12, fill='=', blank='.', format='[%(fill)s>%(blank)s] %(progress)s%%', incremental=True):
+        super(ProgressBar, self).__init__()
+
+        self.start = start
+        self.end = end
+        self.width = width
+        self.fill = fill
+        self.blank = blank
+        self.format = format
+        self.incremental = incremental
+        self.step = 100 / float(width)  # percentage covered by one cell
+        self.reset()
+
+    def __add__(self, increment):
+        """Advance the bar in place by `increment` states (capped at 100%)
+        and return self, so both `p + n` and `p += n` work."""
+        increment = self._get_progress(increment)
+        if 100 > self.progress + increment:
+            self.progress += increment
+        else:
+            self.progress = 100
+        return self
+
+    def __str__(self):
+        progressed = int(self.progress / self.step)  # number of filled cells
+        fill = progressed * self.fill
+        blank = (self.width - progressed) * self.blank
+        return self.format % {'fill': fill, 'blank': blank, 'progress': int(self.progress)}
+
+    __repr__ = __str__
+
+    def _get_progress(self, increment):
+        """Convert a number of states into a percentage of `end`."""
+        return float(increment * 100) / self.end
+
+    def reset(self):
+        """Resets the current progress to the start point"""
+        self.progress = self._get_progress(self.start)
+        return self
+
+
+class AnimatedProgressBar(ProgressBar):
+    """Extends ProgressBar to allow you to use it straightforward on a script.
+    Accepts an extra keyword argument named `stdout` (by default use sys.stdout)
+    and may be any file-object to which send the progress status.
+    """
+    def __init__(self, *args, **kwargs):
+        # Take `stdout` out first: ProgressBar.__init__ does not accept it
+        # and would raise TypeError if it were passed through.
+        self.stdout = kwargs.pop('stdout', sys.stdout)
+        super(AnimatedProgressBar, self).__init__(*args, **kwargs)
+
+    def show_progress(self):
+        """Write the bar to `stdout`, redrawing in place on a terminal."""
+        if hasattr(self.stdout, 'isatty') and self.stdout.isatty():
+            self.stdout.write('\r')
+        else:
+            self.stdout.write('\n')
+        self.stdout.write(str(self))
+        self.stdout.flush()
+
+
+if __name__ == '__main__':
+    p = AnimatedProgressBar(end=100, width=80)
+
+    while True:
+        p + 5
+        p.show_progress()
+        time.sleep(0.1)
+        if p.progress == 100:
+            break
+    print  # new line
diff --git a/utils/crawler_listadomanga/requirements.txt b/utils/crawler_listadomanga/requirements.txt
new file mode 100644
index 0000000..720e061
--- /dev/null
+++ b/utils/crawler_listadomanga/requirements.txt
@@ -0,0 +1,3 @@
+datCrawl==0.3.0
+lxml==3.3.3
+wsgiref==0.1.2