Added Crawler to utils folder

2014-04-04 19:02:25 +02:00 · 2014-04-04 19:02:25 +02:00 · c3449d4630
parent 9fc00c7235
commit c3449d4630
5 changed files with 273 additions and 3 deletions
--- a/rpm/spec/shelfzilla.spec
+++ b/rpm/spec/shelfzilla.spec
@ -44,9 +44,6 @@ cp -r %{_gitdir}/*.json $RPM_BUILD_ROOT%{_app_dir}/
 cp -r %{_gitdir}/*.py $RPM_BUILD_ROOT%{_app_dir}/
 cp -r %{_gitdir}/gruntfile.coffee $RPM_BUILD_ROOT%{_app_dir}/
 # -------------------------------------------------------------------------------------------- #
 # post-install section:
 # -------------------------------------------------------------------------------------------- #
--- a/utils/crawler_listadomanga/crawl.py
+++ b/utils/crawler_listadomanga/crawl.py
@ -0,0 +1,50 @@
 import sys
 import json
 from datCrawl import datCrawl
 from datCrawl.downloaders import DefaultDownloader
 from crawler import ListadoManga
 from progressbar import ProgressBar
 # Wire the crawl engine to the default downloader and the ListadoManga crawler.
 datcrawl = datCrawl()
 datcrawl.register_downloader(DefaultDownloader)
 datcrawl.register_crawler(ListadoManga)
 # The listing page is crawled first; its result is iterated below as the
 # integer ids of the collection pages to fetch.
 ids = datcrawl.run("http://www.listadomanga.es/lista.php")
 _list = []
 errors = 0
 success = 0
 custom_options = {
    # The bar is drawn *before* each increment, so with end = len(ids) - 1 the
    # last item is rendered at 100%. Clamp to 1 so that a single id does not
    # give end == 0, which would make ProgressBar divide by zero.
    'end': max(len(ids) - 1, 1),
    'width': 50,
    'fill': '#',
    'format': '%(progress)s%% [%(fill)s%(blank)s]'
 }
 p = ProgressBar(**custom_options)
 # Single-argument print(...) calls behave the same on Python 2 and Python 3.
 print("Crawling process in progress...")
 for _id in ids:
    value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
    # A failed page is reported as the string "Error". Compare by value:
    # identity of equal strings is an interpreter implementation detail.
    if value == "Error":
        errors += 1
    else:
        success += 1
        _list.append(value)
    sys.stdout.write("\r %s" % p)
    p += 1
    sys.stdout.flush()
 # Open the output only once the crawl has finished, so an aborted run does not
 # leave a truncated data.json behind, and let `with` close the file.
 with open('data.json', 'w') as f:
    json.dump(_list, f)
 print("  <-- Completed!")
 print("")
 print("Summary:")
 print("--------")
 print("Success: %d" % success)
 print("Errors: %d" % errors)
--- a/utils/crawler_listadomanga/crawler.py
+++ b/utils/crawler_listadomanga/crawler.py
@ -0,0 +1,99 @@
 from StringIO import StringIO
 from datCrawl.crawlers import Crawler
 from lxml import etree
 from datetime import date
 from pprint import pprint
 class ListadoManga(Crawler):
    """Crawler for http://www.listadomanga.es built on datCrawl's ``Crawler``.

    ``action_get_links`` extracts the collection ids from the listing page and
    ``action_get_manga`` scrapes one collection page into a dict.
    """
    # (action name, URL pattern) pairs. Presumably datCrawl routes a URL to the
    # ``action_<name>`` method of the matching pattern -- confirm against the
    # datCrawl documentation. Raw strings keep the regex escapes literal
    # instead of relying on Python passing unknown escapes through.
    urls = [
        ('get_manga', r'(?P<url>^http\:\/\/www\.listadomanga\.es\/coleccion\.php(.*)$)'),
        ('get_links', r'(?P<url>^http\:\/\/www\.listadomanga\.es\/lista\.php)'),
    ]
    downloader = 'DefaultDownloader'
    base = 'http://www.listadomanga.es'
    _info = {
        'site_name': 'ListadoManga',
        'crawler_key': 'listadomanga',
        'language': 'en'
    }
    _constant = {}
    def _parse_root(self, data):
        """Parse the HTML text *data* as UTF-8 and return the root element."""
        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        return document.getroot()
    def action_get_links(self, data, **kwargs):
        """Return the collection ids linked from the listing page.

        data -- HTML text of the listing page.

        Every ``href`` containing "coleccion" contributes the integer found
        after its first '='. Returns a list of ints.
        """
        root = self._parse_root(data)
        return [
            int(href.split('=', 2)[1])
            for href in root.xpath("//a/@href")
            if "coleccion" in href
        ]
    def action_get_manga(self, data, **kwargs):
        """Scrape one collection page into a dict.

        data -- HTML text of the collection page.

        Returns a dict with the collection metadata plus the lists
        'zz_data_sets_published' and 'zz_data_sets_unpublished', or the
        string "Error" when the page does not have the expected layout.
        """
        root = self._parse_root(data)
        try:
            # The metadata lives in the links of the left-hand cell; evaluate
            # each XPath once and pick the entries by position.
            link_texts = root.xpath('//td[@class="izq"]/a/text()')
            link_hrefs = root.xpath('//td[@class="izq"]/a/@href')
            obj = {}
            obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
            obj['dash'] = link_texts[0]
            obj['cartoonist'] = link_texts[1]
            obj['src_editorial'] = link_texts[2]
            obj['src_ed_website'] = link_hrefs[3]
            obj['editorial'] = link_texts[4]
            obj['ed_website'] = link_hrefs[5]
            obj['ed_collection'] = link_texts[6]
            # Kept as the raw list of text nodes next to the "Sinopsis" header.
            obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")
            # Published volumes: every cover image consumes the next four text
            # nodes of the table (title, pages, price, date), in that order.
            obj['zz_data_sets_published'] = []
            obj['zz_data_sets_unpublished'] = []
            cells = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
            for src in root.xpath('/html/body/center/center[1]/table[3]//@src'):
                volume = {}
                volume['edited_image_link'] = self.base + "/" + src
                volume['title'] = cells.pop(0)
                volume['pages'] = cells.pop(0)
                volume['price'] = cells.pop(0)
                volume['date'] = cells.pop(0)
                obj['zz_data_sets_published'].append(volume)
            # Volumes in preparation are only present when table 4 carries the
            # "Numeros en preparacion:" header (matched with its accents).
            check = root.xpath('/html/body/center/center[1]/table[4]//text()')
            if u'N\xfameros en preparaci\xf3n:' in check:
                titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
                for src in root.xpath('/html/body/center/center[1]/table[5]//@src'):
                    upcoming = {}
                    # NOTE(review): no "/" separator here, unlike the published
                    # links above -- confirm which form the site's src values need.
                    upcoming['no_edited_image_link'] = self.base + src
                    upcoming['title'] = titles.pop(0)
                    obj['zz_data_sets_unpublished'].append(upcoming)
            return obj
        except Exception:
            # Any layout mismatch (missing node, too few cells, ...) marks the
            # page as failed. Unlike a bare ``except:`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            return "Error"
--- a/utils/crawler_listadomanga/progressbar.py
+++ b/utils/crawler_listadomanga/progressbar.py
@ -0,0 +1,121 @@
 #!/usr/bin/env python
 """
 progressbar.py
 A Python module with a ProgressBar class which can be used to represent a
 task's progress in the form of a progress bar, and which can be formatted in
 a basic way.
 Here is some basic usage with the default options:
    >>> from progressbar import ProgressBar
    >>> p = ProgressBar()
    >>> print p
    [>............] 0%
    >>> p + 1
    >>> print p
    [=>...........] 10%
    >>> p + 9
    >>> print p
    [============>] 100%
 And here another example with different options:
    >>> from progressbar import ProgressBar
    >>> custom_options = {
    ...     'end': 100, 
    ...     'width': 20, 
    ...     'fill': '#',
    ...     'format': '%(progress)s%% [%(fill)s%(blank)s]'
    ... }
    >>> p = ProgressBar(**custom_options)
    >>> print p
    0% [....................]
    >>> p + 5
    >>> print p
    5% [#...................]
    >>> p + 95
    >>> print p
    100% [####################]
 """
 import sys
 import time
 class ProgressBar(object):
    """Text progress bar whose state is kept as a percentage (0-100).

    The options are:
        start   State from which the progress starts. For example, if start
                is 5 and end is 10, the initial progress is 50%.
        end     State at which the progress is complete. An end of 0 means
                there is nothing to do and is treated as 100%.
        width   Number of fill/blank cells in the rendered bar.
        fill    String repeated for the completed part of the bar.
        blank   String repeated for the remaining part of the bar.
        format  %-style template rendered with the keys 'fill', 'blank' and
                'progress'.
        incremental
                Stored on the instance; not used by this class.
    """
    def __init__(self, start=0, end=10, width=12, fill='=', blank='.', format='[%(fill)s>%(blank)s] %(progress)s%%', incremental=True):
        super(ProgressBar, self).__init__()
        self.start = start
        self.end = end
        self.width = width
        self.fill = fill
        self.blank = blank
        self.format = format
        self.incremental = incremental
        # Percentage represented by a single cell of the bar.
        self.step = 100 / float(width)
        self.reset()
    def __add__(self, increment):
        """Advance the bar by *increment* states, in place, and return self.

        The bar is mutated (so both ``p + 1`` and ``p += 1`` move it) and the
        resulting progress is clamped to the range 0-100.
        """
        advanced = self.progress + self._get_progress(increment)
        # min() keeps the original upper cap; max() stops a negative increment
        # from pushing the progress below zero and widening the rendered bar.
        self.progress = max(0, min(100, advanced))
        return self
    def __str__(self):
        """Render the bar using the ``format`` template."""
        progressed = int(self.progress / self.step)
        fill = progressed * self.fill
        blank = (self.width - progressed) * self.blank
        return self.format % {'fill': fill, 'blank': blank, 'progress': int(self.progress)}
    __repr__ = __str__
    def _get_progress(self, increment):
        """Convert *increment* states into a percentage of ``end``."""
        if not self.end:
            # Nothing to progress through: report completion instead of
            # raising ZeroDivisionError.
            return 100.0
        return float(increment * 100) / self.end
    def reset(self):
        """Resets the current progress to the start point"""
        self.progress = self._get_progress(self.start)
        return self
 class AnimatedProgressBar(ProgressBar):
    """Extends ProgressBar so it can be used straightforwardly in a script.

    Accepts an extra keyword argument named `stdout` (sys.stdout by default),
    which may be any file-like object the progress status is written to.
    """
    def __init__(self, *args, **kwargs):
        # Remove `stdout` before delegating: ProgressBar.__init__ has no such
        # parameter and would raise TypeError if it were forwarded.
        self.stdout = kwargs.pop('stdout', sys.stdout)
        super(AnimatedProgressBar, self).__init__(*args, **kwargs)
    def show_progress(self):
        """Write the current bar to ``stdout`` and flush it.

        On a terminal the previous bar is overwritten via a carriage return;
        otherwise each update goes on its own line.
        """
        if hasattr(self.stdout, 'isatty') and self.stdout.isatty():
            self.stdout.write('\r')
        else:
            self.stdout.write('\n')
        self.stdout.write(str(self))
        self.stdout.flush()
 if __name__ == '__main__':
    # Demo: fill an 80-cell bar in 5% steps, redrawing every 0.1 seconds.
    p = AnimatedProgressBar(end=100, width=80)
    finished = False
    while not finished:
        p + 5
        p.show_progress()
        time.sleep(0.1)
        # The progress is capped at exactly 100, so equality is a safe stop test.
        finished = p.progress == 100
    print  # new line
--- a/utils/crawler_listadomanga/requirements.txt
+++ b/utils/crawler_listadomanga/requirements.txt
@ -0,0 +1,3 @@
 datCrawl==0.3.0
 lxml==3.3.3
 wsgiref==0.1.2