Browse Source

Added Crawler to utils folder

crawlers/norma
Juan Manuel Parrilla 8 years ago
parent
commit
c3449d4630
  1. 3
      rpm/spec/shelfzilla.spec
  2. 50
      utils/crawler_listadomanga/crawl.py
  3. 99
      utils/crawler_listadomanga/crawler.py
  4. 121
      utils/crawler_listadomanga/progressbar.py
  5. 3
      utils/crawler_listadomanga/requirements.txt

3
rpm/spec/shelfzilla.spec

@ -44,9 +44,6 @@ cp -r %{_gitdir}/*.json $RPM_BUILD_ROOT%{_app_dir}/
cp -r %{_gitdir}/*.py $RPM_BUILD_ROOT%{_app_dir}/
cp -r %{_gitdir}/gruntfile.coffee $RPM_BUILD_ROOT%{_app_dir}/
# -------------------------------------------------------------------------------------------- #
# post-install section:
# -------------------------------------------------------------------------------------------- #

50
utils/crawler_listadomanga/crawl.py

@ -0,0 +1,50 @@
import sys
import json
from datCrawl import datCrawl
from datCrawl.downloaders import DefaultDownloader
from crawler import ListadoManga
from progressbar import ProgressBar
datcrawl = datCrawl()
datcrawl.register_downloader(DefaultDownloader)
datcrawl.register_crawler(ListadoManga)
ids = datcrawl.run("http://www.listadomanga.es/lista.php")
_list = []
errors = 0
success = 0
custom_options = {
'end': len(ids)-1,
'width': 50,
'fill': '#',
'format': '%(progress)s%% [%(fill)s%(blank)s]'
}
f = open('data.json', 'w')
p = ProgressBar(**custom_options)
print "Crawling process in progress..."
for _id in ids:
#print("ID: %d" % _id)
value = datcrawl.run("http://www.listadomanga.es/coleccion.php?id=%d" % _id)
if value is "Error":
errors += 1
else:
success += 1
_list.append(value)
sys.stdout.write("\r %s" % p)
p += 1
sys.stdout.flush()
json.dump(_list,f)
print " <-- Completed!"
f.close()
print ""
print "Summary:"
print "--------"
print "Success: %d" % success
print "Errors: %d" % errors

99
utils/crawler_listadomanga/crawler.py

@ -0,0 +1,99 @@
from StringIO import StringIO
from datCrawl.crawlers import Crawler
from lxml import etree
from datetime import date
from pprint import pprint
class ListadoManga(Crawler):
    """datCrawl crawler for listadomanga.es (a Spanish manga catalogue).

    Exposes two actions, dispatched by the URL patterns in `urls`:
    - get_links: scrape the index page for collection ids.
    - get_manga: scrape a single collection page into a dict.
    """

    # (action, url-regex) pairs; the datCrawl engine routes each matching
    # URL to the corresponding action_* method below.
    urls = [
        ('get_manga', '(?P<url>^http\:\/\/www\.listadomanga\.es\/coleccion\.php(.*)$)'),
        ('get_links', '(?P<url>^http\:\/\/www\.listadomanga\.es\/lista\.php)'),
    ]
    downloader = 'DefaultDownloader'
    base = 'http://www.listadomanga.es'
    _info = {
        'site_name': 'ListadoManga',
        'crawler_key': 'listadomanga',
        'language': 'en'
    }
    _constant = {}

    def action_get_links(self, data, **kwargs):
        """Parse the index page HTML in `data` and return the list of
        integer collection ids found in coleccion.php?id=N links."""
        ids = []
        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        for link in root.xpath("//a/@href"):
            if "coleccion" in link:
                # Links look like "coleccion.php?id=123"; keep the id.
                ids.append(int(link.split('=', 2)[1]))
        return ids

    def action_get_manga(self, data, **kwargs):
        """Parse one collection page in `data` into a dict of metadata,
        published volumes and announced (unpublished) volumes.

        Returns the string "Error" when the page does not match the
        expected layout (callers in crawl.py count these as failures).
        """
        obj = {}
        document = etree.parse(
            StringIO(data),
            etree.HTMLParser(encoding='utf-8')
        )
        root = document.getroot()
        try:
            # Basic info: positional xpath hits are tied to the site layout.
            obj['title'] = root.xpath('//td[@class="izq"]/h2/text()')[0].strip()
            obj['dash'] = root.xpath('//td[@class="izq"]/a/text()')[0]
            obj['cartoonist'] = root.xpath('//td[@class="izq"]/a/text()')[1]
            obj['src_editorial'] = root.xpath('//td[@class="izq"]/a/text()')[2]
            obj['src_ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[3]
            obj['editorial'] = root.xpath('//td[@class="izq"]/a/text()')[4]
            obj['ed_website'] = root.xpath('//td[@class="izq"]/a/@href')[5]
            obj['ed_collection'] = root.xpath('//td[@class="izq"]/a/text()')[6]
            obj['sinopsis'] = root.xpath("//h2[contains(., 'Sinopsis')]/../text()")
            # Published volumes: the cover <img> count drives how many
            # (title, pages, price, date) tuples we pop from the text cells.
            obj['zz_data_sets_published'] = []
            obj['zz_data_sets_unpublished'] = []
            package = {}
            data = root.xpath('/html/body/center/center[1]/table[3]/tr/td//text()')
            links = root.xpath('/html/body/center/center[1]/table[3]//@src')
            for element in links:
                package['edited_image_link'] = self.base + "/" + element
                package['title'] = data.pop(0)
                package['pages'] = data.pop(0)
                package['price'] = data.pop(0)
                package['date'] = data.pop(0)
                obj['zz_data_sets_published'].append(package.copy())
            package = {}
            # Announced-but-unpublished volumes only exist when the page
            # shows the "Números en preparación:" header.
            check = root.xpath('/html/body/center/center[1]/table[4]//text()')
            if u'N\xfameros en preparaci\xf3n:' in check:
                links = root.xpath('/html/body/center/center[1]/table[5]//@src')
                titles = root.xpath('/html/body/center/center[1]/table[5]//text()')
                for element in links:
                    # BUG FIX: the published branch joins base and the
                    # relative @src with "/"; this branch omitted it and
                    # produced URLs like "...eslisadomanga/...". Keep both
                    # branches consistent.
                    package['no_edited_image_link'] = self.base + "/" + element
                    package['title'] = titles.pop(0)
                    obj['zz_data_sets_unpublished'].append(package.copy())
            return obj
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed. Any layout mismatch
            # (IndexError from xpath hits, pop from empty list) still maps
            # to the "Error" sentinel the caller expects.
            return "Error"

121
utils/crawler_listadomanga/progressbar.py

@ -0,0 +1,121 @@
#!/usr/bin/env python
"""
progressbar.py
A Python module with a ProgressBar class which can be used to represent a
task's progress in the form of a progress bar and it can be formated in a
basic way.
Here is some basic usage with the default options:
>>> from progressbar import ProgressBar
>>> p = ProgressBar()
>>> print p
[>............] 0%
>>> p + 1
>>> print p
[=>...........] 10%
>>> p + 9
>>> print p
[============>] 100%
And here another example with different options:
>>> from progressbar import ProgressBar
>>> custom_options = {
... 'end': 100,
... 'width': 20,
... 'fill': '#',
... 'format': '%(progress)s%% [%(fill)s%(blank)s]'
... }
>>> p = ProgressBar(**custom_options)
>>> print p
0% [....................]
>>> p + 5
>>> print p
5% [#...................]
>>> p + 95
>>> print p
100% [####################]
"""
import sys
import time
class ProgressBar(object):
    """Renderable text progress bar.

    Options:
        start        initial progress value; e.g. start=5 with end=10
                     means the bar begins at 50%
        end          value at which the task counts as complete
        width        number of character cells in the bar
        fill         character drawn for completed cells
        blank        character drawn for remaining cells
        format       %-style template with fill/blank/progress keys
        incremental  kept for API compatibility
    """

    def __init__(self, start=0, end=10, width=12, fill='=', blank='.',
                 format='[%(fill)s>%(blank)s] %(progress)s%%', incremental=True):
        super(ProgressBar, self).__init__()
        self.start = start
        self.end = end
        self.width = width
        self.fill = fill
        self.blank = blank
        self.format = format
        self.incremental = incremental
        # Percentage span covered by a single cell of the bar (float
        # division so narrow bars don't truncate to whole percents).
        self.step = 100 / float(width)
        self.reset()

    def __add__(self, increment):
        # Note: mutates and returns self, so both `p + 1` and `p += 1`
        # advance the bar in place. Progress is capped at 100.
        delta = self._get_progress(increment)
        total = self.progress + delta
        self.progress = total if total < 100 else 100
        return self

    def __str__(self):
        done = int(self.progress / self.step)
        rendering = {
            'fill': done * self.fill,
            'blank': (self.width - done) * self.blank,
            'progress': int(self.progress),
        }
        return self.format % rendering

    __repr__ = __str__

    def _get_progress(self, increment):
        # Map an increment expressed in [0, end] units onto the 0-100 scale.
        return float(increment * 100) / self.end

    def reset(self):
        """Rewind the current progress to the configured start point."""
        self.progress = self._get_progress(self.start)
        return self
class AnimatedProgressBar(ProgressBar):
    """Extends ProgressBar for straightforward use in scripts.

    Accepts an extra keyword argument named `stdout` (by default
    sys.stdout), which may be any file-like object that receives the
    progress output.
    """

    def __init__(self, *args, **kwargs):
        # BUG FIX: `stdout` must be popped out of kwargs *before*
        # delegating to ProgressBar.__init__, which does not accept it.
        # The original forwarded the full kwargs and only read `stdout`
        # afterwards, so any call passing stdout= raised TypeError.
        self.stdout = kwargs.pop('stdout', sys.stdout)
        super(AnimatedProgressBar, self).__init__(*args, **kwargs)

    def show_progress(self):
        """Write the current bar to `self.stdout`: redraw in place with a
        carriage return on a TTY, otherwise emit one line per update."""
        if hasattr(self.stdout, 'isatty') and self.stdout.isatty():
            self.stdout.write('\r')
        else:
            self.stdout.write('\n')
        self.stdout.write(str(self))
        self.stdout.flush()
if __name__ == '__main__':
    # Demo: animate a bar from 0 to 100 in steps of 5, redrawing every
    # tenth of a second. `bar + 5` caps at 100, so the loop terminates.
    bar = AnimatedProgressBar(end=100, width=80)
    while True:
        bar + 5
        bar.show_progress()
        time.sleep(0.1)
        if bar.progress == 100:
            break
    print  # finish with a newline so the shell prompt starts clean

3
utils/crawler_listadomanga/requirements.txt

@ -0,0 +1,3 @@
datCrawl==0.3.0
lxml==3.3.3
wsgiref==0.1.2