# coding: utf-8
# shelfzilla/utils/crawler_listadomanga/test.py

import json
import os
import sys
import requesocks as requests
import uuid
import re
from utils.crawler_listadomanga.progressbar import ProgressBar
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "shelfzilla.settings.local")
from filer.models.imagemodels import Image
from django.core.files import File
import warnings
import datetime
warnings.filterwarnings('ignore')
# Read the crawled series list (a JSON array) into memory.
with open('utils/crawler_listadomanga/data.json', 'r') as f:
    buff = f.read()

new_json = json.loads(buff)

# Settings for the overall progress bar: one step per entry in the data.
pb_total = dict(
    end=len(new_json),
    width=50,
    fill='#',
    format='%(progress)s%% [%(fill)s%(blank)s]',
)

# Newline, tab and carriage-return characters; replaced with a space
# before volume names are printed.
r_unwanted = re.compile("[\n\t\r]")
total_pb = ProgressBar(**pb_total)

# Every request goes through the local SOCKS5 proxy on port 9150
# (presumably the local TOR client -- confirm against the deployment).
session = requests.session()
session.proxies = dict.fromkeys(
    ('http', 'https'),
    'socks5://127.0.0.1:9150',
)

# Spanish month names in calendar order; DATE_VALUES maps each name to
# its month number (1-12) so release dates can be parsed.
_MONTH_NAMES = (
    'Enero',
    'Febrero',
    'Marzo',
    'Abril',
    'Mayo',
    'Junio',
    'Julio',
    'Agosto',
    'Septiembre',
    'Octubre',
    'Noviembre',
    'Diciembre',
)
DATE_VALUES = dict(zip(_MONTH_NAMES, range(1, len(_MONTH_NAMES) + 1)))

def download_file(url):
    """Download ``url`` through the proxied session into a temporary file.

    The response body is written to a uniquely named file under ``/tmp``.
    A result smaller than 200 bytes that contains the word ``blacklisted``
    is treated as a refusal: a new TOR identity is requested and the
    download is retried. Any error raised while downloading also triggers
    a retry with a new identity, so the function keeps looping until a
    download is accepted.

    Args:
        url: Address of the file to fetch.

    Returns:
        Path of the local file that holds the downloaded content.

    Raises:
        Exception: Whatever the ``stem`` controller raises while renewing
            the identity; that step is outside the retry guard.
    """
    local_filename = "/tmp/{}".format(str(uuid.uuid4()))
    renew_identity = False
    while True:
        # Change TOR identity before retrying a refused or failed download.
        if renew_identity:
            print('=> RENEWING TOR IDENTITY...')
            from stem import Signal
            from stem.control import Controller

            with Controller.from_port(port=9051) as controller:
                controller.authenticate("1234")
                controller.signal(Signal.NEWNYM)

            renew_identity = False

        try:
            r = session.get(url)
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

            # Only very small responses can be the "blacklisted" notice.
            if os.path.getsize(local_filename) >= 200:
                break

            # Read as bytes: the payload may be image data, not text.
            with open(local_filename, 'rb') as check_cover:
                blacklisted = b'blacklisted' in check_cover.read()
            if not blacklisted:
                break

            renew_identity = True
            # NOTE(review): relies on the module-level ``vol`` bound by the
            # import loop; drops a cover already attached to that volume.
            old_cover = vol.cover
            if old_cover:
                vol.cover = None
                vol.save()
                old_cover.delete()
        except Exception:
            # Deliberately broad: any failure means "retry with a new
            # identity". KeyboardInterrupt/SystemExit are no longer
            # swallowed, so the loop can be interrupted.
            renew_identity = True

    return local_filename


def clear():
    """Wipe the terminal by running the shell ``clear`` command."""
    command = 'clear'
    os.system(command)


def update_pbs(title):
    """Redraw the screen: overall progress bar, then the current title.

    Args:
        title: Name of the series being processed; it is UTF-8 encoded
            before being printed.
    """
    clear()
    bar_text = '{}'.format(total_pb)
    print(bar_text)
    encoded_title = title.encode('utf-8')
    print('Now working: {}'.format(encoded_title))


# filename = download_file('http://ifconfig.me/ip')
# os.remove(filename)


# Main import loop: for every crawled series create or update the Series,
# its authors, publishers and volumes, downloading missing covers.
i = 0
from shelfzilla.apps.manga.models import Series, Person, Publisher, Volume, Language
for s in new_json:
    is_catala = False
    # Reset per series so a series without a Spanish publisher cannot
    # silently reuse the publisher of the previously processed series.
    pub = None
    update_pbs(s['name'])

    if '(' in s['name']:
        # TODO collections!
        pass

    serie, is_new = Series.objects.get_or_create(
        name=s['name']
    )

    # Replaces: normalise publisher names that appear under several labels.
    if 'spanish_publisher' in s:
        if u'Editores de Tebeos' in s['spanish_publisher']:
            s['spanish_publisher'] = u'Ediciones Glénat / EDT'
        if u'ECC Ediciones' in s['spanish_publisher']:
            s['spanish_publisher'] = u'El Catálogo del Cómic'

    # TODO collections

    # Summary
    if s['summary'] != '':
        serie.summary = s['summary']

    # Completed / cancelled series
    # TODO catala?
    if 'spanish_numbers' in s:
        if 'completa' in s['spanish_numbers']:
            serie.finished = True
            serie.status = 'finished'

        if 'cancelada' in s['spanish_numbers']:
            serie.status = 'cancelled'

    if 'catala_numbers' in s:
        is_catala = True
        if 'completa' in s['catala_numbers']:
            serie.finished = True
            serie.status = 'finished'

        if 'cancelada' in s['catala_numbers']:
            serie.status = 'cancelled'

    # Art: comma-separated list of people.
    if 'art' in s and s['art']:
        for person in s['art'].split(','):
            artist, is_new = Person.objects.get_or_create(
                name=person.strip()
            )
            serie.art.add(artist)

    # Story: comma-separated list of people.
    if 'story' in s and s['story']:
        for person in s['story'].split(','):
            writer, is_new = Person.objects.get_or_create(
                name=person.strip()
            )
            serie.story.add(writer)

    # Spanish publisher
    if 'spanish_publisher' in s and s['spanish_publisher']:
        pub, is_new = Publisher.objects.get_or_create(
            name=s['spanish_publisher']
        )
        # .get(): the URL key may be absent even when the name is present.
        if s.get('spanish_publisher_url'):
            pub.url = s['spanish_publisher_url']
            pub.save()

    # Japanese publisher
    if 'japanese_publisher' in s and s['japanese_publisher']:
        src_pub, is_new = Publisher.objects.get_or_create(
            name=s['japanese_publisher']
        )
        if s.get('japanese_publisher_url'):
            src_pub.url = s['japanese_publisher_url']
            src_pub.save()

        serie.original_publisher = src_pub

    # Volumes
    for index, volume in enumerate(s['published_volumes']):

        # The number is whatever follows the ordinal sign in the volume
        # name; fall back to the position in the list when that fails.
        try:
            number = int(volume['name'].split(u'\u00ba')[1])
        except (IndexError, ValueError):
            number = index
        print('[volume] {}'.format(r_unwanted.sub(" ", volume['name'].encode('utf-8'))))

        if pub is None:
            # Without a publisher the volume cannot be attributed; skip it
            # rather than attach it to an unrelated publisher.
            print('Error: {}'.format('no Spanish publisher for this series'))
            continue

        try:
            # ``vol`` must stay a module-level name: download_file reads it.
            vol, is_new = Volume.objects.get_or_create(
                series=serie,
                number=number,
                publisher=pub
            )

            if is_catala:
                language = Language.objects.get(code='es-ca')
                vol.language = language
                vol.save()

            if 'date' in volume:
                # Dates look like "<Spanish month name> <year>".
                month, year = volume['date'].split(' ')
                month = DATE_VALUES[month]
                year = int(year)

                vol.release_date = datetime.datetime(year, month, 1)
                vol.save()

            # Drop a stored cover that is really a tiny "blacklisted" notice.
            if vol.cover:
                file_path = vol.cover.file.path
                if os.path.getsize(file_path) < 200:
                    with open(file_path, 'rb') as check_cover:
                        stored_blacklisted = b'blacklisted' in check_cover.read()
                    if stored_blacklisted:
                        old_cover = vol.cover
                        vol.cover = None
                        vol.save()
                        old_cover.delete()

            if 'cover' in volume and not vol.cover:
                cover_file = download_file(volume['cover'])
                with open(cover_file, 'rb') as check_local_file:
                    new_blacklisted = b'blacklisted' in check_local_file.read()
                if not new_blacklisted:
                    # Binary mode: the cover is image data.
                    with open(cover_file, 'rb') as f:
                        dj_file = File(f, name=str(uuid.uuid4()))

                        cover, is_new_cover = Image.objects.get_or_create(
                            folder=None,
                            name=str(uuid.uuid4()),
                            file=dj_file
                        )

                        vol.cover = cover
                        vol.save()
                else:
                    print('BLACKLISTED!')
                    print(cover_file)
                    quit()

        except Exception as error:
            # Best effort: report the problem and move on to the next volume.
            print('Error: {}'.format(error))
    serie.save()
    total_pb += 1
    sys.stdout.flush()

    i += 1

    # if i == 20:
    #     break