# coding: utf-8
"""Import the crawler output in ``data.json`` into the Shelfzilla database.

The script loads ``utils/crawler_listadomanga/data.json`` (the path is
relative to the current working directory) and, for every series in it,
creates or updates the series, its authors, its publishers and its volumes.
Volume covers are downloaded through a local SOCKS5 proxy; when a download
fails or returns a "blacklisted" response, a new Tor identity is requested
and the download is retried.

The import runs at module level, exactly as before: executing or importing
this file performs the whole import.
"""
import datetime
import json
import os
import re
import sys
import tempfile
import uuid
import warnings

import requesocks as requests

from utils.crawler_listadomanga.progressbar import ProgressBar

# The settings module must be selected before any model module is imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "shelfzilla.settings.local")

from django.core.files import File
from filer.models.imagemodels import Image

# Kept ahead of the project model import so warnings raised while importing
# it stay silenced, as in the original statement order.
warnings.filterwarnings('ignore')

from shelfzilla.apps.manga.models import (
    Language,
    Person,
    Publisher,
    Series,
    Volume,
)

DATA_FILE = 'utils/crawler_listadomanga/data.json'

# Local proxy used for every HTTP request, and the control port used to ask
# for a new identity.
TOR_PROXY = 'socks5://127.0.0.1:9150'
TOR_CONTROL_PORT = 9051
TOR_CONTROL_PASSWORD = "1234"

# A downloaded "cover" smaller than BLACKLIST_MAX_SIZE bytes that contains
# BLACKLIST_MARKER is treated as a refusal page rather than an image.
BLACKLIST_MARKER = b'blacklisted'
BLACKLIST_MAX_SIZE = 200

# Spanish month names, as they appear in a volume's 'date' field.
DATE_VALUES = {
    'Enero': 1,
    'Febrero': 2,
    'Marzo': 3,
    'Abril': 4,
    'Mayo': 5,
    'Junio': 6,
    'Julio': 7,
    'Agosto': 8,
    'Septiembre': 9,
    'Octubre': 10,
    'Noviembre': 11,
    'Diciembre': 12,
}

# (substring to look for, canonical publisher name), applied in order.
PUBLISHER_ALIASES = (
    (u'Editores de Tebeos', u'Ediciones Glénat / EDT'),
    (u'ECC Ediciones', u'El Catálogo del Cómic'),
)

with open(DATA_FILE, 'r') as f:
    new_json = json.load(f)

pb_total = {
    'end': len(new_json),
    'width': 50,
    'fill': '#',
    'format': '%(progress)s%% [%(fill)s%(blank)s]',
}

r_unwanted = re.compile("[\n\t\r]")
total_pb = ProgressBar(**pb_total)

session = requests.session()
session.proxies = {
    'http': TOR_PROXY,
    'https': TOR_PROXY,
}


def _is_blacklisted(path, max_size=BLACKLIST_MAX_SIZE):
    """Return True when the file at *path* contains the blacklist marker.

    Files of *max_size* bytes or more are assumed to be real images and are
    not read. Pass ``max_size=None`` to inspect the file whatever its size.
    """
    if max_size is not None and os.path.getsize(path) >= max_size:
        return False
    with open(path, 'rb') as handle:
        return BLACKLIST_MARKER in handle.read()


def _renew_tor_identity():
    """Send the NEWNYM signal to the local Tor control port."""
    print('=> RENEWING TOR IDENTITY...')
    # Imported lazily: stem is only needed when a download has to be retried.
    from stem import Signal
    from stem.control import Controller

    with Controller.from_port(port=TOR_CONTROL_PORT) as controller:
        controller.authenticate(TOR_CONTROL_PASSWORD)
        controller.signal(Signal.NEWNYM)


def download_file(url):
    """Download *url* into a new temporary file and return its path.

    The download is retried, after requesting a new Tor identity, whenever
    the request raises or the result is a small file containing the
    blacklist marker. There is no retry limit. An error raised while
    renewing the identity is not caught here and propagates to the caller.
    """
    local_filename = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    needs_new_identity = False
    while True:
        if needs_new_identity:
            _renew_tor_identity()
            needs_new_identity = False

        try:
            response = session.get(url)
            with open(local_filename, 'wb') as out:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        out.write(chunk)
            if not _is_blacklisted(local_filename):
                return local_filename
        except Exception as error:
            # Not a bare ``except``: Ctrl-C must be able to stop the loop.
            print('Download error: {}'.format(error))

        needs_new_identity = True


def clear():
    """Clear the terminal."""
    os.system('clear')


def update_pbs(title):
    """Redraw the progress bar and show the series being processed."""
    clear()
    print('{}'.format(total_pb))
    print('Now working: {}'.format(title.encode('utf-8')))


def _canonical_publisher_name(name):
    """Map known publisher name variants to their canonical name."""
    if not name:
        return name
    for needle, canonical in PUBLISHER_ALIASES:
        if needle in name:
            name = canonical
    return name


def _apply_status(serie, numbers):
    """Update *serie*'s status from a '..._numbers' text, if any."""
    if not numbers:
        return
    if 'completa' in numbers:
        serie.finished = True
        serie.status = 'finished'
    if 'cancelada' in numbers:
        serie.status = 'cancelled'


def _attach_people(relation, names):
    """Add every person in the comma-separated *names* to *relation*."""
    if not names:
        return
    for raw_name in names.split(','):
        name = raw_name.strip()
        if not name:
            # A stray comma must not create a person with an empty name.
            continue
        person, _ = Person.objects.get_or_create(name=name)
        relation.add(person)


def _get_publisher(name, url):
    """Return the publisher called *name* (created if needed), or None.

    When *url* is truthy it is stored on the publisher.
    """
    if not name:
        return None
    publisher, _ = Publisher.objects.get_or_create(name=name)
    if url:
        publisher.url = url
        publisher.save()
    return publisher


def _parse_release_date(text):
    """Turn '<Spanish month name> <year>' into the first day of that month."""
    month_name, year = text.split(' ')
    return datetime.datetime(int(year), DATE_VALUES[month_name], 1)


def _drop_cover(vol):
    """Detach and delete the cover currently attached to *vol*."""
    old_cover = vol.cover
    vol.cover = None
    vol.save()
    old_cover.delete()


def _attach_cover(vol, url):
    """Download the cover at *url* and attach it to *vol*.

    Exits the script with status 1 if the downloaded file still contains the
    blacklist marker; the file is left on disk in that case so it can be
    inspected.
    """
    cover_file = download_file(url)
    if _is_blacklisted(cover_file, max_size=None):
        print('BLACKLISTED!')
        print(cover_file)
        sys.exit(1)

    with open(cover_file, 'rb') as handle:
        dj_file = File(handle, name=str(uuid.uuid4()))
        cover, _ = Image.objects.get_or_create(
            folder=None,
            name=str(uuid.uuid4()),
            file=dj_file
        )

    vol.cover = cover
    vol.save()
    # NOTE(review): assumes the image model copies the upload into its own
    # storage on save, so the temporary download is no longer needed.
    os.remove(cover_file)


def _import_volume(serie, publisher, is_catala, index, volume):
    """Create or update one volume of *serie* from its crawled data.

    Errors raised while saving the volume are printed and swallowed so the
    rest of the import can continue.
    """
    try:
        # Names are expected to carry the number after the ordinal sign.
        number = int(volume['name'].split(u'\u00ba')[1])
    except (IndexError, ValueError):
        number = index
        print('[volume] {}'.format(
            r_unwanted.sub(" ", volume['name'].encode('utf-8'))))

    try:
        vol, _ = Volume.objects.get_or_create(
            series=serie,
            number=number,
            publisher=publisher
        )

        if is_catala:
            vol.language = Language.objects.get(code='es-ca')
            vol.save()

        if 'date' in volume:
            vol.release_date = _parse_release_date(volume['date'])
            vol.save()

        # Get rid of a refusal page stored as a cover by an earlier run.
        if vol.cover and _is_blacklisted(vol.cover.file.path):
            _drop_cover(vol)

        if 'cover' in volume and not vol.cover:
            _attach_cover(vol, volume['cover'])

    except Exception as error:
        print('Error: {}'.format(error))


# filename = download_file('http://ifconfig.me/ip')
# os.remove(filename)

for s in new_json:
    update_pbs(s['name'])

    # TODO collections! Names containing '(' are not treated specially yet.
    serie, _ = Series.objects.get_or_create(
        name=s['name']
    )

    # Summary
    summary = s.get('summary')
    if summary:
        serie.summary = summary

    # Completed / cancelled series
    # TODO catala?
    is_catala = 'catala_numbers' in s
    _apply_status(serie, s.get('spanish_numbers'))
    _apply_status(serie, s.get('catala_numbers'))

    # Art and story
    _attach_people(serie.art, s.get('art'))
    _attach_people(serie.story, s.get('story'))

    # Spanish publisher. Looked up afresh for every series, so a series
    # without one never inherits the publisher of the previous series.
    pub = _get_publisher(
        _canonical_publisher_name(s.get('spanish_publisher')),
        s.get('spanish_publisher_url')
    )

    # Japanese publisher
    src_pub = _get_publisher(
        s.get('japanese_publisher'),
        s.get('japanese_publisher_url')
    )
    if src_pub is not None:
        serie.original_publisher = src_pub

    # Volumes
    for index, volume in enumerate(s.get('published_volumes') or []):
        _import_volume(serie, pub, is_catala, index, volume)

    serie.save()
    total_pb += 1
    sys.stdout.flush()