You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.
 
 
 
 
 
 

271 lines
7.7 KiB

# coding: utf-8
import json
import os
import sys
import requesocks as requests
import uuid
import re
from utils.crawler_listadomanga.progressbar import ProgressBar
# Name the Django settings module (unless the environment already provides
# one) before the Django-dependent imports below are executed.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "shelfzilla.settings.local")
from filer.models.imagemodels import Image
from django.core.files import File
import warnings
import datetime
# Suppress every warning for the remainder of the run.
warnings.filterwarnings('ignore')
# Read the crawler output that drives the whole import.
with open('utils/crawler_listadomanga/data.json', 'r') as f:
    buff = f.read()
new_json = json.loads(buff)

# Overall progress bar: one step per entry of the loaded JSON.
pb_total = dict(
    end=len(new_json),
    width=50,
    fill='#',
    format='%(progress)s%% [%(fill)s%(blank)s]',
)
total_pb = ProgressBar(**pb_total)

# Matches a single newline, tab or carriage return.
r_unwanted = re.compile("[\n\t\r]")

# Shared HTTP session; both schemes go through the local SOCKS5 proxy.
session = requests.session()
session.proxies = dict.fromkeys(
    ('http', 'https'),
    'socks5://127.0.0.1:9150',
)

# Spanish month name -> month number (1-12).
DATE_VALUES = dict(zip(
    (
        'Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio',
        'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre',
    ),
    range(1, 13),
))
def download_file(url):
    """Download ``url`` through the shared proxied session into ``/tmp``.

    The download is repeated until the saved body is something other than
    the short "blacklisted" placeholder.  After every failed attempt (a
    placeholder body or an error while fetching/writing) a new Tor identity
    is requested through the local control port before retrying.

    This function only deals with the download; it does not touch any
    database object.  Callers are responsible for clearing stale covers.

    :param url: address of the file to fetch.
    :returns: path of the local file holding the response body.
    """
    local_filename = "/tmp/{}".format(str(uuid.uuid4()))
    renew_identity = False
    while True:
        if renew_identity:
            print('=> RENEWING TOR IDENTITY...')
            # Imported lazily: stem is only needed once a retry happens.
            from stem import Signal
            from stem.control import Controller
            # NOTE(review): control port and password are hard-coded and
            # must match the local Tor configuration.
            with Controller.from_port(port=9051) as controller:
                controller.authenticate("1234")
                controller.signal(Signal.NEWNYM)
            renew_identity = False
        try:
            r = session.get(url)
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            # Only very small bodies can be the placeholder page.
            if os.path.getsize(local_filename) >= 200:
                return local_filename
            # Binary mode: the body may be image data, not decodable text.
            with open(local_filename, 'rb') as check_cover:
                blacklisted = b'blacklisted' in check_cover.read()
            if not blacklisted:
                return local_filename
        except Exception as error:
            # Best-effort retry, but leave Ctrl-C / SystemExit alone and
            # report what went wrong instead of hiding it.
            print('Download error: {!r}'.format(error))
        renew_identity = True
def clear():
    """Blank the terminal by running the ``clear`` shell command."""
    shell_command = 'clear'
    os.system(shell_command)
def update_pbs(title):
    """Redraw the screen: overall progress bar, then the current title.

    :param title: text of the entry being processed; it is encoded to
        UTF-8 before being printed.
    """
    clear()
    bar_line = '{}'.format(total_pb)
    print(bar_line)
    encoded_title = title.encode('utf-8')
    print('Now working: {}'.format(encoded_title))
# filename = download_file('http://ifconfig.me/ip')
# os.remove(filename)
i = 0
from shelfzilla.apps.manga.models import Series, Person, Publisher, Volume, Language


def _has_blacklist_notice(path):
    """Return True when the file at ``path`` contains the text 'blacklisted'."""
    # Binary mode: the file is usually image data, not decodable text.
    with open(path, 'rb') as handle:
        return b'blacklisted' in handle.read()


# Walk every entry of the loaded JSON and create/update the matching series,
# people, publishers, volumes and cover images.
for s in new_json:
    # Reset per series so volumes never inherit the previous series'
    # publisher when this entry has no Spanish publisher.
    pub = None
    update_pbs(s['name'])
    # TODO collections! (names containing '(' get no special handling yet)
    serie, _ = Series.objects.get_or_create(
        name=s['name']
    )

    # Normalise publisher names that appear under more than one spelling.
    if 'spanish_publisher' in s:
        if u'Editores de Tebeos' in s['spanish_publisher']:
            s['spanish_publisher'] = u'Ediciones Glénat / EDT'
        if u'ECC Ediciones' in s['spanish_publisher']:
            s['spanish_publisher'] = u'El Catálogo del Cómic'
    # TODO collections

    # Summary
    if s['summary'] != '':
        serie.summary = s['summary']

    # Publication status, from the Spanish and then the Catalan numbering.
    # TODO catala?
    is_catala = 'catala_numbers' in s
    for numbers_key in ('spanish_numbers', 'catala_numbers'):
        if numbers_key in s:
            if 'completa' in s[numbers_key]:
                serie.finished = True
                serie.status = 'finished'
            if 'cancelada' in s[numbers_key]:
                serie.status = 'cancelled'

    # Art: comma-separated list of names.
    if s.get('art'):
        for person in s['art'].split(','):
            artist, _ = Person.objects.get_or_create(
                name=person.strip()
            )
            serie.art.add(artist)

    # Story: comma-separated list of names.
    if s.get('story'):
        for person in s['story'].split(','):
            writer, _ = Person.objects.get_or_create(
                name=person.strip()
            )
            serie.story.add(writer)

    # Spanish publisher (used for every volume of this series).
    if s.get('spanish_publisher'):
        pub, _ = Publisher.objects.get_or_create(
            name=s['spanish_publisher']
        )
        if s['spanish_publisher_url']:
            pub.url = s['spanish_publisher_url']
            pub.save()

    # Japanese (original) publisher.
    if s.get('japanese_publisher'):
        src_pub, _ = Publisher.objects.get_or_create(
            name=s['japanese_publisher']
        )
        if s['japanese_publisher_url']:
            src_pub.url = s['japanese_publisher_url']
            src_pub.save()
        serie.original_publisher = src_pub

    # Volumes
    for index, volume in enumerate(s['published_volumes']):
        # The number follows the ordinal sign (U+00BA) in the volume name;
        # fall back to the list position when it cannot be parsed.
        try:
            number = int(volume['name'].split(u'\u00ba')[1])
        except (IndexError, ValueError):
            number = index
        print('[volume] {}'.format(r_unwanted.sub(" ", volume['name'].encode('utf-8'))))
        try:
            # ``pub`` is None when the series has no Spanish publisher; any
            # resulting database error is reported by the handler below.
            vol, _ = Volume.objects.get_or_create(
                series=serie,
                number=number,
                publisher=pub
            )
            if is_catala:
                vol.language = Language.objects.get(code='es-ca')
                vol.save()
            if 'date' in volume:
                # Dates look like "<Spanish month name> <year>".
                month_name, year = volume['date'].split(' ')
                vol.release_date = datetime.datetime(
                    int(year), DATE_VALUES[month_name], 1
                )
                vol.save()

            # Drop a previously stored cover that is really the tiny
            # "blacklisted" placeholder, so it gets downloaded again below.
            if vol.cover:
                file_path = vol.cover.file.path
                if (os.path.getsize(file_path) < 200
                        and _has_blacklist_notice(file_path)):
                    old_cover = vol.cover
                    vol.cover = None
                    vol.save()
                    old_cover.delete()

            if 'cover' in volume and not vol.cover:
                cover_file = download_file(volume['cover'])
                if _has_blacklist_notice(cover_file):
                    print('BLACKLISTED!')
                    print(cover_file)
                    # SystemExit is not an Exception subclass, so it is not
                    # caught by the handler below; the file is kept on
                    # purpose for inspection.
                    sys.exit()
                try:
                    with open(cover_file, 'rb') as f:
                        dj_file = File(f, name=str(uuid.uuid4()))
                        cover, is_new_cover = Image.objects.get_or_create(
                            folder=None,
                            name=str(uuid.uuid4()),
                            file=dj_file
                        )
                        vol.cover = cover
                        vol.save()
                finally:
                    # The image has been handed to storage (or failed);
                    # either way the temporary download is no longer needed.
                    os.remove(cover_file)
        except Exception as error:
            # Best effort: report the problem and continue with the next volume.
            print('Error: {}'.format(error))

    serie.save()
    total_pb += 1
    sys.stdout.flush()
    i += 1
    # Debug aid: uncomment to stop after the first 20 series.
    # if i == 20:
    #     break