272 lines
7.7 KiB
Python
272 lines
7.7 KiB
Python
|
# coding: utf-8
|
||
|
import json
|
||
|
import os
|
||
|
import sys
|
||
|
import requesocks as requests
|
||
|
import uuid
|
||
|
import re
|
||
|
from utils.crawler_listadomanga.progressbar import ProgressBar
|
||
|
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "shelfzilla.settings.local")
|
||
|
from filer.models.imagemodels import Image
|
||
|
from django.core.files import File
|
||
|
import warnings
|
||
|
import datetime
|
||
|
warnings.filterwarnings('ignore')
|
||
|
# Read the crawler dump in one go and decode it into Python objects.
with open('utils/crawler_listadomanga/data.json', 'r') as data_file:
    buff = data_file.read()

new_json = json.loads(buff)
|
||
|
|
||
|
# Line breaks and tabs that get replaced by spaces before a name is printed.
r_unwanted = re.compile("[\n\t\r]")

# Settings of the overall progress bar: it ends at the number of entries
# found in the JSON dump.
pb_total = dict(
    end=len(new_json),
    width=50,
    fill='#',
    format='%(progress)s%% [%(fill)s%(blank)s]',
)
total_pb = ProgressBar(**pb_total)
|
||
|
|
||
|
# Shared HTTP session; both plain and TLS requests are sent through the
# local SOCKS5 proxy listening on port 9150.
session = requests.session()
session.proxies = dict.fromkeys(
    ('http', 'https'),
    'socks5://127.0.0.1:9150',
)
|
||
|
|
||
|
# Spanish month names in calendar order; DATE_VALUES maps each one to its
# month number (1-12) so release dates can be turned into datetimes.
_MONTH_NAMES = (
    'Enero',
    'Febrero',
    'Marzo',
    'Abril',
    'Mayo',
    'Junio',
    'Julio',
    'Agosto',
    'Septiembre',
    'Octubre',
    'Noviembre',
    'Diciembre',
)
DATE_VALUES = dict(
    (month_name, month_number)
    for month_number, month_name in enumerate(_MONTH_NAMES, 1)
)
|
||
|
|
||
|
def download_file(url):
    """Download ``url`` through the proxied session into a file under /tmp.

    The download is retried without limit. An attempt counts as failed when
    the request or the file write raises, or when the response is smaller
    than 200 bytes and contains the word 'blacklisted'. After a failed
    attempt a new Tor identity is requested before trying again.

    :param url: address of the file to fetch.
    :returns: path of the downloaded file (a random UUID name in /tmp). The
        caller is responsible for removing it.
    :raises: whatever ``stem`` raises if the Tor control port cannot be
        reached or authenticated; that step is deliberately not retried.
    """
    local_filename = "/tmp/{}".format(str(uuid.uuid4()))
    renew_identity = False

    while True:
        if renew_identity:
            print('=> RENEWING TOR IDENTITY...')
            # Imported lazily: stem is only needed once a retry happens.
            from stem import Signal
            from stem.control import Controller

            with Controller.from_port(port=9051) as controller:
                # NOTE(review): hard-coded control-port password; consider
                # reading it from configuration instead.
                controller.authenticate("1234")
                controller.signal(Signal.NEWNYM)
            renew_identity = False

        try:
            # The response is not streamed; iter_content only slices the
            # body into 1 KiB pieces while writing it out.
            r = session.get(url)
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

            # Real covers are larger than this; only tiny responses are
            # inspected for the proxy's "blacklisted" notice.
            if os.path.getsize(local_filename) >= 200:
                return local_filename

            with open(local_filename, 'rb') as check_cover:
                blacklisted = b'blacklisted' in check_cover.read()
            if not blacklisted:
                return local_filename
        except Exception as error:
            # Best effort: report and retry. KeyboardInterrupt/SystemExit
            # are no longer swallowed, so the loop can be interrupted.
            print('=> DOWNLOAD FAILED (%r), RETRYING...' % (error,))

        renew_identity = True
|
||
|
|
||
|
|
||
|
def clear():
    """Wipe the terminal by running the shell's ``clear`` command."""
    command = 'clear'
    os.system(command)
|
||
|
|
||
|
|
||
|
def update_pbs(title):
    """Redraw the screen: the overall progress bar plus the current title.

    :param title: name of the entry being processed; it is UTF-8 encoded
        before being printed.
    """
    clear()
    bar_line = '{}'.format(total_pb)
    print(bar_line)
    encoded_title = title.encode('utf-8')
    print('Now working: {}'.format(encoded_title))
|
||
|
|
||
|
|
||
|
# filename = download_file('http://ifconfig.me/ip')
|
||
|
# os.remove(filename)
|
||
|
|
||
|
|
||
|
i = 0
# Kept below the DJANGO_SETTINGS_MODULE setup done near the top of the file:
# Django models need the settings module to be known when they are imported.
from shelfzilla.apps.manga.models import Series, Person, Publisher, Volume, Language


def _is_blacklisted(path):
    """Return True when the file at ``path`` contains the word 'blacklisted'."""
    with open(path, 'rb') as handle:
        return b'blacklisted' in handle.read()


def _add_people(names, relation):
    """Get or create every comma-separated person in ``names`` and add it to ``relation``."""
    for raw_name in names.split(','):
        person, _created = Person.objects.get_or_create(name=raw_name.strip())
        relation.add(person)


def _get_publisher(name, url):
    """Get or create the publisher called ``name``; store ``url`` when one is given."""
    publisher, _created = Publisher.objects.get_or_create(name=name)
    if url:
        publisher.url = url
        publisher.save()
    return publisher


def _apply_publication_status(serie, numbers):
    """Mark ``serie`` finished/cancelled according to the ``numbers`` text."""
    if 'completa' in numbers:
        serie.finished = True
        serie.status = 'finished'
    if 'cancelada' in numbers:
        serie.status = 'cancelled'


for s in new_json:
    is_catala = False
    # Reset for every series so a publisher found for a previous series can
    # never be attached to this one's volumes.
    # NOTE(review): if Volume.publisher does not accept NULL, series without
    # a Spanish publisher will show up as errors printed by the volume
    # handler below -- confirm against the model.
    pub = None
    update_pbs(s['name'])

    # TODO collections! (names containing '(' are not treated specially yet)

    serie, _created = Series.objects.get_or_create(
        name=s['name']
    )

    # Replaces: publishers that are stored under a different name.
    spanish_publisher = s.get('spanish_publisher')
    if spanish_publisher:
        if u'Editores de Tebeos' in spanish_publisher:
            spanish_publisher = u'Ediciones Glénat / EDT'
        if u'ECC Ediciones' in spanish_publisher:
            spanish_publisher = u'El Catálogo del Cómic'

    # TODO collections

    # Summary
    summary = s.get('summary')
    if summary:
        serie.summary = summary

    # Completed series
    # TODO catala?
    if 'spanish_numbers' in s:
        _apply_publication_status(serie, s['spanish_numbers'] or '')

    if 'catala_numbers' in s:
        # Every volume of this series gets the 'es-ca' language below.
        is_catala = True
        _apply_publication_status(serie, s['catala_numbers'] or '')

    # Art
    if s.get('art'):
        _add_people(s['art'], serie.art)

    # Story
    if s.get('story'):
        _add_people(s['story'], serie.story)

    # Spanish publisher
    if spanish_publisher:
        pub = _get_publisher(spanish_publisher, s.get('spanish_publisher_url'))

    # Japanese publisher
    if s.get('japanese_publisher'):
        serie.original_publisher = _get_publisher(
            s['japanese_publisher'],
            s.get('japanese_publisher_url')
        )

    # Volumes
    for index, volume in enumerate(s.get('published_volumes') or []):
        # The volume number is whatever follows the ordinal sign in the
        # name; fall back to the position in the list when it is absent or
        # not a number.
        try:
            number = int(volume['name'].split(u'\u00ba')[1])
        except (IndexError, ValueError):
            number = index
        print('[volume] {}'.format(r_unwanted.sub(" ", volume['name'].encode('utf-8'))))

        # Best effort per volume: a failure is reported and the import
        # carries on with the next volume.
        try:
            vol, is_new = Volume.objects.get_or_create(
                series=serie,
                number=number,
                publisher=pub
            )

            if is_catala:
                vol.language = Language.objects.get(code='es-ca')
                vol.save()

            if 'date' in volume:
                # Dates come as "<Spanish month name> <year>".
                month_name, year = volume['date'].split(' ')
                vol.release_date = datetime.datetime(
                    int(year), DATE_VALUES[month_name], 1
                )
                vol.save()

            # Drop a previously stored cover that is really the tiny
            # "blacklisted" notice, so it gets downloaded again below.
            if vol.cover:
                file_path = vol.cover.file.path
                if os.path.getsize(file_path) < 200 and _is_blacklisted(file_path):
                    old_cover = vol.cover
                    vol.cover = None
                    vol.save()
                    old_cover.delete()

            if 'cover' in volume and not vol.cover:
                cover_file = download_file(volume['cover'])

                if _is_blacklisted(cover_file):
                    print('BLACKLISTED!')
                    print(cover_file)
                    # Abort the whole import with a failure status; the
                    # downloaded file is left in place for inspection.
                    sys.exit(1)

                # Binary mode: the file is an image. The Image row must be
                # created while the file is still open so its content can
                # be read.
                with open(cover_file, 'rb') as cover_handle:
                    dj_file = File(cover_handle, name=str(uuid.uuid4()))
                    cover, is_new_cover = Image.objects.get_or_create(
                        folder=None,
                        name=str(uuid.uuid4()),
                        file=dj_file
                    )

                vol.cover = cover
                vol.save()
                # The temporary download is no longer needed once the cover
                # has been attached to the volume.
                os.remove(cover_file)

        except Exception as error:
            print('Error: {}'.format(error))

    serie.save()
    total_pb += 1
    sys.stdout.flush()

    i += 1

    # if i == 20:
    #     break
|