Crawler io
This commit is contained in:
parent
debb4abf2e
commit
dbe50f3947
|
@ -0,0 +1,271 @@
|
|||
# coding: utf-8
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import requesocks as requests
|
||||
import uuid
|
||||
import re
|
||||
from utils.crawler_listadomanga.progressbar import ProgressBar
|
||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "shelfzilla.settings.local")
|
||||
from filer.models.imagemodels import Image
|
||||
from django.core.files import File
|
||||
import warnings
|
||||
import datetime
|
||||
warnings.filterwarnings('ignore')
|
||||
# Read the whole crawler dump into memory, then decode it as JSON.
# The path is relative, so the script has to be started from the directory
# that contains ``utils/``.
with open('utils/crawler_listadomanga/data.json', 'r') as f:
    buff = f.read()
new_json = json.loads(buff)
|
||||
|
||||
# Matches newlines, tabs and carriage returns; used further down to flatten
# volume names before they are printed.
r_unwanted = re.compile("[\n\t\r]")

# Settings for the overall progress bar: its end value is the number of
# entries in the loaded dump, and the import loop advances it once per entry.
pb_total = dict(
    end=len(new_json),
    width=50,
    fill='#',
    format='%(progress)s%% [%(fill)s%(blank)s]',
)
total_pb = ProgressBar(**pb_total)
|
||||
|
||||
# Shared HTTP session used by download_file(). Both schemes are sent through
# the same local SOCKS5 proxy (presumably the Tor client, given the identity
# renewal done in download_file - confirm against the deployment).
session = requests.session()
session.proxies = dict.fromkeys(('http', 'https'), 'socks5://127.0.0.1:9150')
|
||||
|
||||
# Spanish month name -> month number (1-12). The import loop uses it to turn
# the month word of a volume's "<month> <year>" date text into an integer.
DATE_VALUES = dict(zip(
    (
        'Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio',
        'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre',
    ),
    range(1, 13),
))
|
||||
|
||||
def download_file(url):
    """Download ``url`` through the module-level proxied ``session``.

    The response body is written to a freshly named file under ``/tmp``.
    An attempt counts as failed when the request or the write raises, or
    when the saved file is smaller than 200 bytes and contains the text
    ``blacklisted``. After a failed attempt a new Tor identity is requested
    through the local control port and the download is tried again; there is
    no retry limit, so the function only returns after a successful attempt.

    Args:
        url: Address of the file to fetch.

    Returns:
        Path of the downloaded file. The file is not removed here; the
        caller is responsible for it.

    Raises:
        Whatever ``stem`` raises while renewing the identity (that step is
        deliberately left outside the retry handling, as before).
    """
    local_filename = "/tmp/{}".format(str(uuid.uuid4()))
    renew_identity = False

    while True:
        if renew_identity:
            print('=> RENEWING TOR IDENTITY...')
            # Imported lazily so stem is only required once a retry happens.
            from stem import Signal
            from stem.control import Controller

            with Controller.from_port(port=9051) as controller:
                controller.authenticate("1234")
                controller.signal(Signal.NEWNYM)

        try:
            # The body is fetched with a plain get() and then written out in
            # 1 KiB chunks; closing the file flushes it.
            r = session.get(url)
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

            # Only tiny responses are inspected for the block message.
            blacklisted = False
            if os.path.getsize(local_filename) < 200:
                with open(local_filename, 'rb') as check_cover:
                    blacklisted = b'blacklisted' in check_cover.read()

            if not blacklisted:
                return local_filename
        except Exception as error:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit can still stop this otherwise endless loop.
            # repr() avoids encoding problems with non-ASCII messages.
            print('=> DOWNLOAD FAILED: {!r}'.format(error))

        # Either the proxy exit was blacklisted or the request failed:
        # switch identity before the next attempt.
        renew_identity = True
|
||||
|
||||
|
||||
def clear():
    """Wipe the terminal by running the shell's ``clear`` command."""
    command = 'clear'
    os.system(command)
|
||||
|
||||
|
||||
def update_pbs(title):
    """Redraw the screen: the overall progress bar plus the current title.

    Args:
        title: Name of the entry being processed; it is encoded to UTF-8
            before being printed.
    """
    clear()
    bar_line = '{}'.format(total_pb)
    status_line = 'Now working: {}'.format(title.encode('utf-8'))
    print(bar_line)
    print(status_line)
|
||||
|
||||
|
||||
# filename = download_file('http://ifconfig.me/ip')
|
||||
# os.remove(filename)
|
||||
|
||||
|
||||
i = 0
# This import sits below the module setup rather than at the top of the file:
# the DJANGO_SETTINGS_MODULE default is set earlier in this file, before any
# model import.
from shelfzilla.apps.manga.models import Series, Person, Publisher, Volume, Language


def _has_blacklist_marker(path):
    """Return True when the file at ``path`` contains the text 'blacklisted'."""
    with open(path, 'rb') as handle:
        return b'blacklisted' in handle.read()


def _apply_status(serie, numbers):
    """Set finished/cancelled on ``serie`` from a ``*_numbers`` text."""
    if 'completa' in numbers:
        serie.finished = True
        serie.status = 'finished'
    if 'cancelada' in numbers:
        serie.status = 'cancelled'


def _people_from(names):
    """Yield a Person for every non-empty, comma separated name in ``names``."""
    for raw_name in names.split(','):
        name = raw_name.strip()
        # A trailing or doubled comma would otherwise create a nameless Person.
        if name:
            yield Person.objects.get_or_create(name=name)[0]


def _publisher_for(name, url):
    """Fetch or create the Publisher ``name``; store ``url`` when one is given."""
    publisher, _ = Publisher.objects.get_or_create(name=name)
    if url:
        publisher.url = url
        publisher.save()
    return publisher


def _import_volume(serie, publisher, is_catala, index, volume):
    """Create or update one Volume of ``serie`` and make sure it has a cover.

    Errors raised while saving the volume are printed and swallowed so a
    single bad volume does not stop the import.
    """
    # The number is whatever follows the ordinal sign (U+00BA) in the volume
    # name; the position in the list is the fallback.
    try:
        number = int(volume['name'].split(u'\u00ba')[1])
    except (IndexError, ValueError):
        number = index
    print('[volume] {}'.format(r_unwanted.sub(" ", volume['name'].encode('utf-8'))))

    try:
        vol, _ = Volume.objects.get_or_create(
            series=serie,
            number=number,
            publisher=publisher
        )

        if is_catala:
            vol.language = Language.objects.get(code='es-ca')
            vol.save()

        if 'date' in volume:
            # Dates come as "<Spanish month name> <year>".
            month, year = volume['date'].split(' ')
            vol.release_date = datetime.datetime(
                int(year), DATE_VALUES[month], 1
            )
            vol.save()

        # Drop a stored cover that is really a tiny "blacklisted" response.
        if vol.cover:
            file_path = vol.cover.file.path
            if (os.path.getsize(file_path) < 200
                    and _has_blacklist_marker(file_path)):
                old_cover = vol.cover
                vol.cover = None
                vol.save()
                old_cover.delete()

        if 'cover' in volume and not vol.cover:
            cover_file = download_file(volume['cover'])
            if _has_blacklist_marker(cover_file):
                # The file is kept on disk so it can be inspected.
                print('BLACKLISTED!')
                print(cover_file)
                # SystemExit is not an Exception, so the handler below does
                # not swallow it.
                sys.exit()
            try:
                # Binary mode: the payload is an image, not text.
                with open(cover_file, 'rb') as f:
                    dj_file = File(f, name=str(uuid.uuid4()))
                    cover, _ = Image.objects.get_or_create(
                        folder=None,
                        name=str(uuid.uuid4()),
                        file=dj_file
                    )
                vol.cover = cover
                vol.save()
            finally:
                # The downloaded temporary file is no longer needed.
                os.remove(cover_file)
    except Exception as error:
        print('Error: {}'.format(error))


# Import every entry of the crawler dump: series data, people, publishers,
# volumes and covers.
for s in new_json:
    update_pbs(s['name'])

    # TODO collections! (names containing '(' get no special handling yet)

    serie, _ = Series.objects.get_or_create(name=s['name'])

    # Replaces: normalise publisher names to the ones used in the database.
    spanish_publisher = s.get('spanish_publisher')
    if spanish_publisher:
        if u'Editores de Tebeos' in spanish_publisher:
            spanish_publisher = u'Ediciones Glénat / EDT'
        if u'ECC Ediciones' in spanish_publisher:
            spanish_publisher = u'El Catálogo del Cómic'
        s['spanish_publisher'] = spanish_publisher

    # Summary
    if s.get('summary'):
        serie.summary = s['summary']

    # Completed / cancelled series
    # TODO catala?
    if 'spanish_numbers' in s:
        _apply_status(serie, s['spanish_numbers'] or '')

    is_catala = 'catala_numbers' in s
    if is_catala:
        _apply_status(serie, s['catala_numbers'] or '')

    # Art
    if s.get('art'):
        for artist in _people_from(s['art']):
            serie.art.add(artist)

    # Story
    if s.get('story'):
        for writer in _people_from(s['story']):
            serie.story.add(writer)

    # Spanish publisher. Reset for every series so that an entry without one
    # does not silently reuse the publisher of the previous entry.
    pub = None
    if spanish_publisher:
        pub = _publisher_for(spanish_publisher, s.get('spanish_publisher_url'))

    # Japanese publisher
    if s.get('japanese_publisher'):
        serie.original_publisher = _publisher_for(
            s['japanese_publisher'], s.get('japanese_publisher_url')
        )

    # Volumes
    for index, volume in enumerate(s.get('published_volumes') or []):
        _import_volume(serie, pub, is_catala, index, volume)

    serie.save()
    total_pb += 1
    sys.stdout.flush()

    i += 1
|
||||
|
||||
# if i == 20:
|
||||
# break
|
Reference in New Issue