# PyComicPackRouMan/utils/downloader.py
""" Download image according to given urls and automatically rename them in order. """
# -*- coding: utf-8 -*-
# author: Yabin Zheng
# Email: sczhengyabin@hotmail.com
from __future__ import print_function
from queue import Queue, Empty
import shutil
import imghdr
import os
import concurrent.futures
import requests
import time
from utils.Ntfy import ntfy
from utils.comic.ComicInfo import comicInfo
from utils.HtmlUtils import htmlUtils
from utils.FileUtils import fileUtils as fu
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Proxy-Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, sdch",
    # 'Connection': 'close',
}
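# Shared retry buffer: failed or corrupted downloads are re-queued here and
# drained by download_image() workers.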
down_queue = Queue()
def common_download(file_name, image_url, dst_dir, timeout=10, proxy=None, proxy_type=None):
    proxies = None
    if proxy_type is not None:
        proxies = {
            "http": proxy_type + "://" + proxy,
            "https": proxy_type + "://" + proxy,
        }
    file_path = os.path.join(dst_dir, file_name)
    if os.path.exists(file_path):
        print("download_image: file already exists, skipped =", file_path)
        return None
    temp_path = os.path.join(dst_dir, file_name + ".downloads")
    repair_count = 1
    response = requests.get(
        image_url, headers=headers, timeout=timeout, proxies=proxies)
    while response.status_code != 200 and repair_count <= 5:
        time.sleep(0.7)
        # Re-issue the request instead of recursing into the queue worker.
        response = requests.get(
            image_url, headers=headers, timeout=timeout, proxies=proxies)
        ntfy.sendMsg(f"Retry #{repair_count}: {image_url}")
        repair_count += 1
    with open(temp_path, 'wb') as f:
        f.write(response.content)
    response.close()
    # Verify the downloaded file really is an image.
    if fu.ver_file(temp_path, type="image"):
        shutil.move(temp_path, file_path)
        print("## OK: {} {}".format(file_path, image_url))
    else:
        print("## Fail: {} {}".format(image_url, "corrupted image"))
        # Re-queue so a download_image() worker can retry it.
        down_queue.put([file_name, image_url, dst_dir])
def download_image(timeout=20, proxy_type=None, proxy=None):
    """Worker: drain down_queue, re-queueing failed items, for up to 10 passes."""
    repeat = 0
    while not down_queue.empty() and repeat <= 10:
        repeat += 1
        try:
            file_name, image_url, dst_dir = down_queue.get(False)
        except Empty:
            break  # another worker drained the queue first
        if repeat > 1:
            ntfy.sendMsg(f"Downloading, attempt #{repeat}... file_name={file_name}")
        try:
            common_download(file_name, image_url, dst_dir,
                            timeout=timeout, proxy=proxy, proxy_type=proxy_type)
        except Exception:
            ntfy.sendMsg(f"Retrying download {file_name}={image_url}")
            down_queue.put([file_name, image_url, dst_dir])
def download_images(image_urls, dst_dir, concurrency=None, timeout=20,
                    proxy_type=None, proxy=None, files_name=None):
    """
    Download images from the given URLs and automatically rename them in order.
    :param image_urls: list of image URLs
    :param dst_dir: directory the downloaded images are written to
    :param concurrency: number of requests processed simultaneously
                        (defaults to one worker per URL)
    :param timeout: per-request timeout in seconds
    :param proxy_type: proxy scheme, e.g. "http"
    :param proxy: proxy address as "host:port"
    :param files_name: list of target file names, aligned with image_urls
    :return: None
    """
    if concurrency is None:
        concurrency = len(image_urls)
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        future_list = []
        for count, image_url in enumerate(image_urls):
            file_name = files_name[count]
            down_queue.put([file_name, image_url, dst_dir])
            future_list.append(executor.submit(
                download_image, timeout, proxy_type, proxy))
        # Wait for all workers; per-request timeouts are handled in common_download.
        concurrent.futures.wait(future_list)
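# A minimal usage sketch (the URLs, directory, and file names below are
# hypothetical, shown for illustration only):
#
#   urls = ["https://example.com/p/001.jpg", "https://example.com/p/002.jpg"]
#   names = ["001.jpg", "002.jpg"]
#   download_images(urls, "downloads/comic", concurrency=4, files_name=names)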
def download_comic_icon(is_new=comicInfo.IS_NEW_ICON):
    icon_url = comicInfo.getIcon()
    if icon_url is None:
        print("icon does not exist, skipped")
        return None
    save_name = comicInfo.COMIC_ICON_NAME
    # Derive the file extension from the URL, dropping any query string.
    icon_ext = "." + str(icon_url).split(".")[-1]
    icon_ext = icon_ext.split("?")[0]
    # If comicname/cover.jpg already exists, skip the download.
    path_comic_icon = os.path.join(comicInfo.getDirConfComic(), save_name + icon_ext)
    if not comicInfo.equIcon() and fu.exists(path_comic_icon):
        os.remove(path_comic_icon)
    if fu.notExists(path_comic_icon):
        download(icon_url, path_comic_icon)
    save_path = os.path.join(comicInfo.getDirCBZComic(), comicInfo.getChapter() + icon_ext)
    if is_new:
        # Remove the icon left over from a previous version.
        if os.path.exists(save_path):
            os.remove(save_path)
        if os.path.exists(path_comic_icon):
            base_dir = comicInfo.getDirComicChapter()
            if not os.path.exists(base_dir):
                os.makedirs(base_dir)
            shutil.copy(path_comic_icon, os.path.join(base_dir, save_name + icon_ext))
    else:
        if fu.notExists(comicInfo.getDirCBZComic()):
            os.makedirs(comicInfo.getDirCBZComic())
        shutil.copy(path_comic_icon, save_path)
        print(f"{path_comic_icon} copied to: {save_path}")
    # Persist the icon info.
    comicInfo.iconDB()
    comicInfo.nextDownloadToCBZChapter()
    comicInfo.setProgress(comicInfo.PROGRESS_CBZ)
# Generic file download helper.
def download(url, path, file_type=None):
    if os.path.exists(path):
        if imghdr.what(path):
            msg = "A file already exists at this path, skipped: " + path
            print(msg)
            return msg
        else:
            print("Existing file is corrupted, re-downloading: " + path)
    # Strip any query string left in the file name.
    path = os.path.join(os.path.dirname(path), str(os.path.basename(path)).split("?")[0])
    tmp_file = path + ".downloads"
    if os.path.exists(tmp_file):
        os.remove(tmp_file)
        print("Stale temp file found and deleted:", tmp_file)
    repair_count = 1
    res = htmlUtils.getBytes(url)
    while res.status_code != 200 and repair_count <= 5:
        res = htmlUtils.getBytes(url)
        print(f"Retry #{repair_count}: {url}")
        repair_count += 1
    # When requested, check that the response really is an image.
    if file_type == "image":
        if 'image' not in res.headers.get("content-type", ""):
            print(f"url= {url} Error: URL does not appear to be an image")
    basedir = os.path.dirname(path)
    if not os.path.exists(basedir):
        os.makedirs(basedir)
    # Stream the body to a temp file, then move it into place.
    with open(tmp_file, 'wb') as f:
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
    shutil.move(tmp_file, path)
    print(f"url={url} saved to: {path}")
    return path
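# Minimal sketch of a direct call (hypothetical URL and path; passing
# file_type="image" enables the content-type check above):
#
#   download("https://example.com/cover.jpg", "downloads/cover.jpg",
#            file_type="image")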