fix

parent 2ed9aba6dd
commit be1e963cb7

Comics/_utils/downloader.py (new file, +76 lines)
@@ -0,0 +1,76 @@
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue


# Image download helper
def download_image(url, save_path, retry_queue, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # raise on HTTP error status
            # Make sure the target directory exists
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            with open(save_path, "wb") as image_file:
                image_file.write(response.content)
            print(f"Download succeeded: {save_path}")
            return True
        except Exception as e:
            retries += 1
            print(f"Download failed: {url} error: {e} attempt: {retries}")
    # Max retries reached: hand the task to the retry queue
    retry_queue.put((url, save_path))
    return False


# Multi-threaded downloader with a retry pass for failed tasks
def download_images(urls_with_paths, max_workers=20, max_retries=3):
    retry_queue = Queue()
    # First pass: download everything on a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_url = {
            executor.submit(download_image, url, save_path, retry_queue, max_retries): (url, save_path)
            for url, save_path in urls_with_paths
        }
        # Monitor progress as futures complete
        for future in as_completed(future_to_url):
            url, save_path = future_to_url[future]
            try:
                future.result()
            except Exception as e:
                print(f"Download failed: {url} error: {e}")

    # Retry pass for failed tasks. Tasks that fail again are re-queued by
    # download_image, so cap the number of rounds; otherwise a permanently
    # unreachable URL would keep this loop spinning forever.
    retry_rounds = 0
    while not retry_queue.empty() and retry_rounds < max_retries:
        retry_rounds += 1
        retry_tasks = []
        while not retry_queue.empty():
            retry_tasks.append(retry_queue.get())

        # Retry the failed downloads
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(download_image, url, save_path, retry_queue, max_retries): (url, save_path)
                for url, save_path in retry_tasks
            }
            # Monitor progress as futures complete
            for future in as_completed(future_to_url):
                url, save_path = future_to_url[future]
                try:
                    future.result()
                except Exception as e:
                    print(f"Download failed: {url} error: {e} (retry pass)")

    # Anything still queued after the capped rounds is a permanent failure
    while not retry_queue.empty():
        url, save_path = retry_queue.get()
        print(f"Giving up: {url}")


if __name__ == "__main__":
    # Example URLs and their target save paths
    urls_with_custom_paths = [
        ("https://example.com/image1.jpg", "./images/custom_name1.jpg"),
        ("https://example.com/image2.jpg", "./images/folder1/custom_name2.jpg"),
        ("https://example.com/image3.jpg", "./images/folder2/custom_name3.jpg"),
    ]
    # Start the multi-threaded download with retry support
    download_images(urls_with_custom_paths, max_workers=20, max_retries=3)
@@ -466,7 +466,6 @@ class imageUtils:

            newh += b_h
        newimage.save(save_path)
        time.sleep(0.1)
        logging.info(f"decrypted successfully {img_path} {save_path}")
        if os.path.exists(img_path):
            os.remove(img_path)
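Reviewer note: the save-then-delete sequence above can leave a truncated file at save_path if the process dies mid-write, after which the scrambled source may already be gone. A hedged hardening sketch, assuming a hypothetical helper name and a .tmp suffix that are not in the commit:

import os

def save_and_cleanup(newimage, save_path, img_path):
    # Write to a temporary name, then promote it atomically so a crash
    # mid-write never leaves a truncated file at save_path.
    tmp_path = save_path + ".tmp"
    newimage.save(tmp_path)
    os.replace(tmp_path, save_path)  # atomic on POSIX and Windows
    # Drop the scrambled source only once the decrypted copy is in place
    if os.path.exists(img_path):
        os.remove(img_path)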
@@ -16,6 +16,7 @@ from Comics._utils.utils import oldUtils
from Comics._utils.exporters import JsonExport, ItemExporter
from scrapy.pipelines.images import ImagesPipeline
from Comics._utils.ComicInfo import ComicInfoXml
from Comics._utils.downloader import download_images

class ComicsPipeline():
    '''
@@ -71,7 +72,8 @@ class ImgDownloadPipeline(BaseImagesPipeline):
        donwloaded_images = []
        for image_item in images_item:
            image_url, image_path = [image_item["image_url"], image_item["image_path"]]
-           if image_item["image_type"] == "Icon": image_path = super().get_file_path(item, result_type="icon_cache")
+           if image_item["image_type"] == "Icon":
+               image_path = super().get_file_path(item, result_type="icon_cache")
            is_next = not super().image_scramble_exits(item, image_path)
            # The image (including a still-scrambled copy) already exists
            if not is_next:
@@ -123,7 +125,7 @@ class ImgDownloadPipeline(BaseImagesPipeline):
        images_file = oldUtils().old_images(folder=chapter_dir)
        images_urls = ComicLoader(item=item).get_image_urls()
        # Validate that the data is consistent
-       # if len(images_file) != len(images_urls) or len(images_urls) == 0: return
+       if len(images_file) != len(images_urls) or len(images_urls) == 0: return
        super().update_icon(item)
        # Does the CBZ file already exist?
        if fu.exists(cbz_path):
@@ -150,6 +152,27 @@ class ImgDownloadPipeline(BaseImagesPipeline):
            item (_type_): Comic item data
            info (_type_): info
        """
        comic = ComicLoader(item=item)
        # Collect the images that still need to be parsed and downloaded
        images_item = comic.parse_images()
        donwloaded_images = []
        down_queue = []
        for image_item in images_item:
            image_url, image_path = [image_item["image_url"], image_item["image_path"]]
            if image_item["image_type"] == "Image":
                is_next = not super().image_scramble_exits(item, image_path)
                # The image (including a still-scrambled copy) already exists
                if not is_next:
                    logging.info(f"file exists: IMAGE_STORE {image_path}")
                    donwloaded_images.append(image_path)
                logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(donwloaded_images)}")
                # If every image is already present, skip straight to the next step (CBZ packing)
                # if len(donwloaded_images) == len(images_item):
                #     logging.info(f"len(donwloaded_images) == len(images_item)")
                #     self.download_done(item)
                if is_next:
                    # logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                    down_queue.append((image_url, os.path.join(IMAGES_STORE, image_path)))
        download_images(down_queue)
        # Retry if any image data is still undownloaded
        if self.download_validate(item):
            self.download_done(item)
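Reviewer note: every call inside download_image opens a fresh connection, and a chapter's images usually live on a single host. A shared requests.Session with a sized connection pool can avoid repeated TLS handshakes. A sketch under assumptions: the pool sizes below are chosen to match max_workers=20, and requests does not formally guarantee Session thread safety, though this pattern is in wide use.

import requests
from requests.adapters import HTTPAdapter

# Hypothetical module-level session shared by all worker threads
session = requests.Session()
adapter = HTTPAdapter(pool_connections=20, pool_maxsize=20)  # match max_workers
session.mount("https://", adapter)
session.mount("http://", adapter)

def fetch(url):
    # Reuses pooled connections instead of a new handshake per image
    response = session.get(url, timeout=10)
    response.raise_for_status()
    return response.content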