caiwx86 2024-11-14 18:52:33 +08:00
parent 2ed9aba6dd
commit be1e963cb7
3 changed files with 116 additions and 18 deletions

View File

@@ -0,0 +1,76 @@
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
# Download a single image, retrying up to max_retries times before
# handing the task over to the shared retry queue
def download_image(url, save_path, retry_queue, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # raise on HTTP error status
            # make sure the target directory exists (skip for bare filenames)
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            with open(save_path, "wb") as image_file:
                image_file.write(response.content)
            print(f"Downloaded: {save_path}")
            return True
        except Exception as e:
            retries += 1
            print(f"Download failed: {url} error: {e} attempt: {retries}")
    # max retries exhausted: defer the task to the retry queue
    retry_queue.put((url, save_path))
    return False
# Multi-threaded download with retry on failure
def download_images(urls_with_paths, max_workers=20, max_retries=3):
    retry_queue = Queue()
    # download concurrently with a ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # submit every task
        future_to_url = {
            executor.submit(download_image, url, save_path, retry_queue, max_retries): (url, save_path)
            for url, save_path in urls_with_paths
        }
        # track progress as downloads complete
        for future in as_completed(future_to_url):
            url, save_path = future_to_url[future]
            try:
                future.result()
            except Exception as e:
                print(f"Download failed: {url} error: {e}")
    # Retry failed tasks; bound the number of rounds so a permanently
    # failing URL cannot keep this loop spinning forever
    retry_rounds = 0
    while not retry_queue.empty() and retry_rounds < max_retries:
        retry_rounds += 1
        retry_tasks = []
        while not retry_queue.empty():
            retry_tasks.append(retry_queue.get())
        # retry the failed downloads
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(download_image, url, save_path, retry_queue, max_retries): (url, save_path)
                for url, save_path in retry_tasks
            }
            # track progress as retries complete
            for future in as_completed(future_to_url):
                url, save_path = future_to_url[future]
                try:
                    future.result()
                except Exception as e:
                    print(f"Download failed: {url} error: {e} (retry phase)")
    if not retry_queue.empty():
        print(f"Giving up on {retry_queue.qsize()} download(s) after {max_retries} retry round(s)")
if __name__ == "__main__":
    # example URLs and their target save paths
    urls_with_custom_paths = [
        ("https://example.com/image1.jpg", "./images/custom_name1.jpg"),
        ("https://example.com/image2.jpg", "./images/folder1/custom_name2.jpg"),
        ("https://example.com/image3.jpg", "./images/folder2/custom_name3.jpg"),
    ]
    # start the multi-threaded download with failure retry
    download_images(urls_with_custom_paths, max_workers=20, max_retries=3)
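
A quick way to exercise the module end to end, as a minimal sketch: serve a temporary directory with Python's built-in http.server and point download_images at it. The module name downloader and every path below are assumptions for illustration, not part of the commit.

import http.server
import os
import socketserver
import tempfile
import threading
from functools import partial

from downloader import download_images  # assumed module name for the file above

with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:
    # create a fake "image" to serve
    with open(os.path.join(src, "a.jpg"), "wb") as f:
        f.write(b"fake image bytes")
    handler = partial(http.server.SimpleHTTPRequestHandler, directory=src)
    with socketserver.TCPServer(("127.0.0.1", 0), handler) as httpd:
        threading.Thread(target=httpd.serve_forever, daemon=True).start()
        port = httpd.server_address[1]
        download_images([(f"http://127.0.0.1:{port}/a.jpg", os.path.join(dst, "a.jpg"))], max_workers=2)
        httpd.shutdown()
    assert os.path.getsize(os.path.join(dst, "a.jpg")) > 0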

View File

@@ -466,7 +466,6 @@ class imageUtils:
        newh += b_h
        newimage.save(save_path)
        time.sleep(0.1)
-       logging.info(f"decrypted successfully {img_path} {save_path}")
        if os.path.exists(img_path):
            os.remove(img_path)

View File

@@ -16,6 +16,7 @@ from Comics._utils.utils import oldUtils
from Comics._utils.exporters import JsonExport, ItemExporter
from scrapy.pipelines.images import ImagesPipeline
from Comics._utils.ComicInfo import ComicInfoXml
from Comics._utils.downloader import download_images
class ComicsPipeline():
'''
@@ -71,7 +72,8 @@ class ImgDownloadPipeline(BaseImagesPipeline):
        donwloaded_images = []
        for image_item in images_item:
            image_url, image_path = [image_item["image_url"], image_item["image_path"]]
-           if image_item["image_type"] == "Icon": image_path = super().get_file_path(item, result_type="icon_cache")
+           if image_item["image_type"] == "Icon":
+               image_path = super().get_file_path(item, result_type="icon_cache")
            is_next = not super().image_scramble_exits(item, image_path)
            # the image (or its scrambled variant) already exists
            if not is_next:
@@ -123,7 +125,7 @@ class ImgDownloadPipeline(BaseImagesPipeline):
        images_file = oldUtils().old_images(folder=chapter_dir)
        images_urls = ComicLoader(item=item).get_image_urls()
        # validate the data is consistent
-       # if len(images_file) != len(images_urls) or len(images_urls) == 0: return
+       if len(images_file) != len(images_urls) or len(images_urls) == 0: return
        super().update_icon(item)
        # has the CBZ file already been created?
        if fu.exists(cbz_path):
@@ -150,6 +152,27 @@ class ImgDownloadPipeline(BaseImagesPipeline):
            item (_type_): Comic item data
            info (_type_): info
        """
        comic = ComicLoader(item=item)
        # collect the images that need to be parsed and downloaded
        images_item = comic.parse_images()
        downloaded_images = []
        down_queue = []
        for image_item in images_item:
            image_url, image_path = [image_item["image_url"], image_item["image_path"]]
            if image_item["image_type"] == "Image":
                is_next = not super().image_scramble_exits(item, image_path)
                # the image (or its scrambled variant) already exists
                if not is_next:
                    logging.info(f"file exists: IMAGE_STORE {image_path}")
                    downloaded_images.append(image_path)
                logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(downloaded_images)}")
                # if every image is already downloaded, skip straight to packing the CBZ
                # if len(downloaded_images) == len(images_item):
                #     logging.info(f"len(downloaded_images) == len(images_item)")
                #     self.download_done(item)
                if is_next:
                    # logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                    down_queue.append((image_url, os.path.join(IMAGES_STORE, image_path)))
        download_images(down_queue)
        # retry while some images remain undownloaded
        if self.download_validate(item):
            self.download_done(item)
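
Taken together, the new pipeline block is a queue-then-batch pattern: walk the parsed image list, skip what already exists on disk, collect (url, absolute_path) tuples, and hand the whole batch to download_images in one call instead of issuing per-image requests. A standalone sketch of that flow, with a hypothetical store root and image list, and the pipeline's existence check (image_scramble_exits) replaced by a plain os.path.exists stub:

import os

from Comics._utils.downloader import download_images  # the module added in this commit

IMAGES_STORE = "./images_store"  # hypothetical store root

def already_downloaded(rel_path):
    # stand-in for the pipeline's image_scramble_exits check
    return os.path.exists(os.path.join(IMAGES_STORE, rel_path))

# hypothetical parse result, mirroring images_item in the pipeline
images_item = [
    {"image_type": "Image", "image_url": "https://example.com/p1.jpg", "image_path": "comic/ch1/p1.jpg"},
    {"image_type": "Image", "image_url": "https://example.com/p2.jpg", "image_path": "comic/ch1/p2.jpg"},
]

down_queue = [
    (it["image_url"], os.path.join(IMAGES_STORE, it["image_path"]))
    for it in images_item
    if it["image_type"] == "Image" and not already_downloaded(it["image_path"])
]
download_images(down_queue)  # one batched call for everything still missing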