diff --git a/Comics/_utils/downloader.py b/Comics/_utils/downloader.py
new file mode 100644
index 0000000..7695304
--- /dev/null
+++ b/Comics/_utils/downloader.py
@@ -0,0 +1,76 @@
+import os
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from queue import Queue
+
+# Image download function
+def download_image(url, save_path, retry_queue, max_retries=3):
+    retries = 0
+    while retries < max_retries:
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()  # Raise if the request returned an error status
+            # Make sure the save directory exists
+            os.makedirs(os.path.dirname(save_path), exist_ok=True)
+            with open(save_path, "wb") as image_file:
+                image_file.write(response.content)
+            print(f"Download succeeded: {save_path}")
+            return True
+        except Exception as e:
+            retries += 1
+            print(f"Download failed: {url} error: {e} attempt: {retries}")
+    # Max retries reached, put the task into the retry queue
+    retry_queue.put((url, save_path))
+    return False
+
+# Multi-threaded download with retry on failure
+def download_images(urls_with_paths, max_workers=20, max_retries=3):
+    retry_queue = Queue()
+    # Download concurrently with a ThreadPoolExecutor
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all download tasks
+        future_to_url = {
+            executor.submit(download_image, url, save_path, retry_queue, max_retries): (url, save_path)
+            for url, save_path in urls_with_paths
+        }
+
+        # Monitor download progress with as_completed
+        for future in as_completed(future_to_url):
+            url, save_path = future_to_url[future]
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Download failed: {url} error: {e}")
+
+    # Handle tasks that failed to download
+    while not retry_queue.empty():
+        retry_tasks = []
+        while not retry_queue.empty():
+            retry_tasks.append(retry_queue.get())
+
+        # Retry the failed download tasks
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_url = {
+                executor.submit(download_image, url, save_path, retry_queue, max_retries): (url, save_path)
+                for url, save_path in retry_tasks
+            }
+
+            # Monitor download progress with as_completed
+            for future in as_completed(future_to_url):
+                url, save_path = future_to_url[future]
+                try:
+                    future.result()
+                except Exception as e:
+                    print(f"Download failed: {url} error: {e} (retry phase)")
+
+
+if __name__ == "__main__":
+    # Example URLs and their corresponding save paths
+    urls_with_custom_paths = [
+        ("https://example.com/image1.jpg", "./images/custom_name1.jpg"),
+        ("https://example.com/image2.jpg", "./images/folder1/custom_name2.jpg"),
+        ("https://example.com/image3.jpg", "./images/folder2/custom_name3.jpg"),
+    ]
+
+    # Start the multi-threaded download with retry on failure
+    download_images(urls_with_custom_paths, max_workers=20, max_retries=3)
\ No newline at end of file
diff --git a/Comics/_utils/utils.py b/Comics/_utils/utils.py
index aa3361f..3b6f537 100644
--- a/Comics/_utils/utils.py
+++ b/Comics/_utils/utils.py
@@ -466,7 +466,6 @@ class imageUtils:
 
             newh += b_h
         newimage.save(save_path)
-        time.sleep(0.1)
         logging.info(f"Descramble succeeded {img_path} {save_path}")
         if os.path.exists(img_path):
             os.remove(img_path)
diff --git a/Comics/pipelines.py b/Comics/pipelines.py
index a7b10e1..ba0bbb7 100644
--- a/Comics/pipelines.py
+++ b/Comics/pipelines.py
@@ -16,6 +16,7 @@ from Comics._utils.utils import oldUtils
 from Comics._utils.exporters import JsonExport,ItemExporter
 from scrapy.pipelines.images import ImagesPipeline
 from Comics._utils.ComicInfo import ComicInfoXml
+from Comics._utils.downloader import download_images
 
 class ComicsPipeline():
     '''
@@ -71,20 +72,21 @@ class ImgDownloadPipeline(BaseImagesPipeline):
         donwloaded_images = []
         for image_item in images_item:
             image_url, image_path = [ image_item["image_url"], image_item["image_path"]]
-            if image_item["image_type"] == "Icon": image_path = super().get_file_path(item, result_type="icon_cache")
-            is_next = not super().image_scramble_exits(item, image_path)
-            # The image (including the scrambled version) already exists
-            if not is_next:
-                logging.info(f"file exists: IMAGE_STORE {image_path}")
-                donwloaded_images.append(image_path)
-            logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(donwloaded_images)}")
-            # If all images are downloaded, skip straight to the next step (packing the CBZ)
-            if len(donwloaded_images) == len(images_item):
-                logging.info(f"len(donwloaded_images) == len(images_item)")
-                self.download_done(item)
-            if is_next:
-                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
-                yield scrapy.Request(url=image_url, meta={'path': image_path })
+            if image_item["image_type"] == "Icon":
+                image_path = super().get_file_path(item, result_type="icon_cache")
+            is_next = not super().image_scramble_exits(item, image_path)
+            # The image (including the scrambled version) already exists
+            if not is_next:
+                logging.info(f"file exists: IMAGE_STORE {image_path}")
+                donwloaded_images.append(image_path)
+            logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(donwloaded_images)}")
+            # If all images are downloaded, skip straight to the next step (packing the CBZ)
+            if len(donwloaded_images) == len(images_item):
+                logging.info(f"len(donwloaded_images) == len(images_item)")
+                self.download_done(item)
+            if is_next:
+                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
+                yield scrapy.Request(url=image_url, meta={'path': image_path })
 
     # Pack the CBZ cover
     def pack_icon(self, item):
@@ -123,7 +125,7 @@ class ImgDownloadPipeline(BaseImagesPipeline):
         images_file = oldUtils().old_images(folder=chapter_dir)
         images_urls = ComicLoader(item=item).get_image_urls()
         # Verify that the data is correct
-        # if len(images_file) != len(images_urls) or len(images_urls) == 0: return
+        if len(images_file) != len(images_urls) or len(images_urls) == 0: return
         super().update_icon(item)
         # Check whether the CBZ file already exists
         if fu.exists(cbz_path):
@@ -150,6 +152,27 @@ class ImgDownloadPipeline(BaseImagesPipeline):
             item (_type_): Comic item data
             info (_type_): information
         """
+        comic = ComicLoader(item=item)
+        # Get the images that need to be parsed and downloaded
+        images_item = comic.parse_images()
+        donwloaded_images = []
+        down_queue = []
+        for image_item in images_item:
+            image_url, image_path = [ image_item["image_url"], image_item["image_path"]]
+            if image_item["image_type"] == "Image":
+                is_next = not super().image_scramble_exits(item, image_path)
+                # The image (including the scrambled version) already exists
+                if not is_next:
+                    logging.info(f"file exists: IMAGE_STORE {image_path}")
+                    donwloaded_images.append(image_path)
+                logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(donwloaded_images)}")
+                # If all images are downloaded, skip straight to the next step (packing the CBZ)
+                # if len(donwloaded_images) == len(images_item):
+                #     logging.info(f"len(donwloaded_images) == len(images_item)")
+                #     self.download_done(item)
+                if is_next:
+                    # logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
+                    down_queue.append((image_url, os.path.join(IMAGES_STORE, image_path)))
+        download_images(down_queue)
         # Retry if there are still images that have not been downloaded
-        if self.download_validate(item):
-            self.download_done(item)
\ No newline at end of file
+        self.download_done(item)
\ No newline at end of file
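
For reference, the pipeline change above drives the new Comics._utils.downloader.download_images helper with a list of (image_url, save_path) tuples, the same shape as the down_queue it builds. A minimal standalone sketch follows; the URLs and save paths are illustrative only, not from the project:

from Comics._utils.downloader import download_images

# Illustrative queue of (image_url, save_path) tuples, mirroring how the
# pipeline assembles down_queue before calling download_images(down_queue).
down_queue = [
    ("https://example.com/chapter_01/page_001.jpg", "./images/comic/chapter_01/page_001.jpg"),
    ("https://example.com/chapter_01/page_002.jpg", "./images/comic/chapter_01/page_002.jpg"),
]

# Downloads run on a thread pool; tasks that exhaust max_retries are put on a
# retry queue and attempted again in a follow-up round.
download_images(down_queue, max_workers=20, max_retries=3)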