# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import logging
import os
import shutil

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Comics import settings
from Comics.settings import IMAGES_STORE
from Comics._utils.items import ComicItem
from Comics._utils.loader import ComicLoader
from Comics._utils.utils import CBZUtils, fileUtils as fu
from Comics._utils.utils import ComicPath
from Comics._utils.utils import oldUtils
from Comics._utils.exporters import JsonExport, ItemExporter
from Comics._utils.ComicInfo import ComicInfoXml
from Comics._utils.downloader import download_images


class ComicsPipeline:

    def remove_min_file(self, file_path, min_size=500 * 1024):
        """Remove files smaller than min_size (default 500 KB); they are treated as broken downloads."""
        try:
            # os.path.getsize() returns the file size in bytes
            file_size = os.path.getsize(file_path)
            if file_size < min_size:
                os.remove(file_path)
                logging.info(f"Removed broken file: {file_path}")
        except Exception as e:
            logging.warning(f"Failed to get file size: {e}")

    # `item` is the object yielded by the spider
    def process_item(self, item: ComicItem, spider):
        """Parse the item passed in from the spider and return it after serialization."""
        if isinstance(item, ComicItem):
            result_item = None
            # The comic CBZ file already exists: export the item object
            if fu.exists(ComicPath(item).PATH_CBZ()):
                result_item = ItemExporter().export_obj(item)
            # The comic CBZ file does not exist yet: serialize the item to JSON
            else:
                result_item = JsonExport(file=ComicPath(item).getDirJosnComicChapter()).export_json(
                    ComicLoader(item).load_item(), if_return=True)
            cbz_path = ComicPath(item=item).PATH_CBZ()
            if not fu.exists(cbz_path):
                return result_item
            else:
                self.remove_min_file(cbz_path)
        return None


class BaseImagesPipeline(ImagesPipeline):

    def image_scramble_exits(self, item, image_path):
        en_image_path = ComicPath(item).getFileScrambleImageSave(image_path, relative="fullpath")
        return fu.exists(fu.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))

    def get_file_path(self, item, file=None, result_type="image"):
        return ComicPath(item).file_path(file=file, result_type=result_type)

    # Save path for the downloaded file, taken from the request meta
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['path']

    # Update the cover if needed
    def update_icon(self, item):
        # Cached cover path after download
        image_path = self.get_file_path(item, result_type="down_cache_icon")
        # Final cover save path
        save_path = self.get_file_path(item=item, result_type="down_icon")
        fu.update_icon(image_path, save_path)

    def success_completed(self, item, results):
        is_success = False
        fail_data = []
        for result in results:
            # Collect failed downloads
            if not result[0]:
                fail_data.append(result[1])
        if len(fail_data) == 0 and len(results) != 0:
            is_success = True
        return is_success


class ImgDownloadPipeline(BaseImagesPipeline):

    def get_media_requests(self, item, info):
        comic = ComicLoader(item=item)
        # Images that need to be parsed and downloaded
        images_item = comic.parse_images()
        downloaded_images = []
        for image_item in images_item:
            image_url, image_path = image_item["image_url"], image_item["image_path"]
            if image_item["image_type"] == "Icon":
                image_path = super().get_file_path(item, result_type="icon_cache")
            # The image (including its scrambled variant) already exists
            # if super().image_scramble_exits(item, image_path):
            if os.path.exists(image_path):
                logging.info(f"file exists: IMAGE_STORE {image_path}")
                downloaded_images.append(image_path)
                logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(downloaded_images)}")
            else:
                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    # Pack the CBZ cover
    def pack_icon(self, item):
        cbz_icon = super().get_file_path(item=item, result_type="cbz_icon")
        dwn_icon = super().get_file_path(item=item, result_type="down_icon")
        base_dir = fu.dirname(dwn_icon)
        name = fu.basename(dwn_icon).split(".")[0]
        for dirname in os.listdir(base_dir):
            path = fu.join(base_dir, dirname)
            if os.path.isfile(path) and dirname.startswith(name):
                fu.update_icon(path, cbz_icon)

    def download_validate(self, item):
        """Return True if every image of the item has been downloaded.

        Args:
            item: the comic item to validate.

        Returns:
            bool: True when all images exist on disk, otherwise False.
        """
        comic = ComicLoader(item=item)
        # Images that need to be parsed and downloaded
        images_item = comic.parse_images()
        exist_images = []
        for image_item in images_item:
            image_path = os.path.join(IMAGES_STORE, image_item["image_path"])
            if image_item["image_type"] == "Image" and os.path.exists(image_path):
                exist_images.append(image_path)
        if len(comic.get_image_urls()) == len(exist_images) and len(exist_images) != 0:
            return True
        return False

    def download_done(self, item):
        # super().update_icon(item)
        cbz_path = super().get_file_path(item, result_type="cbz")
        chapter_dir = ComicPath(item=item).file_path(result_type=ComicPath().MAPPING_IMAGES_DIR)
        # images_file = oldUtils().old_images(folder=chapter_dir)
        # images_urls = ComicLoader(item=item).get_image_urls()
        # Validate that the data is complete
        # if len(images_file) != len(images_urls) or len(images_urls) == 0:
        #     return super().update_icon(item)

        # The CBZ file already exists
        if fu.exists(cbz_path):
            logging.info(f"file exists {cbz_path}")
            # self.update_icon(item)
            chapter = os.path.basename(cbz_path).split(".")[0]
            if os.path.exists(chapter_dir) and chapter in chapter_dir:
                logging.info(f"Removing leftover chapter directory {chapter_dir}")
                shutil.rmtree(chapter_dir)
            self.pack_icon(item)
        else:
            # Generate ComicInfo.xml, then pack the chapter directory into a CBZ
            ComicInfoXml().scrapy_xml_by_json(item, save_dir=chapter_dir)
            if CBZUtils.comicChapterCBZPack(src_dir=chapter_dir, dts_path=cbz_path, remove=True):
                super().update_icon(item)
                self.pack_icon(item)

    def down_image(self, item):
        """Download any images that are still missing so CBZ packing can start.

        No further completeness check is done here; CBZUtils validates the
        data later. Only the image paths need to be passed on.

        Args:
            item: the comic item whose images should be downloaded
                (already downloaded images are skipped).
        """
        comic = ComicLoader(item=item)
        # Images that need to be parsed and downloaded
        images_item = comic.parse_images()
        downloaded_images = []
        down_queue = []
        for image_item in images_item:
            image_url, image_path = image_item["image_url"], image_item["image_path"]
            if image_item["image_type"] == "Image":
                is_next = not super().image_scramble_exits(item, image_path)
                # The image (including its scrambled variant) already exists
                if not is_next:
                    logging.info(f"file exists: IMAGE_STORE {image_path}")
                    downloaded_images.append(image_path)
                    logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(downloaded_images)}")
                # If every image is already downloaded, skip straight to the next step (CBZ packing)
                # if len(downloaded_images) == len(images_item):
                #     logging.info(f"len(downloaded_images) == len(images_item)")
                #     self.download_done(item)
                if is_next:
                    # logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                    down_queue.append((image_url, os.path.join(IMAGES_STORE, image_path)))
        if len(down_queue) > 0:
            download_images(down_queue, max_retries=3)

    def item_completed(self, results, item, info):
        cbz_path = super().get_file_path(item, result_type="cbz")
        if not fu.exists(cbz_path):
            # Retry any images that are still missing
            self.down_image(item)
        self.download_done(item)
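

# Registration sketch: these pipelines only run when enabled in ITEM_PIPELINES
# in settings.py. The module path assumes this file is Comics/pipelines.py and
# the priorities are illustrative assumptions, not values taken from this
# project's settings:
#
# ITEM_PIPELINES = {
#     "Comics.pipelines.ImgDownloadPipeline": 300,
#     "Comics.pipelines.ComicsPipeline": 400,
# }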