# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import logging
import os
import shutil

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Comics import settings
from Comics._utils.ComicInfo import ComicInfoXml
from Comics._utils.exporters import JsonExport, ItemExporter
from Comics._utils.items import ComicItem
from Comics._utils.loader import ComicLoader
from Comics._utils.utils import CBZUtils, ComicPath, fileUtils as fu, oldUtils


class ComicsPipeline:
    """Parse the item data passed in from the spider and serialize it for output."""

    # `item` is the object yielded by the spider
    def process_item(self, item: ComicItem, spider):
        if isinstance(item, ComicItem):
            # The comic's CBZ file already exists: export the item directly
            if fu.exists(ComicPath(item).PATH_CBZ()):
                result_item = ItemExporter().export_obj(item)
            # No CBZ file yet: serialize the loaded item to JSON
            else:
                result_item = JsonExport(
                    file=ComicPath(item).getDirJosnComicChapter()
                ).export_json(ComicLoader(item).load_item(), if_return=True)
            return result_item


class BaseImagesPipeline(ImagesPipeline):

    def image_scramble_exists(self, item, image_path):
        # Path of the scrambled (encrypted) copy of the image
        en_image_path = ComicPath(item).getFileScrambleImageSave(image_path, relative="fullpath")
        return fu.exists(fu.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))

    def get_file_path(self, item, file=None, result_type="image"):
        return ComicPath(item).file_path(file=file, result_type=result_type)

    # Storage path for a download: reuse the path stashed in request.meta
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['path']

    # Refresh the cover if needed
    def update_icon(self, item):
        # Cached cover path right after download
        image_path = self.get_file_path(item, result_type="down_cache_icon")
        # Final cover save path
        save_path = self.get_file_path(item=item, result_type="down_icon")
        fu.update_icon(image_path, save_path)

    def success_completed(self, item, results):
        is_success = False
        fail_data = []
        for result in results:
            # Collect failed downloads
            if not result[0]:
                fail_data.append(result[1])
        if len(fail_data) == 0 and len(results) != 0:
            is_success = True
        return is_success
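
# For reference: Scrapy's ImagesPipeline passes `results` to item_completed()
# as a list of (success, detail) 2-tuples, roughly:
#
#     [(True, {"url": "...", "path": "...", "checksum": "...", "status": "downloaded"}),
#      (False, Failure(...))]
#
# success_completed() above relies on exactly that shape: result[0] is the
# success flag and result[1] carries the detail (file info on success, the
# failure on error). The literal values shown here are illustrative only.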


class ImgDownloadPipeline(BaseImagesPipeline):

    def get_media_requests(self, item, info):
        comic = ComicLoader(item=item)
        # Images that still need to be parsed and downloaded
        images_item = comic.parse_images()
        for image_item in images_item:
            image_url, image_path = image_item["image_url"], image_item["image_path"]
            if image_item["image_type"] == "Icon":
                image_path = super().get_file_path(item, result_type="icon_cache")
                if fu.exists(image_path):
                    # Cover already cached: skip it but keep processing the pages
                    continue
            # The image (or its scrambled copy) already exists
            if super().image_scramble_exists(item, image_path):
                logging.info(f"file exists: IMAGE_STORE {image_path}")
            else:
                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    # Pack the cover into the CBZ
    def pack_icon(self, item):
        cbz_icon = super().get_file_path(item=item, result_type="cbz_icon")
        dwn_icon = super().get_file_path(item=item, result_type="down_icon")
        base_dir = fu.dirname(dwn_icon)
        name = fu.basename(dwn_icon).split(".")[0]
        for dirname in os.listdir(base_dir):
            path = fu.join(base_dir, dirname)
            if os.path.isfile(path) and dirname.startswith(name):
                fu.update_icon(path, cbz_icon)

    def item_completed(self, results, item, info):
        """Images finished downloading; start packing the CBZ.

        No further integrity check is done here; CBZUtils validates the
        data automatically later, so passing the image paths is enough.

        Args:
            results: images downloaded in this run (already-downloaded ones are skipped)
            item: the Comic item data
            info: pipeline info
        """
        # Some images failed to download: return so they can be retried
        if not super().success_completed(item, results):
            return
        # super().update_icon(item)
        cbz_path = super().get_file_path(item, result_type="cbz")
        chapter_dir = ComicPath(item=item).file_path(result_type=ComicPath().MAPPING_IMAGES_DIR)
        images_file = oldUtils().old_images(folder=chapter_dir)
        images_urls = ComicLoader(item=item).get_image_urls()
        # Verify the data is complete
        if len(images_file) != len(images_urls) or len(images_urls) == 0:
            return
        super().update_icon(item)
        # Does the CBZ file already exist?
        if fu.exists(cbz_path):
            # self.update_icon(item)
            chapter = os.path.basename(cbz_path).split(".")[0]
            if os.path.exists(chapter_dir) and chapter in chapter_dir:
                print(f"Removing leftover chapter directory {chapter_dir}")
                shutil.rmtree(chapter_dir)
            self.pack_icon(item)
        else:
            # Generate ComicInfo.xml
            ComicInfoXml().scrapy_xml_by_json(item, save_dir=chapter_dir)
            if CBZUtils.packComicChapterCBZ(src_dir=chapter_dir, dts_path=cbz_path,
                                            comic_info_images=images_urls, remove=False):
                super().update_icon(item)
                self.pack_icon(item)
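

# A minimal registration sketch, assuming this module lives at
# Comics/pipelines.py; the module path and priority numbers below are
# assumptions, not taken from the project's settings:
#
#     ITEM_PIPELINES = {
#         "Comics.pipelines.ImgDownloadPipeline": 300,
#         "Comics.pipelines.ComicsPipeline": 400,
#     }
#
# Lower numbers run first, and IMAGES_STORE must also be set for
# ImagesPipeline subclasses to store anything.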