# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
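
# A minimal sketch of the corresponding ITEM_PIPELINES entry for settings.py
# (module path and priorities are assumptions; adjust to the project's
# actual layout):
#
#   ITEM_PIPELINES = {
#       "Comics.pipelines.ImgDownloadPipeline": 200,
#       "Comics.pipelines.ComicsPipeline": 300,
#   }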

import logging
import os
import shutil

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Comics import settings
from Comics.settings import IMAGES_STORE
from Comics._utils.items import ComicItem
from Comics._utils.loader import ComicLoader
from Comics._utils.utils import CBZUtils, ComicPath, oldUtils, fileUtils as fu
from Comics._utils.exporters import JsonExport, ItemExporter
from Comics._utils.ComicInfo import ComicInfoXml
from Comics._utils.downloader import download_images

class ComicsPipeline():
    '''
    Parse the item data yielded by the spider
    and export it after serialization.
    '''

    # `item` is the object the spider yields
    def process_item(self, item: ComicItem, spider):
        if isinstance(item, ComicItem):
            result_item = None
            # The comic's CBZ file already exists: export the item object directly
            if fu.exists(ComicPath(item).PATH_CBZ()):
                result_item = ItemExporter().export_obj(item)
            # No CBZ file yet: serialize the loaded item to the chapter JSON file
            else:
                result_item = JsonExport(file=ComicPath(item).getDirJosnComicChapter()).export_json(
                    ComicLoader(item).load_item(), if_return=True)
            if not fu.exists(ComicPath(item=item).PATH_CBZ()):
                return result_item
        return None
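
# Note: Scrapy expects process_item to return the item (or raise
# scrapy.exceptions.DropItem). Returning None as above means any pipeline
# running after this one receives None for comics whose CBZ already exists.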


class BaseImagesPipeline(ImagesPipeline):

    def image_scramble_exists(self, item, image_path):
        # Path of the scrambled (encrypted) variant of the image
        en_image_path = ComicPath(item).getFileScrambleImageSave(image_path, relative="fullpath")
        return fu.exists(fu.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))

    def get_file_path(self, item, file=None, result_type="image"):
        return ComicPath(item).file_path(file=file, result_type=result_type)

    # Storage path for a downloaded file: reuse the path the spider put into
    # the request meta instead of the checksum-based default of ImagesPipeline
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['path']

    # Refresh the cover if it needs updating
    def update_icon(self, item):
        # Cached cover path right after download
        image_path = self.get_file_path(item, result_type="down_cache_icon")
        # Final cover save path
        save_path = self.get_file_path(item=item, result_type="down_icon")
        fu.update_icon(image_path, save_path)

    def success_completed(self, item, results):
        is_success = False
        fail_data = []
        for result in results:
            # Collect the failed downloads
            if not result[0]:
                fail_data.append(result[1])
        if len(fail_data) == 0 and len(results) != 0:
            is_success = True
        return is_success
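
    # The `results` list that ImagesPipeline passes into item_completed is a
    # list of 2-tuples (success, info): on success `info` is a dict with
    # 'url', 'path' and 'checksum' keys; on failure it is a Twisted Failure.
    # A minimal sketch of what success_completed consumes:
    #
    #   results = [
    #       (True, {'url': 'https://...', 'path': '0001.jpg', 'checksum': '...'}),
    #       (False, Failure(...)),
    #   ]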


class ImgDownloadPipeline(BaseImagesPipeline):

    def get_media_requests(self, item, info):
        comic = ComicLoader(item=item)
        # Images that still need to be parsed and downloaded
        images_item = comic.parse_images()
        downloaded_images = []
        for image_item in images_item:
            image_url, image_path = image_item["image_url"], image_item["image_path"]
            if image_item["image_type"] == "Icon":
                image_path = super().get_file_path(item, result_type="icon_cache")
            # The image (including its scrambled variant) already exists
            # if super().image_scramble_exists(item, image_path):
            if os.path.exists(os.path.join(IMAGES_STORE, image_path)):
                logging.info(f"file exists: IMAGE_STORE {image_path}")
                downloaded_images.append(image_path)
                logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(downloaded_images)}")
            else:
                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    # Pack the cover image into the CBZ
    def pack_icon(self, item):
        cbz_icon = super().get_file_path(item=item, result_type="cbz_icon")
        dwn_icon = super().get_file_path(item=item, result_type="down_icon")
        base_dir = fu.dirname(dwn_icon)
        name = fu.basename(dwn_icon).split(".")[0]
        # Promote every file in the cover directory that shares the downloaded
        # cover's basename, whatever its extension, to the CBZ cover
        for dirname in os.listdir(base_dir):
            path = fu.join(base_dir, dirname)
            if os.path.isfile(path) and dirname.startswith(name):
                fu.update_icon(path, cbz_icon)

    def download_validate(self, item):
        """Return True when the downloaded images are complete.

        Args:
            item: Comic item carrying the chapter's image data.

        Returns:
            bool: True if every image URL in the item has a file on disk.
        """
        comic = ComicLoader(item=item)
        # Images that need to be parsed and downloaded
        images_item = comic.parse_images()
        exist_images = []
        for image_item in images_item:
            image_path = os.path.join(IMAGES_STORE, image_item["image_path"])
            if image_item["image_type"] == "Image" and os.path.exists(image_path):
                exist_images.append(image_path)
        if len(comic.get_image_urls()) == len(exist_images) and len(exist_images) != 0:
            return True
        return False
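
    # A possible call site for download_validate (a sketch; nothing in this
    # file wires it up): gate the packing step on image completeness, e.g.
    #
    #   if self.download_validate(item):
    #       self.download_done(item)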

    def download_done(self, item):
        # super().update_icon(item)
        cbz_path = super().get_file_path(item, result_type="cbz")
        chapter_dir = ComicPath(item=item).file_path(result_type=ComicPath().MAPPING_IMAGES_DIR)
        # images_file = oldUtils().old_images(folder=chapter_dir)
        # images_urls = ComicLoader(item=item).get_image_urls()
        # Validate that the data is complete
        # if len(images_file) != len(images_urls) or len(images_urls) == 0: return
        super().update_icon(item)
        # Does the CBZ file already exist?
        if fu.exists(cbz_path):
            logging.info(f"file exists {cbz_path}")
            # self.update_icon(item)
            chapter = os.path.basename(cbz_path).split(".")[0]
            if os.path.exists(chapter_dir) and chapter in chapter_dir:
                logging.info(f"removing leftover chapter folder {chapter_dir}")
                shutil.rmtree(chapter_dir)
            self.pack_icon(item)
        else:
            # Generate the ComicInfo.xml metadata file
            ComicInfoXml().scrapy_xml_by_json(item, save_dir=chapter_dir)
            # Pack the chapter folder into a CBZ, then refresh the covers
            if CBZUtils.comicChapterCBZPack(src_dir=chapter_dir, dts_path=cbz_path, remove=True):
                super().update_icon(item)
                self.pack_icon(item)

    def down_image(self, item):
        """Images are downloaded; start CBZ packing.

        No further integrity check is done here; CBZUtils validates the data
        later on its own. Already-downloaded images are skipped, so only the
        missing image paths are queued for download.

        Args:
            item: Comic item data.
        """
        comic = ComicLoader(item=item)
        # Images that need to be parsed and downloaded
        images_item = comic.parse_images()
        downloaded_images = []
        down_queue = []  # (image_url, absolute save path) tuples
        for image_item in images_item:
            image_url, image_path = image_item["image_url"], image_item["image_path"]
            if image_item["image_type"] == "Image":
                is_next = not super().image_scramble_exists(item, image_path)
                # The image (including its scrambled variant) already exists
                if not is_next:
                    logging.info(f"file exists: IMAGE_STORE {image_path}")
                    downloaded_images.append(image_path)
                    logging.info(f"images count= {len(images_item)} downloaded_images_count= {len(downloaded_images)}")
                # If every image is already downloaded, skip straight to CBZ packing
                # if len(downloaded_images) == len(images_item):
                #     logging.info("len(downloaded_images) == len(images_item)")
                #     self.download_done(item)
                if is_next:
                    # logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                    down_queue.append((image_url, os.path.join(IMAGES_STORE, image_path)))
        if len(down_queue) > 0:
            download_images(down_queue, max_retries=3)
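
    # Each down_queue entry is an (image_url, absolute_save_path) tuple. The
    # project's download_images helper (Comics._utils.downloader) is assumed
    # to fetch every URL to its path, retrying up to max_retries times each.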

    def item_completed(self, results, item, info):
        cbz_path = super().get_file_path(item, result_type="cbz")
        if not fu.exists(cbz_path):
            # Some images may still be missing: retry them outside Scrapy
            self.down_image(item)
        self.download_done(item)
        # Return the item so later pipelines receive it (Scrapy convention)
        return item
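
# Overall flow, as implemented above: Scrapy calls get_media_requests to
# schedule the missing images (the file_path override stores each one at the
# path carried in request.meta), then item_completed re-downloads any
# stragglers via down_image and finally packs the chapter into a CBZ through
# download_done.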