# ComicScrapy/Comics/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import logging
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Comics import settings
from Comics.items import ComicItem
from Comics.settings import OUTPUT_DIR
from Comics.loader import ComicEntity, ComicLoader
from Comics.exporters import ComicInfoXmlItemExporter, JsonExport, ItemExporter
from Comics.utils import CBZUtils, fileUtils as fu
from Comics.utils import ComicPath
from Comics.utils import checkUtils
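
# For reference, these pipelines take effect only once enabled in settings.py
# via ITEM_PIPELINES (see the docs link above). A minimal sketch; the priority
# numbers below are illustrative, not this project's actual configuration:
#
#   ITEM_PIPELINES = {
#       "Comics.pipelines.ImgDownloadPipeline": 1,
#       "Comics.pipelines.ComicsPipeline": 300,
#   }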

class ComicsPipeline:
    def open_spider(self, spider):
        pass

    # `item` is the object yielded by the spider
    def process_item(self, item, spider):
        if isinstance(item, ComicItem):
            # item = ComicEntity(item).item()
            # e.g. 'output/rm_comic/json/壞X/第1話 壞X'
            if fu.exists(ComicPath.path_cbz(item=item)):
                # the chapter's CBZ file already exists: run the conversion/export
                return ItemExporter().export_obj(item)
            else:
                # no CBZ file yet: export the chapter metadata as JSON
                # file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
                return JsonExport(file=ComicPath.getDirJosnComicChapter(item)).export_json(
                    ComicEntity(item).item(), if_return=True)

    def close_spider(self, spider):
        pass

# image parsing
class ImgDownloadPipeline(ImagesPipeline):
    def get_file_path(self, item, file=None, result_type="image"):
        return ComicPath().get_file_path(item=item, file=file, result_type=result_type)

    # check whether the descrambled image already exists under IMAGES_STORE
    def image_scramble_exists(self, item, image_path):
        en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
        return fu.exists(fu.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))

    # Icon path: CBZ/NAME/CHAPTER.jpg
    def download_icon(self, item, result_type="download"):
        icon_path = self.get_file_path(item, result_type="icon_cache")
        if result_type == "fullpath":
            return fu.join(settings.IMAGES_STORE, icon_path)
        if fu.exists(icon_path):
            return False
        # queue the cover for download alongside the page images
        self.image_urls.append(item['icon'])
        self.images.append(icon_path)
        return True
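
    # ImagesPipeline consults file_path() to decide where each downloaded file
    # lands under IMAGES_STORE; here we simply reuse the path computed in
    # get_media_requests() and carried on the request as meta['path'].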
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['path']

    def get_media_requests(self, item, info):
        comic = ComicEntity(item)
        self.image_urls = comic.image_urls()
        self.images = comic.images()
        # queue the cover download
        self.download_icon(item)
        for image_url, image in zip(self.image_urls, self.images):
            is_down = True
            image_path = self.get_file_path(item, image)
            if self.image_scramble_exists(item, image_path):
                if image_path == self.get_file_path(item, result_type="icon_cache"):
                    logging.info(f"icon file exists: IMAGE_STORE {image_path}")
                else:
                    is_down = False
                    logging.info(f"file exists: IMAGE_STORE {image_path}")
            if is_down:
                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    # pack the cover for the CBZ
    def pack_icon(self, item):
        cbz_icon = self.get_file_path(item=item, result_type="cbz_icon")
        dwn_icon = self.get_file_path(item=item, result_type="down_icon")
        base_dir = fu.dirname(dwn_icon)
        name = fu.basename(dwn_icon).split(".")[0]
        # copy any downloaded file whose name starts with the icon's base name
        for dirname in os.listdir(base_dir):
            path = fu.join(base_dir, dirname)
            if os.path.isfile(path) and dirname.startswith(name):
                fu.update_icon(path, cbz_icon)

    # decide whether the cover needs updating
    def update_icon(self, item):
        # cached cover path produced by the download
        image_path = self.get_file_path(item, result_type="down_cache_icon")
        # final path the cover is saved to
        save_path = self.get_file_path(item=item, result_type="down_icon")
        fu.update_icon(image_path, save_path)
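
    # For reference, ImagesPipeline hands item_completed() a list of
    # (success, info) 2-tuples: on success `info` is a dict with 'url',
    # 'path' and 'checksum' keys; on failure it is a Twisted Failure.
    # A sketch of the shape iterated over below:
    #   [(True, {'url': 'http://.../01.jpg', 'path': 'full/01.jpg', 'checksum': '...'}),
    #    (False, Failure(...))]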
    def item_completed(self, results, item, info):
        # pack the chapter
        cbz_path = self.get_file_path(item, result_type="cbz")
        # keep only the successfully downloaded files
        success_data = [data for ok, data in results if ok]
        image_urls = ComicLoader(item=item).get_image_urls()
        # skip packing unless every page downloaded successfully
        if len(success_data) != len(image_urls):
            return item
        if fu.exists(cbz_path):
            self.update_icon(item)
            self.pack_icon(item)
        else:
            # generate ComicInfo.xml
            comic_info = ComicInfoXmlItemExporter(
                dir=self.get_file_path(item=item, result_type="comic_info")).export_xml(item)
            if CBZUtils.packComicChapterCBZ(src_dir=self.get_file_path(item, result_type="images_dir"),
                                            dts_path=cbz_path,
                                            comic_info_images=comic_info['Pages'],
                                            remove=True):
                self.update_icon(item)
                self.pack_icon(item)
            else:
                # CBZ verification failed
                checkUtils().export_error(item)
        return item
        # sleep_time = random.randint(3, 15)
        # print(f'waiting {sleep_time} seconds before the next chapter')
        # time.sleep(int(sleep_time))
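
# If throttling between chapters is still wanted, Scrapy's built-in settings
# are a cleaner alternative to sleeping inside a pipeline. A minimal sketch
# for settings.py (values are illustrative):
#
#   DOWNLOAD_DELAY = 5           # fixed delay between consecutive requests
#   AUTOTHROTTLE_ENABLED = True  # or let Scrapy adapt the delay dynamically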