# ComicScrapy/Comics/pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
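# To enable these pipelines, register them in settings.py. A minimal sketch,
# assuming ComicsPipeline should run before ImgDownloadPipeline (the priority
# numbers 300/310 are illustrative, not this project's actual values):
#
#   ITEM_PIPELINES = {
#       "Comics.pipelines.ComicsPipeline": 300,
#       "Comics.pipelines.ImgDownloadPipeline": 310,
#   }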
import logging
import os
import random
import shutil
import time

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Comics import settings
from Comics.settings import CBZ_EXPORT_PATH, OUTPUT_DIR, PROJECT_KEY
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.exporters import ComicInfoXmlItemExporter, JsonExport
from Comics.utils.FileUtils import CBZUtils

class ComicsPipeline:
    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        # `item` is the object the spider yielded.
        if isinstance(item, ComicItem):
            file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
            item['count'] = len(item['images'])
            item['index'] = item['chapters'].index(item['chapter']) + 1
            data = JsonExport(file=file).export_json(item, if_return=True)
            # data[PROJECT_KEY] = spider.name
            return data
        return item

    def close_spider(self, spider):
        pass
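
# Rough sketch of the item shape this module assumes, inferred from the field
# accesses in this file (Comics/items.py is the authoritative definition):
#
#   class ComicItem(scrapy.Item):
#       name = scrapy.Field()        # comic title
#       chapter = scrapy.Field()     # current chapter name
#       chapters = scrapy.Field()    # ordered list of all chapter names
#       images = scrapy.Field()      # relative save paths, one per page
#       image_urls = scrapy.Field()  # page image URLs, parallel to images
#       icon = scrapy.Field()        # cover image URL
#       count = scrapy.Field()       # page count, set in process_item
#       index = scrapy.Field()       # 1-based chapter index, set in process_item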

# Image pipeline: downloads page images and the cover, then packs the chapter.
class ImgDownloadPipeline(ImagesPipeline):
    def get_file_path(self, item, file=None, result_type="image"):
        return ComicPath.get_file_path(item=item, file=file, result_type=result_type)

    def image_scramble_exists(self, item, image_path):
        # True if the descrambled copy of this image is already on disk.
        en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
        return os.path.exists(os.path.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))
    # Icon path: CBZ/NAME/CHAPTER.jpg
    def download_icon(self, item, result_type="download"):
        icon_path = self.get_file_path(item, result_type="icon")
        if result_type == "fullpath":
            return os.path.join(settings.IMAGES_STORE, icon_path)
        # Skip if the cover is already stored under IMAGES_STORE.
        if os.path.exists(os.path.join(settings.IMAGES_STORE, icon_path)):
            return False
        # Queue the cover for download alongside the chapter images.
        self.image_urls.append(item['icon'])
        self.images.append(icon_path)
        return True
    def file_path(self, request, response=None, info=None, *, item=None):
        # Each request carries its target path, set in get_media_requests.
        return request.meta['path']
    def get_media_requests(self, item, info):
        # Note: these instance attributes are shared state; concurrently
        # processed items would overwrite each other here.
        self.image_urls = item['image_urls']
        self.images = item['images']
        # Download the cover as well.
        self.download_icon(item)
        for image_url, image in zip(self.image_urls, self.images):
            image_path = self.get_file_path(item, image)
            if self.image_scramble_exists(item, image_path):
                logging.info(f"file exists: IMAGE_STORE {image_path}")
            else:
                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})
    def pack_icon(self, item):
        cbz_icon = self.get_file_path(item=item, result_type="cbz_icon")
        dwn_icon = self.get_file_path(item=item, result_type="down_icon")
        logging.info(f"icon packing {dwn_icon} => {cbz_icon}")
        os.makedirs(os.path.dirname(cbz_icon), exist_ok=True)
        shutil.copyfile(dwn_icon, cbz_icon)
    def item_completed(self, results, item, info):
        # Generate ComicInfo.xml for the chapter.
        comic_info = ComicInfoXmlItemExporter(
            dir=self.get_file_path(item=item, result_type="comic_info")
        ).export_xml(item)
        # Pack the chapter images into a CBZ archive.
        if CBZUtils.packComicChapterCBZ(src_dir=self.get_file_path(item, result_type="images_dir"),
                                        dts_path=self.get_file_path(item, result_type="cbz"),
                                        comic_info_images=comic_info['Pages'], remove=True):
            self.pack_icon(item)
            sleep_time = random.randint(25, 60)
            logging.info(f"Waiting {sleep_time} seconds before the next chapter")
            time.sleep(sleep_time)
        return item
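
# Note: time.sleep() above blocks the whole crawl, which is presumably the
# intent (a pause between packed chapters). If per-request throttling is
# enough, Scrapy's built-in settings can do it without blocking the reactor;
# a sketch for settings.py (values are illustrative):
#
#   DOWNLOAD_DELAY = 3
#   RANDOMIZE_DOWNLOAD_DELAY = True
#   AUTOTHROTTLE_ENABLED = True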