# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
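
# For reference, the matching ITEM_PIPELINES entry in settings.py might look
# like the sketch below; the priority numbers are placeholders, and the module
# path assumes this file lives at Comics/pipelines.py:
#
# ITEM_PIPELINES = {
#     "Comics.pipelines.ComicsPipeline": 300,
#     "Comics.pipelines.ImgDownloadPipeline": 400,
# }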
import logging
import os
import random
import shutil
import time

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Comics import settings
from Comics.settings import CBZ_EXPORT_PATH, OUTPUT_DIR, PROJECT_KEY
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.exporters import ComicInfoXmlItemExporter, JsonExport
from Comics.utils.FileUtils import CBZUtils
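
# A minimal sketch of the settings these pipelines rely on; the values are
# illustrative assumptions, not the project's actual configuration:
#
# IMAGES_STORE = "downloads/images"
# OUTPUT_DIR = "downloads/output"
# CBZ_EXPORT_PATH = "downloads/cbz"
# PROJECT_KEY = "project"
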
class ComicsPipeline:

    def open_spider(self, spider):
        pass

    # item is the object yielded by the spider
    def process_item(self, item, spider):
        if isinstance(item, ComicItem):
            file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
            item['count'] = len(item['images'])
            item['index'] = item['chapters'].index(item['chapter']) + 1
            data = JsonExport(file=file).export_json(item, if_return=True)
            # data[PROJECT_KEY] = spider.name
            return data
        # pass other item types through unchanged
        return item
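
    # A spider is expected to yield items shaped roughly like this; the field
    # names come from the accesses above, the values are illustrative:
    #
    #     yield ComicItem(
    #         name="Some Comic",
    #         chapter="Chapter 1",
    #         chapters=["Chapter 1", "Chapter 2"],
    #         image_urls=["https://example.com/ch1/001.jpg"],
    #         images=["001.jpg"],
    #         icon="https://example.com/cover.jpg",
    #     )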

    # image parsing
    def close_spider(self, spider):
        pass

class ImgDownloadPipeline(ImagesPipeline):

    # Resolve storage paths through the project's ComicPath helper.
    def get_file_path(self, item, file=None, result_type="image"):
        return ComicPath.get_file_path(item=item, file=file, result_type=result_type)

    # Check whether the descrambled copy of an image already exists on disk.
    def image_scramble_exists(self, item, image_path):
        en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
        return os.path.exists(os.path.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))

    # Icon path: CBZ/NAME/CHAPTER.jpg
    # Queue the chapter cover for download unless it is already stored.
    def download_icon(self, item, result_type="download"):
        icon_path = self.get_file_path(item, result_type="icon")
        if result_type == "fullpath":
            return os.path.join(settings.IMAGES_STORE, icon_path)
        # check against the images store, consistent with image_scramble_exists
        if os.path.exists(os.path.join(settings.IMAGES_STORE, icon_path)):
            return False
        else:
            self.image_urls.append(item['icon'])
            self.images.append(icon_path)
            return True

    # ImagesPipeline hook: store each file at the path stashed in request.meta.
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['path']

    def get_media_requests(self, item, info):
        self.image_urls = item['image_urls']
        self.images = item['images']
        # download the cover
        self.download_icon(item)
        for image_url, image in zip(self.image_urls, self.images):
            image_path = self.get_file_path(item, image)
            if self.image_scramble_exists(item, image_path):
                logging.info(f"file exists: IMAGE_STORE {image_path}")
            else:
                logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    # Copy the downloaded cover next to the packed CBZ archive.
    def pack_icon(self, item):
        cbz_icon = self.get_file_path(item=item, result_type="cbz_icon")
        dwn_icon = self.get_file_path(item=item, result_type="down_icon")
        logging.info(f"icon packing {dwn_icon} => {cbz_icon}")
        cbz_icon_dir = os.path.dirname(cbz_icon)
        os.makedirs(cbz_icon_dir, exist_ok=True)
        shutil.copyfile(dwn_icon, cbz_icon)

    def item_completed(self, results, item, info):
        # generate the ComicInfo.xml metadata
        comic_info = ComicInfoXmlItemExporter(dir=self.get_file_path(item=item, result_type="comic_info")).export_xml(item)
        # pack the chapter images into a CBZ archive
        if CBZUtils.packComicChapterCBZ(src_dir=self.get_file_path(item, result_type="images_dir"),
                                        dts_path=self.get_file_path(item, result_type="cbz"),
                                        comic_info_images=comic_info['Pages'], remove=True):
            self.pack_icon(item)

        # throttle before the next chapter to avoid hammering the site
        sleep_time = random.randint(25, 60)
        print(f'Waiting {sleep_time} seconds before the next chapter')
        time.sleep(sleep_time)
        return item
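
# To exercise these pipelines end to end, run one of the project's spiders
# (the spider name is a placeholder for whatever Comics defines):
#
#     scrapy crawl <spider_name>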