63 lines
2.6 KiB
Python
63 lines
2.6 KiB
Python
# Define your item pipelines here
|
|
#
|
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
|
|
|
|
|
# useful for handling different item types with a single interface
|
|
import os, scrapy,logging,time,random
|
|
from Comics import settings
|
|
from Comics.utils.FileUtils import imageUtils
|
|
from Comics.utils.FileUtils import fileUtils
|
|
from Comics.utils.Constant import ComicPath
|
|
from Comics.items import ComicItem
|
|
from Comics.items import ImagesItem
|
|
from scrapy.pipelines.images import ImagesPipeline
|
|
from Comics.exporters import ComicInfoXmlItemExporter
|
|
from Comics.exporters import ItemExporter
|
|
from Comics.exporters import JsonExport
|
|
from Comics.utils.FileUtils import CBZUtils
|
|
|
|
class ComicsPipeline:
    """Export ComicItem chapters as JSON; pass all other item types through."""

    def open_spider(self, spider):
        # No per-spider setup required.
        pass

    # item is the object yielded by the spider
    def process_item(self, item, spider):
        """Export a ComicItem to <OUTPUT_DIR>/json/<name>/<chapter> and return
        the exported data dict; any other item type is returned unchanged so
        later pipelines (e.g. the image pipeline) still receive it.
        """
        if isinstance(item, ComicItem):
            file = os.path.join(settings.OUTPUT_DIR, "json", item['name'], item['chapter'])
            data = JsonExport(file=file).export_json(item, if_return=True)
            #item['images'] = data['images']
            return data
        # BUG FIX: previously fell through and returned None for non-ComicItem
        # items, which silently drops them before subsequent pipelines run.
        # Scrapy pipelines must return an item (or raise DropItem).
        return item

    def close_spider(self, spider):
        # No teardown required.
        pass
|
|
|
|
|
|
class ImgDownloadPipeline(ImagesPipeline):
    """Download chapter images, then emit a ComicInfo.xml and pack the chapter
    into a CBZ archive once all downloads for the item have completed."""

    def file_exits(self, image_path):
        # NOTE(review): method name looks like a typo for "file_exists";
        # kept unchanged for backward compatibility with any external callers.
        # Checks whether the (scrambled) image file is already on disk so the
        # download can be skipped.
        en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
        return os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path))

    def file_full_path(self, item, image):
        # Relative storage path: <comic name>/<chapter>/<image file>.
        return os.path.join(item['name'], item['chapter'], image)

    def file_path(self, request, response=None, info=None, *, item=None):
        # Store each download at the path computed in get_media_requests.
        return request.meta['path']

    def get_media_requests(self, item, info):
        """Yield a download Request for each image that is not already on disk."""
        for image_url, image_path in zip(item['image_urls'], item['images']):
            image_path = self.file_full_path(item, image_path)
            if self.file_exits(image_path):
                logging.info(f"file exists: {image_path}")
            else:
                logging.info(f"downloading {image_url} --> {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    def item_completed(self, results, item, info):
        """Record download results, generate ComicInfo.xml, pack the CBZ and
        return the item so later pipelines still receive it."""
        item['images_name'] = results
        # Generate the ComicInfo.xml metadata for the chapter
        comic_info = ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
        # Pack the chapter into a CBZ archive
        CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'],
                                     comic_info_images=comic_info["Pages"], remove=False)
        # NOTE(review): time.sleep blocks Scrapy's reactor thread; prefer
        # DOWNLOAD_DELAY / AUTOTHROTTLE settings for rate limiting — confirm
        # intent before removing.
        time.sleep(random.randint(5, 10))
        # BUG FIX: item_completed must return the item (the original had
        # `# return item` commented out), otherwise the item is dropped
        # before any subsequent pipeline runs.
        return item