# ComicScrapy/Comics/pipelines.py
# Last modified: 2023-06-20 02:52:51 +08:00

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import os, scrapy,logging,time,random
from Comics import settings
from Comics.utils.FileUtils import imageUtils
from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImagesItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.exporters import ItemExporter
from Comics.exporters import JsonExport
from Comics.utils.FileUtils import CBZUtils
class ComicsPipeline:
    """Exports ComicItem metadata to a per-chapter JSON file; forwards all
    other item types unchanged to the next pipeline."""

    def open_spider(self, spider):
        # No per-spider setup required.
        pass

    # item is the object the spider yields.
    def process_item(self, item, spider):
        """Serialize a ComicItem under <OUTPUT_DIR>/json/<name>/<chapter>
        and return the exported data dict; any other item type is returned
        untouched so later pipelines (e.g. the image downloader) still see it.
        """
        if isinstance(item, ComicItem):
            file = os.path.join(settings.OUTPUT_DIR, "json", item['name'], item['chapter'])
            data = JsonExport(file=file).export_json(item, if_return=True)
            return data
        # Fix: the original fell through and returned None here, which
        # silently dropped every non-ComicItem from the pipeline chain.
        return item

    def close_spider(self, spider):
        # No per-spider teardown required.
        pass
class ImgDownloadPipeline(ImagesPipeline):
    """Downloads chapter images (skipping files already saved in their
    scrambled form), then emits a ComicInfo.xml and packs the chapter
    into a CBZ archive."""

    def file_exists(self, image_path):
        """Return True when the scrambled image for *image_path* is already
        present under settings.IMAGES_STORE."""
        en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
        return os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path))

    # Backward-compatible alias for the original misspelled name.
    file_exits = file_exists

    def file_full_path(self, item, image):
        """Relative storage path: <comic name>/<chapter>/<image file>."""
        return os.path.join(item['name'], item['chapter'], image)

    def file_path(self, request, response=None, info=None, *, item=None):
        # The target path was computed in get_media_requests and carried
        # through the request meta, so just echo it back to ImagesPipeline.
        return request.meta['path']

    def get_media_requests(self, item, info):
        """Yield one download Request per image that is not yet on disk."""
        for image_url, image_path in zip(item['image_urls'], item['images']):
            image_path = self.file_full_path(item, image_path)
            if self.file_exists(image_path):
                logging.info("file exists: %s", image_path)
            else:
                logging.info("downloading %s --> %s", image_url, image_path)
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    def item_completed(self, results, item, info):
        """Record download results, generate ComicInfo.xml, and pack the
        chapter CBZ.

        Fix: the original returned None (note its commented-out
        ``# return item``), which dropped the item from any later
        pipeline — the item is now returned.
        """
        item['images_name'] = results
        # Generate the ComicInfo.xml metadata for this chapter.
        comic_info = ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
        # Pack the chapter into a .cbz archive (source files kept).
        CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'],
                                     comic_info_images=comic_info["Pages"], remove=False)
        # NOTE(review): this sleep blocks Scrapy's reactor thread; prefer
        # DOWNLOAD_DELAY / AUTOTHROTTLE — kept for behavior compatibility.
        time.sleep(random.randint(5, 10))
        return item