This commit is contained in:
cwx 2023-11-11 23:19:43 +08:00
parent fbcf514bed
commit 2a9820949b
4 changed files with 65 additions and 74 deletions

View File

@ -7,14 +7,14 @@ from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from Comics.utils.FileUtils import imageUtils
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.spiders import Spider
def current_project():
    """Return the Scrapy spider class-level ``name`` attribute."""
    return Spider.name
# Convert Traditional Chinese text to Simplified Chinese.
def serialize_to_chinese(value):
    """Delegate Traditional-to-Simplified conversion to ComicPath.chinese_convert."""
    return ComicPath.chinese_convert(value)
# Normalize a path string into a legal file name.
def serialize_to_fix_file(value):
    """Convert value to Simplified Chinese, then sanitize it into a valid file name."""
    simplified = ComicPath.chinese_convert(value)
    return ComicPath.fix_file_name(simplified)
# 图片数据解析(私有方法)
def _serialize_to_images(value, result_type=None):
(count, images_item, image_urls) = [1,[],[]]
for image in value:
@ -30,7 +30,6 @@ def _serialize_to_images(value, result_type=None):
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
blocks_num = imageUtils.encodeImage(de_str)
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
#images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name))
if str(image_src).startswith('http'):
images_item.append(image_name)
else:
@ -41,11 +40,13 @@ def _serialize_to_images(value, result_type=None):
if result_type == "image_urls": return image_urls
else: return images_item
# Image serializer: full image-item list.
def serialize_to_images(value):
    """Delegate to the shared image parser with the default result type."""
    return _serialize_to_images(value)
# Image serializer: URL list only.
def serialize_to_image_urls(value):
    """Delegate to the shared image parser, requesting only the image URLs."""
    return _serialize_to_images(value, result_type="image_urls")
# ComicItem
class ComicItem(Item):
# 工程
current_project = Field()
@ -75,42 +76,25 @@ class ComicItem(Item):
genre = Field(output_processor=TakeFirst())
# 年龄分级
age_rating = Field(output_processor=TakeFirst())
# 合计
count = Field()
# 旧图片
images_old = Field(serializer=serialize_to_images)
# 章节图像合集
images = Field(serializer=serialize_to_images)
# 图像链接
image_urls = Field(serializer=serialize_to_image_urls)
# 图像名
images_name = Field()
# Scrapy item describing a single downloaded image.
class ImagesItem(Item):
    # Per-image fields; name/url/path match the keyword usage seen in the
    # commented-out ImagesItem(...) construction elsewhere in this file.
    image_name = Field()
    image_url = Field()
    image_path = Field()
    # NOTE(review): purpose of the plural fields is not evident from this
    # file — presumably they mirror ComicItem's images/image_urls; confirm.
    images = Field()
    image_urls = Field()
    # NOTE(review): presumably a back-reference to the owning comic — confirm.
    comic = Field()
# Serializer for the Writer field: normalize author separators and de-duplicate.
def serializer_info_writer(value):
    """Serialize an author string into a comma-joined, de-duplicated list.

    '&' separators are rewritten to spaces so both delimiters split the
    same way.  De-duplication uses dict.fromkeys, which preserves first-seen
    order — the original used set(), whose iteration order is
    nondeterministic across runs, making the serialized output unstable.
    """
    names = str(value).replace("&", " ").split(" ")
    return ",".join(dict.fromkeys(names))
# Result_type selector: 'name' -> scrambled names, 'len' -> count, default -> paths.
def _serializer_info_imagesa(value, result_type=None):
    """Collect image paths from (success, info) download-result pairs.

    result_type: 'name' collects scrambled save names instead of raw
    paths; 'len' returns the number of collected entries; anything else
    returns the collected list itself.
    """
    collected = []
    for _success, img in value:
        full_path = os.path.join(settings.IMAGES_STORE, img['path'])
        if result_type == 'name':
            collected.append(ComicPath().getFileScrambleImageSave(full_path, True, False))
        else:
            collected.append(full_path)
    return len(collected) if result_type == "len" else collected
# (私有)序列化-图像
def _serialize_info_images(value, result_type=None):
images = []
for image in value:
@ -121,34 +105,28 @@ def _serialize_info_images(value, result_type=None):
else:
return images
# Image serializer (default result type).
def serializer_info_images(value):
    """Serialize chapter images via the shared private serializer."""
    return _serialize_info_images(value)
# Image serializer: total count (positional "count" result type).
def serializer_info_images_count(value):
    """Serialize images requesting the "count" result type."""
    return _serialize_info_images(value, "count")
def serializer_info_images_completed(value):
    """Serialize images requesting the scrambled-name ('name') result type."""
    return _serialize_info_images(value, result_type='name')
def serializer_info_images_count(value):
    """Serialize images requesting the element count ('len') result type."""
    return _serialize_info_images(value, result_type='len')
class ComicInfoItem(Item):
    """Item mapped onto ComicInfo.xml metadata.

    ``info`` names the source key on the scraped comic data; ``desc`` is a
    human-readable label.  The previous class body assigned every field
    twice (a leftover first set immediately shadowed by the second) and
    carried commented-out alternatives; only the surviving assignments are
    kept — the class namespace, and therefore behavior, is unchanged.
    """
    Title = Field(desc="章节名", info='chapter')
    Series = Field(desc="漫画名", info='name')
    Number = Field(desc="编号", info='index')
    SeriesGroup = Field(desc="别名")
    Summary = Field(desc="概述", info='dep')
    Year = Field(desc="")
    Month = Field(desc="")
    Day = Field(desc="")
    Writer = Field(desc="作者", info='author', serializer=serializer_info_writer)
    Publisher = Field(desc="出版社")
    Genre = Field(desc="流派", info='genre')
    Tags = Field(desc="标签", info='tags')
    Web = Field(desc="主页")
    PageCount = Field(desc="总页数", info='count')
    LanguageISO = Field(desc="语言")
    AgeRating = Field(desc="年龄分级", info='age_rating')
    Pages = Field(desc="页码", info='images', serializer=serializer_info_images)
# ComicInfo.xml and ComicChapter.json end

View File

@ -5,27 +5,27 @@
# useful for handling different item types with a single interface
import os, scrapy,logging,time,random,shutil
import os,scrapy,logging
from Comics import settings
from Comics.settings import CBZ_EXPORT_PATH,OUTPUT_DIR,PROJECT_KEY
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter,JsonExport,ItemExporter, ItemImport
from Comics.utils.FileUtils import CBZUtils
from Comics.utils.FileUtils import fileUtils as fu
from Comics.settings import OUTPUT_DIR
from Comics.loader import ComicEntity
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.utils.FileUtils import CBZUtils,fileUtils as fu
from Comics.utils.Constant import ComicPath
from Comics.utils.ComicUtils import checkUtils
from Comics.exporters import JsonExport,ItemExporter
from scrapy.pipelines.images import ImagesPipeline
class ComicsPipeline:
class ComicsPipeline():
def open_spider(self, spider):
pass
# item就是yield后面的对象
def process_item(self, item, spider):
if isinstance(item, ComicItem):
# 'output/rm_comic/json/壞X/第1話 壞X'
if os.path.exists(ComicPath.CBZ(item=item)):
# 'output/rm_comic/json/壞X/第1話 壞X'
if fu.exists(ComicPath.path_cbz(item=item)):
return ItemExporter().export_obj(item)
else:
file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
@ -37,18 +37,18 @@ class ComicsPipeline:
class ImgDownloadPipeline(ImagesPipeline):
def get_file_path(self, item, file=None, result_type="image"):
    # Resolve a storage path for this item via ComicPath.
    # NOTE(review): the two return statements below look like interleaved
    # before/after lines of a rendered diff (classmethod-style call vs.
    # instance call); as written the second is unreachable — confirm which
    # form is intended.
    return ComicPath.get_file_path(item=item, file=file, result_type= result_type)
    return ComicPath().get_file_path(item=item, file=file, result_type= result_type)
def image_scramble_exits(self, item,image_path):
    # Check whether the scrambled ("encoded") variant of image_path already
    # exists under the image store.
    en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
    # NOTE(review): duplicated return — looks like the old (os.path) and new
    # (fu wrapper) versions of the same line from a rendered diff; as written
    # the second is unreachable. Confirm which one is intended.
    return os.path.exists(os.path.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))
    return fu.exists(fu.join(settings.IMAGES_STORE, self.get_file_path(item, en_image_path)))
## Icon Path : CBZ/NAME/CHAPTER.jpg
def download_icon(self, item, result_type="download"):
icon_path = self.get_file_path(item, result_type="icon_cache")
if result_type == "fullpath":
return os.path.join(settings.IMAGES_STORE, icon_path)
if os.path.exists(icon_path):
return fu.join(settings.IMAGES_STORE, icon_path)
if fu.exists(icon_path):
return False
else:
self.image_urls.append(item['icon'])
@ -80,10 +80,10 @@ class ImgDownloadPipeline(ImagesPipeline):
def pack_icon(self, item):
cbz_icon = self.get_file_path(item=item, result_type="cbz_icon")
dwn_icon = self.get_file_path(item=item, result_type="down_icon")
base_dir = os.path.dirname(dwn_icon)
name = os.path.basename(dwn_icon).split(".")[0]
base_dir = fu.dirname(dwn_icon)
name = fu.basename(dwn_icon).split(".")[0]
for dirname in os.listdir(base_dir):
path = os.path.join(base_dir, dirname)
path = fu.join(base_dir, dirname)
if os.path.isfile(path) and dirname.startswith(name):
fu.update_icon(path, cbz_icon)
@ -97,12 +97,12 @@ class ImgDownloadPipeline(ImagesPipeline):
save_path = self.get_file_path(item=item, result_type="down_icon")
fu.update_icon(image_path, save_path)
def item_completed(self, results, item, info):
# return item
# 打包
cbz_path = self.get_file_path(item, result_type="cbz")
if os.path.exists(cbz_path):
if fu.exists(cbz_path):
self.update_icon(item)
self.pack_icon(item)
else:

View File

@ -90,7 +90,7 @@ class ComicPath:
return file
@classmethod
def CBZ(cls, item):
def path_cbz(cls, item):
return cls.get_file_path(item, result_type="cbz", convert=True)

View File

@ -8,6 +8,19 @@ from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
from Comics.utils.Constant import ntfy
class fileUtils:
@classmethod
def exists(cls, path): return os.path.exists(path)
@classmethod
def join(cls, path, *paths): return os.path.join(path, *paths);
@classmethod
def dirname(cls, path): return os.path.dirname(path);
@classmethod
def basename(cls, path): return os.path.basename(path);
@classmethod
def save_file(cls,path,data):
root_dir = os.path.dirname(path)