ComicScrapy/Comics/items.py
2024-02-20 21:08:13 +08:00

137 lines
5.0 KiB
Python

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import os,Comics.settings as settings,logging
from scrapy.item import Item, Field
from Comics.utils import ComicPath
from Comics.utils import imageUtils
from itemloaders.processors import TakeFirst
# 繁体中文转为简体中文
def serialize_to_chinese(value):
    """Serializer: convert Traditional Chinese text to Simplified Chinese."""
    return ComicPath.chinese_convert(value)
# 将路径字符串转为合法路径
def serialize_to_fix_file(value):
    """Serializer: simplify Chinese text, then sanitize it into a legal file name."""
    simplified = ComicPath.chinese_convert(value)
    return ComicPath.fix_file_name(simplified)
# 图片数据解析(私有方法)
def _serialize_to_images(value, result_type=None):
    """Parse scraped image entries into local file names and download URLs.

    Each entry in ``value`` is either a dict like ``{"src": url, "scramble": bool}``
    or a bare URL/path string (treated as unscrambled).

    :param value: iterable of image entries (dicts or strings).
    :param result_type: ``"image_urls"`` to return the raw source URLs;
        anything else returns the generated local image names.
    :return: list of image names, or list of source URLs.
    """
    images_item = []
    image_urls = []
    # enumerate replaces the original hand-maintained counter.
    for count, image in enumerate(value, start=1):
        # Dict-like entries carry src/scramble; plain strings are the src itself.
        # (The original used a bare `except:` here, which also hid real bugs.)
        if hasattr(image, "get"):
            image_src = image.get("src")
            scramble = image.get("scramble")
        else:
            image_src, scramble = image, False
        count_image = settings.IMAGES_NAME_FORMAT.format(count)
        # NOTE(review): suffix is hard-coded; an earlier version derived it
        # from the URL (see the commented-out line in history) — confirm .jpg
        # is always correct for this source.
        suffix = ".jpg"
        image_name = count_image + suffix
        if scramble:
            # Scrambled images encode their block count in the last URL path
            # segment; decode it to build the de-scramble-aware file name.
            de_str = str(image_src).split("/")[-1].replace(suffix, "==")
            blocks_num = imageUtils.encodeImage(de_str)
            image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
        if str(image_src).startswith('http'):
            # Remote source: store the generated local name.
            images_item.append(image_name)
        else:
            # Already a local name/path: keep it as-is.
            images_item.append(image_src)
        image_urls.append(image_src)
    # Lazy %-formatting avoids building the message when DEBUG is off.
    logging.debug("images_len: %s", len(images_item))
    return image_urls if result_type == "image_urls" else images_item
# 图像处理方法
def serialize_to_images(value):
    """Serializer: map scraped image entries to local image file names."""
    return _serialize_to_images(value)
# 图像链接处理方法
def serialize_to_image_urls(value):
    """Serializer: map scraped image entries to their download URLs."""
    return _serialize_to_images(value, result_type="image_urls")
# ComicItem
class ComicItem(Item):
    """Item for one scraped comic chapter: identity, metadata and image lists."""
    # scrapy project/spider the item came from
    current_project = Field()
    # source page link
    link = Field()
    # all chapters
    chapters = Field()
    # chapter index / number
    index = Field(output_processor=TakeFirst())
    # comic name (simplified Chinese, sanitized into a legal file name)
    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # chapter name (simplified Chinese, sanitized into a legal file name)
    chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # image links
    list_img = Field(serializer=serialize_to_images)
    # author
    # BUG FIX: was `serialize_to_chinese=serialize_to_chinese` — an arbitrary
    # keyword that Scrapy stores as inert Field metadata, so the converter
    # never ran. `serializer=` matches the sibling fields (tags, dep).
    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # cover link
    icon = Field(output_processor=TakeFirst())
    # tags
    tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # summary / description
    dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # date
    date = Field(output_processor=TakeFirst())
    # genre
    genre = Field(output_processor=TakeFirst())
    # age rating
    age_rating = Field(output_processor=TakeFirst())
    # total (page) count
    count = Field()
    # images from a previous scrape of this chapter
    images_old = Field(serializer=serialize_to_images)
    # full image set of the chapter
    images = Field(serializer=serialize_to_images)
    # image download URLs
    image_urls = Field(serializer=serialize_to_image_urls)
    # image names
    images_name = Field()
    # chapter link
    chapter_href = Field()
    # chapter API
    chapter_api = Field()
# 序列化-作者
def serializer_info_writer(value):
    """Serializer for the Writer field: normalize an author string.

    "&"-separated author names become whitespace-separated tokens, which are
    de-duplicated and joined with commas.

    :param value: author string, e.g. ``"Alice&Bob"`` or ``"Alice Bob"``.
    :return: comma-joined, de-duplicated author names.
    """
    text = str(value).replace("&", " ")
    # BUG FIX: split() without an argument drops the empty tokens that
    # split(" ") produced around "&"; dict.fromkeys de-duplicates while
    # preserving first-seen order, unlike the previous set(), whose
    # iteration order made the output nondeterministic.
    return ",".join(dict.fromkeys(text.split()))
# (私有)序列化-图像
def _serialize_info_images(value, result_type=None):
    """(Private) Serialize chapter image entries for ComicInfo output.

    Entries that already contain a path separator are skipped; every other
    entry is mapped through ``ComicPath().getFileScrambleImageSave``. With
    ``result_type="count"`` the number of kept entries is returned instead
    of the list itself.
    """
    kept = [
        ComicPath().getFileScrambleImageSave(entry, True, False)
        for entry in value
        if os.sep not in entry
    ]
    return len(kept) if result_type == "count" else kept
# 序列化-图像
def serializer_info_images(value):
    """Serializer: image entries for the Pages field."""
    return _serialize_info_images(value)
# 序列化-合计
def serializer_info_images_count(value):
    """Serializer: number of image entries (total page count)."""
    return _serialize_info_images(value, "count")
class ComicInfoItem(Item):
    """Item mirroring the ComicInfo.xml metadata schema (and ComicChapter.json).

    Each Field carries two custom metadata keys: ``desc``, a human-readable
    (Chinese) description, and ``info``, the name of the ComicItem field this
    value is copied from. Fields without ``info`` have no scraped source here.
    """
    Title = Field(desc="章节名", info='chapter')
    Series = Field(desc="漫画名", info='name')
    Number = Field(desc="编号", info='index')
    SeriesGroup = Field(desc="别名")
    Summary = Field(desc="概述", info='dep')
    Year = Field(desc="")
    Month = Field(desc="")
    Day = Field(desc="")
    # Writer values are "&"/space separated; serializer de-duplicates and comma-joins.
    Writer = Field(desc="作者", info='author',serializer=serializer_info_writer)
    Publisher = Field(desc="出版社")
    Genre = Field(desc="流派", info='genre')
    Tags = Field(desc="标签", info='tags')
    Web = Field(desc="主页")
    PageCount = Field(desc="总页数", info='count')
    LanguageISO = Field(desc="语言")
    AgeRating = Field(desc="年龄分级", info='age_rating')
    # Pages entries are rewritten to local save names via serializer_info_images.
    Pages = Field(desc="页码", info='images', serializer=serializer_info_images)
# ComicInfo.xml and ComicChapter.json end