161 lines
5.7 KiB
Python
161 lines
5.7 KiB
Python
# Define here the models for your scraped items
|
|
#
|
|
# See documentation in:
|
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
|
import os,Comics.settings as settings,logging
|
|
from scrapy.item import Item, Field
|
|
from Comics.utils import ComicPath
|
|
from Comics.utils import imageUtils,fileUtils
|
|
from itemloaders.processors import TakeFirst
|
|
|
|
# Convert Traditional Chinese to Simplified Chinese
def serialize_to_chinese(value):
    """Return *value* passed through ``ComicPath.chinese_convert``."""
    converted = ComicPath.chinese_convert(value)
    return converted
|
|
|
|
# Convert a path string into a legal path
def serialize_to_fix_file(value):
    """Return *value* passed through ``ComicPath.fix_file_name``."""
    fixed = ComicPath.fix_file_name(value)
    return fixed
|
|
|
|
def serialize_name_to_chinese(value):
    """Convert *value* with ``ComicPath.chinese_convert``, then clean it with
    ``ComicPath.fix_file_name``.
    """
    converted = ComicPath.chinese_convert(value)
    return ComicPath.fix_file_name(converted)
|
|
|
|
# Parse image data (private helper)
def _serialize_to_images(value, result_type=None):
    """Build the image-name list and the image-source list for *value*.

    Args:
        value: Iterable of image entries. An entry is either a mapping-like
            object exposing ``get("src")`` / ``get("scramble")`` or a plain
            source value (for example a URL or path string).
        result_type: ``"image_urls"`` returns the list of sources; any other
            value returns the list of image names.

    Returns:
        list: The sources in input order when ``result_type == "image_urls"``;
        otherwise one entry per image, which is the generated file name when
        the source starts with ``http`` and the source itself when it does not.
    """
    # The extension is fixed to ".jpg" rather than derived from the source.
    suffix = ".jpg"
    images_item = []
    image_urls = []
    for count, image in enumerate(value, start=1):
        try:
            image_src, scramble = image.get("src"), image.get("scramble")
        except (AttributeError, TypeError):
            # Entry is not mapping-like (e.g. a plain string): treat the entry
            # itself as the source and as not scrambled.
            image_src, scramble = image, False
        count_image = settings.IMAGES_NAME_FORMAT.format(count)
        image_name = count_image + suffix
        # Only the literal string "True" marks a scrambled image.
        # NOTE(review): a boolean True does not match here - confirm callers
        # always pass the flag as a string.
        if scramble == "True":
            # Last URL segment with the suffix replaced by "==" is handed to
            # imageUtils.encodeImage; presumably the result is the block count
            # needed to unscramble the image - confirm in imageUtils.
            de_str = str(image_src).split("/")[-1].replace(suffix, "==")
            blocks_num = imageUtils.encodeImage(de_str)
            image_name = ComicPath.getFileScrambleImageName(
                count=count_image, block=blocks_num, suffix=suffix
            )
        if str(image_src).startswith("http"):
            images_item.append(image_name)
        else:
            images_item.append(image_src)
        image_urls.append(image_src)
    logging.debug("images_len: %s", len(images_item))
    return image_urls if result_type == "image_urls" else images_item
|
|
|
|
# Image processing
def serialize_to_images(value):
    """Return the image-name list that ``_serialize_to_images`` builds for *value*."""
    image_names = _serialize_to_images(value)
    return image_names
|
|
|
|
# Image link processing
def serialize_to_image_urls(value):
    """Return the image-source list that ``_serialize_to_images`` builds for *value*."""
    urls = _serialize_to_images(value, result_type="image_urls")
    return urls
|
|
|
|
# ComicItem
class ComicItem(Item):
    """Item describing a scraped comic, its chapter and the chapter's images.

    Fields declared with ``output_processor=TakeFirst()`` keep only the first
    collected value; ``serializer`` metadata names the function attached to
    the field for serialization.
    """

    # Project
    current_project = Field()
    # Link
    link = Field()
    # All chapters
    chapters = Field()
    # Index number
    index = Field(output_processor=TakeFirst())
    # Comic name
    name = Field(serializer=serialize_name_to_chinese, output_processor=TakeFirst())
    # Source comic name
    s_name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # Chapter name
    chapter = Field(serializer=serialize_name_to_chinese, output_processor=TakeFirst())
    # Source chapter name
    s_chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # Image links
    list_img = Field(serializer=serialize_to_images)
    # Author. The metadata key must be ``serializer`` (as on the sibling
    # fields); the previous ``serialize_to_chinese=`` key was a typo.
    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Cover link
    icon = Field(output_processor=TakeFirst())
    # Tags
    tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Summary
    dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Date
    date = Field(output_processor=TakeFirst())
    # Genre
    genre = Field(output_processor=TakeFirst())
    # Age rating
    age_rating = Field(output_processor=TakeFirst())
    # Total count
    count = Field()
    # Old images
    images_old = Field(serializer=serialize_to_images)
    # Chapter image collection
    images = Field(serializer=serialize_to_images)
    # Image links
    image_urls = Field(serializer=serialize_to_image_urls)
    # Image names
    images_name = Field()

    domain = Field()
    # Chapter link
    chapter_href = Field()
    # Chapter API
    chapter_api = Field()
|
|
|
|
class BooksItem(Item):
    """Item with a project field plus ``names`` and ``urls`` fields."""

    current_project = Field()
    names = Field()
    urls = Field()
|
|
|
|
class ImageItem(Item):
    """Item describing a single image: url, name, path, type and scramble flag."""

    image_url = Field()
    image_name = Field()
    image_path = Field()
    image_type = Field()
    isScramble = Field()
|
|
|
|
class Image:
    """Builder for image descriptor dicts with ``src`` and ``scramble`` keys."""

    def setImage(self, url, scramble):
        """Return a dict mapping ``src`` to *url* and ``scramble`` to *scramble*."""
        return dict(src=url, scramble=scramble)
|
|
|
|
# Serialize - author
def serializer_info_writer(value):
    """Normalise an author string into a comma-separated list of unique names.

    ``&`` is treated as a separator equivalent to a space. Names are split on
    single spaces, empty tokens (produced by consecutive separators such as
    ``"A & B"``) are dropped, and duplicates are removed while keeping the
    order of first appearance so the result is deterministic.

    Args:
        value: Author text; it is converted with ``str()`` before processing.

    Returns:
        str: The unique names joined with ``","``.
    """
    tokens = str(value).replace("&", " ").split(" ")
    # dict.fromkeys de-duplicates while preserving first-seen order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    unique_names = dict.fromkeys(token for token in tokens if token)
    return ",".join(unique_names)
|
|
|
|
# (Private) serialize - images
def _serialize_info_images(value, result_type=None):
    """Map image names in *value* through ``ComicPath().getFileScrambleImageSave``.

    Entries that contain ``os.sep`` are skipped. When *result_type* is
    ``"count"`` the number of mapped entries is returned; otherwise the list
    of mapped entries is returned.
    """
    saved_names = [
        ComicPath().getFileScrambleImageSave(entry, True, False)
        for entry in value
        if os.sep not in entry
    ]
    if result_type != "count":
        return saved_names
    return len(saved_names)
|
|
|
|
# Serialize - images
def serializer_info_images(value):
    """Return the list that ``_serialize_info_images`` produces for *value*."""
    pages = _serialize_info_images(value)
    return pages
|
|
|
|
# Serialize - total count
def serializer_info_images_count(value):
    """Return the entry count that ``_serialize_info_images`` reports for *value*."""
    total = _serialize_info_images(value, "count")
    return total
|
|
|
|
class ComicInfoItem(Item):
    """Item whose fields carry ``desc`` / ``info`` metadata for comic info output.

    ``desc`` is a human-readable label. Every ``info`` value is the name of a
    ``ComicItem`` field; presumably it is used to look up the source value
    when this item is filled in (confirm against the code that consumes it).
    """

    # Chapter name
    Title = Field(desc="章节名", info='chapter')
    # Comic name
    Series = Field(desc="漫画名", info='name')
    # Index number
    Number = Field(desc="编号", info='index')
    # Alias
    SeriesGroup = Field(desc="别名")
    # Summary
    Summary = Field(desc="概述", info='dep')
    # Year / month / day
    Year = Field(desc="年")
    Month = Field(desc="月")
    Day = Field(desc="日")
    # Author
    Writer = Field(desc="作者", info='author', serializer=serializer_info_writer)
    # Publisher
    Publisher = Field(desc="出版社")
    # Genre
    Genre = Field(desc="流派", info='genre')
    # Tags
    Tags = Field(desc="标签", info='tags')
    # Home page
    Web = Field(desc="主页")
    # Total page count
    PageCount = Field(desc="总页数", info='count')
    # Language
    LanguageISO = Field(desc="语言")
    # Age rating
    AgeRating = Field(desc="年龄分级", info='age_rating')
    # Pages
    Pages = Field(desc="页码", info='images', serializer=serializer_info_images)
|
|
# ComicInfo.xml and ComicChapter.json end
|
|
|