ComicScrapy/Comics/items.py
2023-06-20 02:52:51 +08:00

152 lines
5.5 KiB
Python

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import os,Comics.settings as settings,logging
from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from Comics.utils.FileUtils import imageUtils
from scrapy.loader.processors import TakeFirst, MapCompose, Join
def serialize_to_chinese(value):
    """Return ``value`` passed through ``ComicPath.chinese_convert``.

    NOTE(review): presumably this normalises between Chinese script
    variants; the helper lives in ``Comics.utils.Constant`` -- confirm there.
    """
    converted = ComicPath.chinese_convert(value)
    return converted
def serialize_to_fix_file(value):
    """Return ``value`` converted and then cleaned up for use as a file name.

    The value first goes through ``ComicPath.chinese_convert`` and the
    result is handed to ``ComicPath.fix_file_name``.
    """
    return ComicPath.fix_file_name(ComicPath.chinese_convert(value))
def _serialize_to_images(value, result_type=None):
count = 1
images_item = []
image_urls = []
for image in value:
(image_src, scramble) = [image.get("src"), image.get("scramble")]
count_image = settings.IMAGES_NAME_FORMAT.format(count)
suffix = "."+str(image_src).split(".")[-1]
image_name = count_image + suffix
if scramble:
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
blocks_num = imageUtils.encodeImage(de_str)
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
#images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name))
images_item.append(image_name)
image_urls.append(image_src)
count += 1
logging.info(f"images_len: {len(images_item)}")
if result_type == "image_urls": return image_urls
else: return images_item
def serialize_to_images(value):
    """Return the generated image file names for ``value``.

    Delegates to ``_serialize_to_images`` with its default result type.
    """
    return _serialize_to_images(value, result_type=None)
def serialize_to_image_urls(value):
    """Return the source URLs for ``value``.

    Delegates to ``_serialize_to_images`` asking for ``"image_urls"``.
    """
    return _serialize_to_images(value, "image_urls")
class ListComicItem(Item):
    """Item with two plain fields: ``name`` and ``link``."""

    name = Field()
    link = Field()
class ComicItem(Item):
    """Scraped data for one comic chapter.

    Scalar fields use ``TakeFirst`` as their output processor. Fields that
    need post-processing declare it under the ``serializer`` metadata key.
    """

    # Number (index).
    index = Field(output_processor=TakeFirst())
    # Comic name.
    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # Chapter name.
    chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # Image links.
    list_img = Field(serializer=serialize_to_images)
    # Author. The converter must sit under the ``serializer`` key like the
    # other fields; it was previously registered under a misspelled key
    # (``serialize_to_chinese=``) and therefore never looked up.
    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Cover link.
    icon = Field(output_processor=TakeFirst())
    # Tags.
    tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Summary.
    dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Date.
    date = Field(output_processor=TakeFirst())
    # Genre.
    genre = Field(output_processor=TakeFirst())
    # Age rating.
    age_rating = Field(output_processor=TakeFirst())
    images_old = Field(serializer=serialize_to_images)
    images = Field(serializer=serialize_to_images)
    image_urls = Field(serializer=serialize_to_image_urls)
    images_name = Field()
class ImagesItem(Item):
    """Item describing downloaded images; every field is a plain ``Field``."""

    # Per-image attributes.
    image_name = Field()
    image_url = Field()
    image_path = Field()
    # Collections.
    images = Field()
    image_urls = Field()
    # NOTE(review): presumably the owning comic -- confirm against the pipeline.
    comic = Field()
def serializer_info_writer(value):
    """Turn an author string into a comma-separated list of names.

    Both ``&`` and whitespace are treated as separators between names.

    Args:
        value: the raw author value; it is converted with ``str()`` first.

    Returns:
        The names joined with ``","`` (an empty string for empty input).
    """
    # str.replace returns a new string; the result has to be used, otherwise
    # "&" is never treated as a separator.
    normalised = str(value).replace("&", " ")
    # split() without an argument collapses runs of whitespace, so
    # "A & B" does not produce empty entries between the names.
    return ",".join(normalised.split())
# Result_type name
def _serializer_info_imagesa(value, result_type=None):
info = []
for success, img in value:
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
if result_type == 'name':
info.append(ComicPath().getFileScrambleImageSave(img_path,True,False))
else:
info.append(img_path)
if result_type == "len":
value = len(info)
else:
value = info
return value
def _serialize_info_images(value, result_type=None):
images = []
for image in value:
images.append(ComicPath().getFileScrambleImageSave(image,True,False))
if result_type == "count":
return len(images)
else:
return images
def serializer_info_images(value):
    """Return the list ``_serialize_info_images`` produces for ``value``."""
    return _serialize_info_images(value, result_type=None)
def serializer_info_images_completed(value):
    """Return the converted image list for ``value``.

    ``_serialize_info_images`` returns the list for every result type other
    than ``"count"``, so ``'name'`` yields the list.
    """
    return _serialize_info_images(value, result_type='name')


def serializer_info_images_count(value):
    """Return the number of image entries in ``value``.

    This function used to be defined twice; the later definition won and
    requested ``'len'``, which ``_serialize_info_images`` does not recognise,
    so a list came back instead of a count. There is now one definition and
    it asks for ``"count"``.
    """
    return _serialize_info_images(value, result_type="count")
class ComicInfoItem(Item):
    """Metadata fields for the ComicInfo.xml / ComicChapter.json output.

    ``info`` names the source key for a field.
    NOTE(review): the ``info`` values match ``ComicItem`` field names, so
    presumably the value is copied from there -- confirm against the consumer.

    Trailing comments give the field's label followed by a bracketed boolean
    carried over from the original comments; the boolean's meaning is not
    visible in this file.
    """

    Title = Field(info="chapter")  # chapter name [True]
    Series = Field(info="name")  # comic name [True]
    Number = Field(info="index")  # number [True]
    SeriesGroup = Field()  # alias [False]
    Summary = Field(info="dep")  # summary [True]
    Year = Field()  # year [False]
    Month = Field()  # month [False]
    Day = Field()  # day [False]
    Writer = Field(info="author", serializer=serializer_info_writer)  # author [True]
    Publisher = Field()  # publisher [False]
    Genre = Field(info="genre")  # genre [True]
    Tags = Field(info="tags")  # tags [True]
    Web = Field()  # homepage [False]
    # Disabled alternative: PageCount = Field()
    PageCount = Field(info="images", serializer=serializer_info_images_count)  # total page count [True]
    LanguageISO = Field()  # language [True]
    AgeRating = Field(info="age_rating")  # age rating [False]
    # Disabled alternative:
    #   Pages = Field(info='images_name', serializer=serializer_info_images_completed)
    Pages = Field(info="images", serializer=serializer_info_images)  # page numbers [True]
# ComicInfo.xml and ComicChapter.json end