152 lines
5.5 KiB
Python
152 lines
5.5 KiB
Python
# Define here the models for your scraped items
|
|
#
|
|
# See documentation in:
|
|
# https://docs.org/en/latest/topics/items.html
|
|
import os,Comics.settings as settings,logging
|
|
from scrapy.item import Item, Field
|
|
from Comics.utils.Constant import ComicPath
|
|
from Comics.utils.FileUtils import imageUtils
|
|
from scrapy.loader.processors import TakeFirst, MapCompose, Join
|
|
|
|
def serialize_to_chinese(value):
    """Serialize ``value`` by running it through ``ComicPath.chinese_convert``.

    Returns whatever ``ComicPath.chinese_convert`` returns for ``value``.
    """
    converted = ComicPath.chinese_convert(value)
    return converted
|
|
|
|
def serialize_to_fix_file(value):
    """Serialize ``value`` into a name cleaned by ``ComicPath.fix_file_name``.

    The value is first passed through ``ComicPath.chinese_convert`` and the
    converted result is then handed to ``ComicPath.fix_file_name``.
    """
    return ComicPath.fix_file_name(ComicPath.chinese_convert(value))
|
|
|
|
def _serialize_to_images(value, result_type=None):
|
|
count = 1
|
|
images_item = []
|
|
image_urls = []
|
|
for image in value:
|
|
(image_src, scramble) = [image.get("src"), image.get("scramble")]
|
|
count_image = settings.IMAGES_NAME_FORMAT.format(count)
|
|
suffix = "."+str(image_src).split(".")[-1]
|
|
image_name = count_image + suffix
|
|
if scramble:
|
|
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
|
|
blocks_num = imageUtils.encodeImage(de_str)
|
|
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
|
|
#images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name))
|
|
images_item.append(image_name)
|
|
image_urls.append(image_src)
|
|
count += 1
|
|
logging.info(f"images_len: {len(images_item)}")
|
|
if result_type == "image_urls": return image_urls
|
|
else: return images_item
|
|
|
|
def serialize_to_images(value):
    """Return the generated image file names for ``value``.

    Delegates to ``_serialize_to_images`` with its default ``result_type``.
    """
    return _serialize_to_images(value)
|
|
|
|
|
|
def serialize_to_image_urls(value):
    """Return the image source URLs for ``value``.

    Delegates to ``_serialize_to_images`` asking for the ``"image_urls"``
    result.
    """
    return _serialize_to_images(value, result_type="image_urls")
|
|
|
|
|
|
class ListComicItem(Item):
    """Item with two plain fields, ``name`` and ``link``."""

    name = Field()
    link = Field()
|
|
|
|
|
|
class ComicItem(Item):
    """Item describing one comic chapter and its images.

    Fields declared with ``output_processor=TakeFirst()`` are meant to hold a
    single value; fields with a ``serializer`` carry the function used to
    convert the value when the item is serialized.
    """

    # Index / number
    index = Field(output_processor=TakeFirst())
    # Comic name
    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # Chapter name
    chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # Image links
    list_img = Field(serializer=serialize_to_images)
    # Author
    # The metadata key must be ``serializer`` (as on every sibling field); the
    # previous ``serialize_to_chinese=`` key meant no serializer was registered.
    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Cover link
    icon = Field(output_processor=TakeFirst())
    # Tags
    tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Summary
    dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # Date / time
    date = Field(output_processor=TakeFirst())
    # Genre
    genre = Field(output_processor=TakeFirst())
    # Age rating
    age_rating = Field(output_processor=TakeFirst())

    # Image-related fields; the first two serialize to generated file names,
    # ``image_urls`` serializes to the source URLs.
    images_old = Field(serializer=serialize_to_images)
    images = Field(serializer=serialize_to_images)
    image_urls = Field(serializer=serialize_to_image_urls)
    images_name = Field()
|
|
|
|
class ImagesItem(Item):
    """Item carrying image information; all fields are plain ``Field()``s."""

    # Per-image attributes
    image_name = Field()
    image_url = Field()
    image_path = Field()

    # Collections of images / URLs
    images = Field()
    image_urls = Field()

    # NOTE(review): presumably the comic these images belong to -- confirm.
    comic = Field()
|
|
|
|
def serializer_info_writer(value):
    """Turn a writer string into a comma-separated list of names.

    ``&`` and whitespace are both treated as separators, so ``"a & b"``
    becomes ``"a,b"``.

    Args:
        value: the writer value; it is converted with ``str()`` first.

    Returns:
        The separated parts joined with ``","``.
    """
    # str.replace returns a new string; the result must be kept, otherwise
    # "&" is never treated as a separator.
    normalized = str(value).replace("&", " ")
    # split() without arguments collapses runs of whitespace, so "a & b"
    # does not produce empty entries between the names.
    return ",".join(normalized.split())
|
|
|
|
# result_type: 'name' -> converted image file names, 'len' -> number of images, anything else -> joined image paths
|
|
def _serializer_info_imagesa(value, result_type=None):
|
|
info = []
|
|
for success, img in value:
|
|
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
|
|
if result_type == 'name':
|
|
info.append(ComicPath().getFileScrambleImageSave(img_path,True,False))
|
|
else:
|
|
info.append(img_path)
|
|
if result_type == "len":
|
|
value = len(info)
|
|
else:
|
|
value = info
|
|
return value
|
|
|
|
def _serialize_info_images(value, result_type=None):
|
|
images = []
|
|
for image in value:
|
|
images.append(ComicPath().getFileScrambleImageSave(image,True,False))
|
|
if result_type == "count":
|
|
return len(images)
|
|
else:
|
|
return images
|
|
|
|
|
|
def serializer_info_images(value):
    """Return the converted image list for ``value``.

    Delegates to ``_serialize_info_images`` with its default ``result_type``.
    """
    return _serialize_info_images(value)
|
|
|
|
def serializer_info_images_count(value):
    """Return the number of images in ``value``.

    Delegates to ``_serialize_info_images`` with the ``"count"`` result type.

    NOTE(review): a later definition with the same name in this module
    rebinds ``serializer_info_images_count``, so this one is shadowed.
    """
    return _serialize_info_images(value, "count")
|
|
|
|
def serializer_info_images_completed(value):
    """Return the converted image list for ``value``.

    Passes ``'name'`` as the result type to ``_serialize_info_images``; that
    helper only special-cases ``"count"``, so the converted list is returned.
    """
    return _serialize_info_images(value, 'name')
|
|
|
|
def serializer_info_images_count(value):
    """Return the number of images in ``value``.

    Args:
        value: iterable of images, forwarded to ``_serialize_info_images``.

    Returns:
        The image count as an ``int``.
    """
    # ``_serialize_info_images`` only recognises "count"; the previous 'len'
    # fell through to the default branch and returned the image list instead
    # of a number.
    return _serialize_info_images(value, result_type="count")
|
|
|
|
|
|
class ComicInfoItem(Item):
    """Item whose fields mirror ComicInfo-style metadata entries.

    NOTE(review): the ``info`` metadata appears to name the ``ComicItem``
    field each value is taken from -- confirm against the consumer. The
    True/False flag in each trailing comment is carried over from the
    original notes; its meaning (presumably "required") is not shown here.
    """

    Title = Field(info='chapter')  # chapter name, True
    Series = Field(info='name')  # comic name, True
    Number = Field(info='index')  # number, True
    SeriesGroup = Field()  # alias, False
    Summary = Field(info='dep')  # summary, True
    Year = Field()  # year, False
    Month = Field()  # month, False
    Day = Field()  # day, False
    Writer = Field(info='author', serializer=serializer_info_writer)  # author, True
    Publisher = Field()  # publisher, False
    Genre = Field(info='genre')  # genre, True
    Tags = Field(info='tags')  # tags, True
    Web = Field()  # homepage, False
    PageCount = Field(info='images', serializer=serializer_info_images_count)  # total pages, True
    LanguageISO = Field()  # language, True
    AgeRating = Field(info='age_rating')  # age rating, False
    # NOTE: an earlier variant read Pages from 'images_name' using
    # serializer_info_images_completed.
    Pages = Field(info='images', serializer=serializer_info_images)  # page numbers, True
|
|
# ComicInfo.xml and ComicChapter.json end
|
|
|