重构ComicInfo

This commit is contained in:
caiwx86 2024-07-18 06:16:28 +08:00
parent d6cb8e4251
commit dca89dccc7
5 changed files with 305 additions and 113 deletions

127
ComicInfo_2.1.xsd Normal file
View File

@ -0,0 +1,127 @@
<?xml version="1.0" encoding="utf-8"?>
<xs:schema elementFormDefault="qualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="ComicInfo" nillable="true" type="ComicInfo"/>
<!-- Content model of the ComicInfo element: the metadata record of one comic.
Every child element is optional (minOccurs="0") and may occur at most once.
The children are declared inside xs:sequence, so the elements that are present
must appear in the order listed below.
Defaults: string elements default to the empty string, integer elements to -1
(PageCount to 0), and the enumerated elements BlackAndWhite, Manga and
AgeRating to "Unknown". -->
<xs:complexType name="ComicInfo">
<xs:sequence>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Title" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Series" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Number" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="-1" name="Count" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="-1" name="Volume" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="AlternateSeries" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="AlternateNumber" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="-1" name="AlternateCount" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Summary" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Notes" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="-1" name="Year" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="-1" name="Month" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="-1" name="Day" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Writer" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Penciller" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Inker" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Colorist" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Letterer" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="CoverArtist" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Editor" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Translator" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Publisher" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Imprint" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Genre" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Tags" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Web" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="0" name="PageCount" type="xs:int"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="LanguageISO" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Format" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="Unknown" name="BlackAndWhite" type="YesNo"/>
<xs:element minOccurs="0" maxOccurs="1" default="Unknown" name="Manga" type="Manga"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Characters" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Teams" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Locations" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="ScanInformation" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="StoryArc" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="StoryArcNumber" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="SeriesGroup" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="Unknown" name="AgeRating" type="AgeRating"/>
<!-- Pages and CommunityRating are the only children declared without a default. -->
<xs:element minOccurs="0" maxOccurs="1" name="Pages" type="ArrayOfComicPageInfo"/>
<xs:element minOccurs="0" maxOccurs="1" name="CommunityRating" type="Rating"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="MainCharacterOrTeam" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="Review" type="xs:string"/>
<xs:element minOccurs="0" maxOccurs="1" default="" name="GTIN" type="xs:string"/>
</xs:sequence>
</xs:complexType>
<!-- Three-state flag: "Unknown", "No" or "Yes". Used as the type of the BlackAndWhite element. -->
<xs:simpleType name="YesNo">
<xs:restriction base="xs:string">
<xs:enumeration value="Unknown"/>
<xs:enumeration value="No"/>
<xs:enumeration value="Yes"/>
</xs:restriction>
</xs:simpleType>
<!-- Same three values as YesNo plus "YesAndRightToLeft". Used as the type of the Manga element. -->
<xs:simpleType name="Manga">
<xs:restriction base="xs:string">
<xs:enumeration value="Unknown"/>
<xs:enumeration value="No"/>
<xs:enumeration value="Yes"/>
<xs:enumeration value="YesAndRightToLeft"/>
</xs:restriction>
</xs:simpleType>
<!-- Decimal rating from 0 to 5 inclusive, with at most one fractional digit.
Used as the type of the CommunityRating element. -->
<xs:simpleType name="Rating">
<xs:restriction base="xs:decimal">
<xs:minInclusive value="0"/>
<xs:maxInclusive value="5"/>
<xs:fractionDigits value="1"/>
</xs:restriction>
</xs:simpleType>
<!-- Closed set of age-rating labels accepted by the AgeRating element.
Values are matched as exact strings; any other text fails validation. -->
<xs:simpleType name="AgeRating">
<xs:restriction base="xs:string">
<xs:enumeration value="Unknown"/>
<xs:enumeration value="Adults Only 18+"/>
<xs:enumeration value="Early Childhood"/>
<xs:enumeration value="Everyone"/>
<xs:enumeration value="Everyone 10+"/>
<xs:enumeration value="G"/>
<xs:enumeration value="Kids to Adults"/>
<xs:enumeration value="M"/>
<xs:enumeration value="MA15+"/>
<xs:enumeration value="Mature 17+"/>
<xs:enumeration value="PG"/>
<xs:enumeration value="R18+"/>
<xs:enumeration value="Rating Pending"/>
<xs:enumeration value="Teen"/>
<xs:enumeration value="X18+"/>
</xs:restriction>
</xs:simpleType>
<!-- Container type of the Pages element: zero or more Page children, each a ComicPageInfo. -->
<xs:complexType name="ArrayOfComicPageInfo">
<xs:sequence>
<xs:element minOccurs="0" maxOccurs="unbounded" name="Page" nillable="true" type="ComicPageInfo"/>
</xs:sequence>
</xs:complexType>
<!-- Description of one page, carried entirely in attributes (no child elements).
Only Image is required; every other attribute is optional and has the default shown. -->
<xs:complexType name="ComicPageInfo">
<xs:attribute name="Image" type="xs:int" use="required"/>
<xs:attribute default="Story" name="Type" type="ComicPageType"/>
<xs:attribute default="false" name="DoublePage" type="xs:boolean"/>
<xs:attribute default="0" name="ImageSize" type="xs:long"/>
<xs:attribute default="" name="Key" type="xs:string"/>
<xs:attribute default="" name="Bookmark" type="xs:string"/>
<xs:attribute default="-1" name="ImageWidth" type="xs:int"/>
<xs:attribute default="-1" name="ImageHeight" type="xs:int"/>
</xs:complexType>
<!-- Type of the Page "Type" attribute. It is an xs:list, so the attribute value is a
whitespace-separated list of tokens, each of which must be one of the values below. -->
<xs:simpleType name="ComicPageType">
<xs:list>
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:enumeration value="FrontCover"/>
<xs:enumeration value="InnerCover"/>
<xs:enumeration value="Roundup"/>
<xs:enumeration value="Story"/>
<xs:enumeration value="Advertisement"/>
<xs:enumeration value="Editorial"/>
<xs:enumeration value="Letters"/>
<xs:enumeration value="Preview"/>
<xs:enumeration value="BackCover"/>
<xs:enumeration value="Other"/>
<xs:enumeration value="Deleted"/>
</xs:restriction>
</xs:simpleType>
</xs:list>
</xs:simpleType>
</xs:schema>

168
Comics/_utils/ComicInfo.py Normal file
View File

@ -0,0 +1,168 @@
import xml.etree.ElementTree as ET
from xml.dom import minidom
from typing import List
import json,os
from lxml import etree
# Define the ComicInfo and ComicPageInfo classes
class ComicInfo:
    """In-memory model of one ComicInfo.xml metadata record.

    ComicInfoXml serialises an instance by iterating ``__dict__``, so the order
    in which attributes are assigned here is the order of the elements in the
    generated XML. ComicInfo_2.1.xsd declares the elements inside an
    ``xs:sequence``; the assignments below therefore follow the schema order
    and must not be rearranged.

    ``-1`` (numbers) and ``""`` (strings) mean "not set": the serializer skips
    attributes holding either value.
    """

    def __init__(self):
        self.Title: str = ""  # title
        self.Series: str = ""
        self.Number: str = ""
        self.Count: int = -1
        self.Volume: int = -1
        self.AlternateSeries: str = ""
        self.AlternateNumber: str = ""
        self.AlternateCount: int = -1
        self.Summary: str = ""
        self.Notes: str = ""
        self.Year: int = -1
        self.Month: int = -1
        self.Day: int = -1
        self.Writer: str = ""
        self.Penciller: str = ""
        self.Inker: str = ""
        self.Colorist: str = ""
        self.Letterer: str = ""
        self.CoverArtist: str = ""
        self.Editor: str = ""
        self.Translator: str = ""
        self.Publisher: str = ""
        self.Imprint: str = ""
        self.Genre: str = ""
        self.Tags: str = ""
        self.Web: str = ""
        self.PageCount: int = -1
        self.LanguageISO: str = ""
        self.Format: str = ""
        self.BlackAndWhite: str = ""
        self.Manga: str = ""
        self.Characters: str = ""
        self.Teams: str = ""
        self.Locations: str = ""
        self.ScanInformation: str = ""
        self.StoryArc: str = ""
        self.StoryArcNumber: str = ""
        self.SeriesGroup: str = ""
        self.AgeRating: str = ""
        # A fresh list per instance; never share one list between comics.
        self.Pages: List[ComicPageInfo] = []
        # The schema places the remaining elements after <Pages>.
        self.CommunityRating: float = -1  # schema range is 0..5, so -1 is a safe "unset"
        self.MainCharacterOrTeam: str = ""
        self.Review: str = ""
        self.GTIN: str = ""
class ComicPageInfo:
    """Metadata describing a single page image of a comic."""

    def __init__(self):
        # -1 and "" mark a value as "not set"; toString() leaves such values out.
        self.Image: int = -1
        self.Type: str = "Story"
        self.DoublePage: bool = False
        self.ImageSize: int = -1
        self.Key: str = ""
        self.Bookmark: str = ""
        self.ImageWidth: int = -1
        self.ImageHeight: int = -1

    def toString(self):
        """Return the page's exportable attributes as a dict of strings.

        Despite its name this returns a ``dict``, not a ``str``. Only Image,
        ImageSize, ImageWidth and ImageHeight are considered, in that order,
        and each is included only when it is neither ``-1`` nor ``""``.
        """
        exported = ("Image", "ImageSize", "ImageWidth", "ImageHeight")
        candidates = ((name, getattr(self, name)) for name in exported)
        return {
            name: str(current)
            for name, current in candidates
            if current != -1 and current != ""
        }
class ComicInfoXml:
    """Serialise ComicInfo objects to XML, save them and validate them against the XSD."""

    def save_xml_to_file(self, xml_string, filename):
        """Write ``xml_string`` to ``filename`` as UTF-8, creating parent directories.

        Args:
            xml_string: The text to write.
            filename: Destination path. A bare file name (no directory part) is
                written relative to the current working directory.
        """
        base_dir = os.path.dirname(filename)
        # dirname() is "" for a bare file name and os.makedirs("") raises
        # FileNotFoundError, so only create directories when there is one.
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(xml_string)

    def validate_xml_with_xsd_file(self, xml_file, xsd_file, remove=True):
        """Validate ``xml_file`` against the schema in ``xsd_file``.

        Args:
            xml_file: Path of the XML document to check.
            xsd_file: Path of the XSD schema.
            remove: When true, delete ``xml_file`` if it fails validation.

        Returns:
            True when the document is valid, False otherwise.

        Raises:
            Whatever lxml raises when either file cannot be read or parsed;
            only ``DocumentInvalid`` is handled here.
        """
        xml_doc = etree.parse(xml_file)
        # Hand lxml the path rather than a text-mode file object: lxml expects
        # binary input so it can honour the document's own encoding declaration.
        xsd_doc = etree.XMLSchema(etree.parse(xsd_file))
        try:
            xsd_doc.assertValid(xml_doc)
        except etree.DocumentInvalid as e:
            print("XML is not valid:")
            print(e)
            if remove:
                os.remove(xml_file)
            return False
        print("XML is valid according to the XSD.")
        return True

    @staticmethod
    def _serialize_comic_info(comic: "ComicInfo") -> str:
        """Return ``comic`` as a pretty-printed XML document string."""
        comic_elem = ET.Element('ComicInfo')
        comic_elem.set('xmlns:xsd', 'http://www.w3.org/2001/XMLSchema')
        comic_elem.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
        # Attribute order in __dict__ becomes element order in the document.
        for attr, value in vars(comic).items():
            # -1 and "" mean "not set": such attributes produce no element.
            if value != -1 and value != '':
                if attr == 'Pages':
                    pages_elem = ET.SubElement(comic_elem, 'Pages')
                    for page in value:
                        # toString() returns the attribute dict for <Page>.
                        ET.SubElement(pages_elem, 'Page', page.toString())
                else:
                    ET.SubElement(comic_elem, attr).text = str(value)
        xml_str = ET.tostring(comic_elem, encoding='utf-8', method='xml')
        # Round-trip through minidom purely to get indented output.
        formatted_xml = minidom.parseString(xml_str).toprettyxml(indent=" ", encoding="utf-8")
        return formatted_xml.decode('utf-8')

    def parse_comicinfo(self, comic: "ComicInfo", save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo_2.1.xsd"):
        """Serialise ``comic``, print it, write it to disk and validate it.

        Args:
            comic: The metadata to serialise.
            save_dir: Directory that receives the XML file. When None,
                ``xml_filename`` is used as given.
            xml_filename: Name of the XML file to write.
            xsd_filename: Path of the schema used for validation; a relative
                path is resolved against the current working directory.

        Returns:
            True when the written file validates, False otherwise (in which
            case the file has been removed again).
        """
        serialized_xml = self._serialize_comic_info(comic)
        print(serialized_xml)
        # Save the XML to a file.
        if save_dir is not None:
            xml_filename = os.path.join(save_dir, xml_filename)
        self.save_xml_to_file(serialized_xml, xml_filename)
        return self.validate_xml_with_xsd_file(xml_filename, xsd_filename)

    def scrapy_xml_by_json(self, json_data, save_dir=None):
        """Build a ComicInfo from scraped item data and write its ComicInfo.xml.

        Args:
            json_data: Mapping-like object with a ``get`` method. The keys
                read are chapter, name, author, age_rating, tags, dep, genre,
                index, count and images.
            save_dir: Directory that receives ComicInfo.xml.

        Returns:
            The list of page identifiers, one per entry of ``images``, in
            input order.
        """
        comic = ComicInfo()
        comic.Title = json_data.get("chapter", "")
        comic.Series = json_data.get("name", "")
        comic.Writer = json_data.get("author", "")
        comic.AgeRating = json_data.get("age_rating", "")
        comic.Tags = json_data.get("tags", "")
        comic.Summary = json_data.get("dep", "")
        comic.Genre = json_data.get("genre", "")
        comic.Number = json_data.get("index", "")
        comic.PageCount = json_data.get("count", "")
        # A missing or None "images" entry means "no pages" rather than a crash.
        image_names = json_data.get("images") or []
        pages = []
        for image_name in image_names:
            page = ComicPageInfo()
            # Page id = text before the first "." and after the last "_".
            page.Image = image_name.split(".")[0].split("_")[-1]
            pages.append(page.Image)
            comic.Pages.append(page)
        self.parse_comicinfo(comic, save_dir=save_dir)
        return pages

View File

@ -1,14 +1,6 @@
import os.path,json,ast import os.path
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter from scrapy.exporters import PythonItemExporter
from scrapy.exporters import JsonItemExporter from scrapy.exporters import JsonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
from Comics.utils import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter
class CommonExporter(): class CommonExporter():
def getPath(self, file , sufix=None): def getPath(self, file , sufix=None):
@ -47,83 +39,3 @@ class JsonExport(JsonItemExporter):
self.file.close() self.file.close()
if if_return: if if_return:
return ItemExporter().export_obj(json_object) return ItemExporter().export_obj(json_object)
class ComicInfoXmlItemExporter(XmlItemExporter):
    """XML item exporter that writes one comic item as a ``ComicInfo.xml`` file.

    The output file is opened in ``__init__`` and closed in
    ``finish_exporting``; ``export_xml`` runs the full start / export / finish
    sequence for a single item.
    """

    # Name of the XML root element; also the base name of the output file.
    custom_root_element = "ComicInfo"

    def __init__(self, dir):
        """Open ``COMIC_INFO_XML_STORE/<dir>/ComicInfo.xml`` for binary writing.

        Args:
            dir: Sub-directory below COMIC_INFO_XML_STORE that receives the
                file; it is created when it does not exist.
        """
        file_path = os.path.join(COMIC_INFO_XML_STORE, dir,
                                 f"{self.custom_root_element}.xml")
        dir_path = os.path.dirname(file_path)
        if not os.path.exists(dir_path): os.makedirs(dir_path)
        # Kept on the instance so finish_exporting() can close it.
        self.xml_file = open(file_path, "wb")
        super(ComicInfoXmlItemExporter, self).__init__(self.xml_file,
                                                       root_element=self.custom_root_element,
                                                       indent=1,fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)

    def serialize_field(self, field, name, value):
        """Pass ``value`` through ComicPath.chinese_convert, then serialise it as usual."""
        # Convert the value before handing it to the parent serializer.
        value = ComicPath.chinese_convert(value)
        return super().serialize_field(field, name, value)

    def start_exporting(self):
        """Start the document and open the root element (with no attributes)."""
        # self.xg is not created in this class; presumably the XML generator
        # set up by XmlItemExporter.__init__ - confirm against scrapy.
        self.xg.startDocument()
        self.xg.startElement(self.custom_root_element, {})

    def comic_to_info_item(self, comic_item):
        """Translate a comic item into ComicInfoItem form.

        Each ComicInfoItem field whose metadata has an ``info`` entry is fed
        from the comic-item key named by that entry. The resulting
        ComicInfoItem is returned after passing through
        ``ItemExporter().export_obj``.
        """
        comic_info = {}
        info_item = ItemAdapter(ComicInfoItem())
        # Maps comic-item key -> ComicInfoItem field name.
        comic_info_dict = {}
        for field in info_item.field_names():
            meta_info = info_item.get_field_meta(field).get('info')
            if meta_info is not None:
                comic_info_dict[meta_info] = field
        # Copy over only the keys that have a mapping.
        for key, value in ComicItem(comic_item).items():
            new_key = comic_info_dict.get(key)
            if new_key is not None:
                comic_info[new_key] = value
        return ItemExporter().export_obj(ComicInfoItem(comic_info))

    def export_item(self, item):
        """Write the fields of ``item`` as child elements of the root element.

        Returns:
            The converted item produced by ``comic_to_info_item``.
        """
        comic_info = self.comic_to_info_item(item)
        child_element = "Page"
        self._beautify_indent(depth=1)
        self._beautify_newline()
        for name, value in self._get_serialized_fields(comic_info, default_value=""):
            if name == "Pages":
                # The serialised Pages value is a Python-literal string; turn
                # it back into an object so its entries are written one by one.
                value = ast.literal_eval(value)
            # NOTE(review): this condition is always true (a value cannot be
            # both None and ""), so empty fields are exported too; `and` was
            # probably intended - confirm before changing.
            if value is not None or value != "":
                self._export_xml_field(name, value, depth=2, child_element=child_element)
        #self._beautify_indent(depth=1)
        return comic_info

    def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
        """Write one element named ``name``, recursing into mappings and list-likes.

        Args:
            name: Element name.
            serialized_value: Mapping, list-like, string or any other value
                (written via ``str``).
            depth: Indentation depth of this element.
            child_element: Element name used for each entry of a list-like value.
        """
        self._beautify_indent(depth=depth)
        self.xg.startElement(name, {})
        if hasattr(serialized_value, "items"):
            self._beautify_newline()
            # child_element is not forwarded here, so lists nested inside a
            # mapping fall back to the default entry name "value".
            for sub_name, value in serialized_value.items():
                self._export_xml_field(sub_name, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            for value in serialized_value:
                self._export_xml_field(child_element, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif isinstance(serialized_value, str):
            self.xg.characters(serialized_value)
        else:
            self.xg.characters(str(serialized_value))
        self.xg.endElement(name)
        self._beautify_newline()

    def finish_exporting(self):
        """Close the root element, end the document and close the output file."""
        self.xg.endElement(self.custom_root_element)
        self.xg.endDocument()
        self.xml_file.close()

    def export_xml(self, item):
        """Export ``item`` as a complete document and return the converted item."""
        self.start_exporting()
        comic_info = self.export_item(item)
        self.finish_exporting()
        return comic_info

View File

@ -9,12 +9,12 @@ import os,scrapy,logging
from Comics import settings from Comics import settings
from Comics.items import ComicItem from Comics.items import ComicItem
from Comics.loader import ComicLoader from Comics.loader import ComicLoader
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.utils import CBZUtils,fileUtils as fu from Comics.utils import CBZUtils,fileUtils as fu
from Comics.utils import ComicPath from Comics.utils import ComicPath
from Comics.utils import checkUtils from Comics.utils import checkUtils
from Comics.exporters import JsonExport,ItemExporter from Comics.exporters import JsonExport,ItemExporter
from scrapy.pipelines.images import ImagesPipeline from scrapy.pipelines.images import ImagesPipeline
from Comics._utils.ComicInfo import ComicInfoXml
class ComicsPipeline(): class ComicsPipeline():
@ -123,10 +123,14 @@ class ImgDownloadPipeline(BaseImagesPipeline):
self.pack_icon(item) self.pack_icon(item)
else: else:
# ComicInfoXml 生成 # ComicInfoXml 生成
comic_info = ComicInfoXmlItemExporter(dir=super().get_file_path(item=item, result_type="comic_info")).export_xml(item) #comic_info = ComicInfoXmlItemExporter(dir=super().get_file_path(item=item, result_type="comic_info")).export_xml(item)
comic_pages = ComicInfoXml().scrapy_xml_by_json(item, save_dir=super().get_file_path(item=item, result_type="images_dir"))
#if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"),
# dts_path= cbz_path,
# comic_info_images= comic_info['Pages'], remove=True):
if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"), if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"),
dts_path= cbz_path, dts_path= cbz_path,
comic_info_images= comic_info['Pages'], remove=True): comic_info_images= comic_pages, remove=True):
super().update_icon(item) super().update_icon(item)
self.pack_icon(item) self.pack_icon(item)
# CBZ校验失败 # CBZ校验失败

View File

@ -126,22 +126,3 @@ LOG_STDOUT = True # 标准化输出
CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"CBZ") CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"CBZ")
#数据导出类 排序 #数据导出类 排序
COMIC_INFO_XML_FILE = "ComicInfo.xml" COMIC_INFO_XML_FILE = "ComicInfo.xml"
# ComicInfo field names handed to ComicInfoXmlItemExporter as `fields_to_export`.
# Presumably the list order is also the element order in the output - confirm
# against scrapy's handling of fields_to_export.
COMIC_INFO_FIELDS_TO_EXPORT = [
    "Title",
    "Series",
    "Number",
    "SeriesGroup",
    "Summary",
    "Year",
    "Month",
    "Day",
    "Writer",
    "Publisher",
    "Genre",
    "Tags",
    "Web",
    "PageCount",
    "LanguageISO",
    "AgeRating",
    "Pages"
]