ComicScrapy/Comics/_utils/ComicInfo.py
2024-10-28 00:03:20 +08:00

168 lines
6.7 KiB
Python

import xml.etree.ElementTree as ET
from xml.dom import minidom
from typing import List
import json,os
from lxml import etree
from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
# Define the ComicInfo and ComicPageInfo classes
class ComicInfo:
    """Metadata fields of a ComicInfo document, all initialised to "unset".

    Text fields start as ``""`` and numeric fields as ``-1``. ``Pages`` starts
    as an empty list intended to hold ``ComicPageInfo`` objects. Attributes
    are created in the order listed below, which is also the order in which
    ``__dict__`` exposes them.
    """

    # (attribute name, initial value) pairs, in creation order.
    # "" marks an unset text field, -1 an unset numeric field.
    _INITIAL_VALUES = (
        ("Title", ""),  # title
        ("Series", ""),
        ("Number", ""),
        ("Count", -1),
        ("Volume", -1),
        ("AlternateSeries", ""),
        ("AlternateNumber", ""),
        ("AlternateCount", -1),
        ("Summary", ""),
        ("Notes", ""),
        ("Year", -1),
        ("Month", -1),
        ("Day", -1),
        ("Writer", ""),
        ("Penciller", ""),
        ("Inker", ""),
        ("Colorist", ""),
        ("Letterer", ""),
        ("CoverArtist", ""),
        ("Editor", ""),
        ("Publisher", ""),
        ("Imprint", ""),
        ("Genre", ""),
        ("Tags", ""),
        ("Web", ""),
        ("PageCount", -1),
        ("LanguageISO", ""),
        ("Format", ""),
        ("BlackAndWhite", ""),
        ("Manga", ""),
        ("Characters", ""),
        ("Teams", ""),
        ("Locations", ""),
        ("ScanInformation", ""),
        ("StoryArc", ""),
        ("SeriesGroup", ""),
        ("AgeRating", ""),
    )

    def __init__(self):
        for field_name, initial in self._INITIAL_VALUES:
            setattr(self, field_name, initial)
        # A new list per instance so pages are never shared between comics.
        self.Pages = []
class ComicPageInfo:
    """Per-page metadata for a single page of a comic."""

    # Attributes reported by toString(), in output order.
    _EXPORTED = ("Image", "ImageSize", "ImageWidth", "ImageHeight")

    def __init__(self):
        # -1 (numbers) and "" (text) act as "not set" markers.
        self.Image, self.Type, self.DoublePage = -1, "Story", False
        self.ImageSize, self.Key, self.Bookmark = -1, "", ""
        self.ImageWidth, self.ImageHeight = -1, -1

    def toString(self):
        """Return the page attributes that are set, as a dict of strings.

        Despite the name, the result is a ``dict`` mapping attribute name to
        ``str(value)``. Only Image, ImageSize, ImageWidth and ImageHeight are
        considered, and any of them still equal to ``-1`` or ``""`` is left out.
        """
        exported = {}
        for attr_name in self._EXPORTED:
            current = getattr(self, attr_name)
            if current == -1 or current == "":
                continue
            exported[attr_name] = str(current)
        return exported
class ComicInfoXml:
    """Serialize comic metadata to a ComicInfo XML file and validate it against an XSD."""

    def save_xml_to_file(self, xml_string, filename):
        """Write ``xml_string`` to ``filename`` as UTF-8, creating parent directories.

        Args:
            xml_string: Text to write.
            filename: Destination path. May be a bare file name with no
                directory part, in which case no directory is created.
        """
        base_dir = os.path.dirname(filename)
        # dirname() is "" for a bare file name and os.makedirs("") raises,
        # so only create directories when there is a directory part.
        # exist_ok avoids failing if the directory appears in the meantime.
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(xml_string)

    def validate_xml_with_xsd_file(self, xml_file, xsd_file, remove=True):
        """Validate ``xml_file`` against the schema in ``xsd_file`` and print the outcome.

        Args:
            xml_file: Path of the XML document to check.
            xsd_file: Path of the XSD schema; opened as UTF-8 text.
            remove: When true, delete ``xml_file`` if it fails validation.
        """
        xml_doc = etree.parse(xml_file)
        with open(xsd_file, 'r', encoding="utf-8") as file:
            schema = etree.XMLSchema(etree.parse(file))
        try:
            schema.assertValid(xml_doc)
        except etree.DocumentInvalid as e:
            print("XML is not valid:")
            print(e)
            # NOTE(review): removal is tied to validation failure; removing
            # unconditionally would delete every file right after it is
            # saved. Confirm this matches the intended behaviour.
            if remove:
                os.remove(xml_file)
        else:
            print("XML is valid according to the XSD.")

    def _serialize_comic_info(self, comic):
        """Return ``comic`` rendered as a pretty-printed XML string."""
        # Root element carrying the XML Schema namespace declarations.
        root = ET.Element('ComicInfo')
        root.set('xmlns:xsd', 'http://www.w3.org/2001/XMLSchema')
        root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
        # One child element per attribute, in attribute creation order;
        # values still at their "unset" marker (-1 or "") are skipped.
        for attr, value in comic.__dict__.items():
            if value != -1 and value != '':
                if attr == 'Pages':
                    pages_elem = ET.SubElement(root, 'Pages')
                    for page in value:
                        # Copy only the exported fields into a fresh page
                        # object so its toString() builds the attributes.
                        cpi = ComicPageInfo()
                        cpi.Image = page.Image
                        cpi.ImageSize = page.ImageSize
                        cpi.ImageWidth = page.ImageWidth
                        cpi.ImageHeight = page.ImageHeight
                        ET.SubElement(pages_elem, 'Page', cpi.toString())
                else:
                    ET.SubElement(root, attr).text = str(value)
        # Round-trip through minidom to get indented output with a declaration.
        raw_xml = ET.tostring(root, encoding='utf-8', method='xml')
        pretty_xml = minidom.parseString(raw_xml).toprettyxml(indent=" ", encoding="utf-8")
        return pretty_xml.decode('utf-8')

    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
        """Serialize ``comic`` to XML, save it to disk and validate the saved file.

        Args:
            comic (ComicInfo): Metadata to serialize.
            save_dir (optional): Directory the XML file is written into.
                When None, ``xml_filename`` is used as the path as given.
            xml_filename (str, optional): Name of the XML file to write.
                Defaults to "ComicInfo.xml".
            xsd_filename (str, optional): Path of the XSD schema used for
                validation. Defaults to "ComicInfo.xsd".
        """
        serialized_xml = self._serialize_comic_info(comic)
        # Save the XML data to a file.
        if save_dir is not None:
            xml_filename = os.path.join(save_dir, xml_filename)
        self.save_xml_to_file(serialized_xml, xml_filename)
        self.validate_xml_with_xsd_file(xml_filename, xsd_filename)

    def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=COMIC_INFO_XSD_FILE):
        """Build a ComicInfo from a scraped metadata mapping and write it as XML.

        Args:
            json_data: Mapping read with ``.get()`` for the keys "chapter",
                "name", "author", "age_rating", "tags", "dep", "genre",
                "index", "count" and "images".
            save_dir (optional): Directory passed on to ``parse_comicinfo``.
            xsd_file (optional): Schema path passed on to ``parse_comicinfo``.

        Returns:
            list: One page identifier per image name, in input order.
        """
        comic = ComicInfo()
        comic.Title = json_data.get("chapter", "")
        comic.Series = json_data.get("name", "")
        comic.Writer = json_data.get("author", "")
        comic.AgeRating = json_data.get("age_rating", "")
        comic.Tags = json_data.get("tags", "")
        comic.Summary = json_data.get("dep", "")
        comic.Genre = json_data.get("genre", "")
        comic.Number = json_data.get("index", "")
        comic.PageCount = json_data.get("count", "")
        image_names = json_data.get("images", "")
        pages = []
        # Add one page entry per image.
        for image_name in image_names:
            page = ComicPageInfo()
            # Page identifier: the text before the first "." in the name,
            # keeping only what follows its last "_" (kept as a string).
            page.Image = image_name.split(".")[0].split("_")[-1]
            pages.append(page.Image)
            comic.Pages.append(page)
        self.parse_comicinfo(comic, save_dir=save_dir, xsd_filename=xsd_file)
        return pages