168 lines
6.7 KiB
Python
168 lines
6.7 KiB
Python
import xml.etree.ElementTree as ET
|
|
from xml.dom import minidom
|
|
from typing import List
|
|
import json,os
|
|
from lxml import etree
|
|
from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
|
|
|
|
# Define the ComicInfo and ComicPageInfo classes
|
|
class ComicInfo:
    """Plain container for the metadata fields of one ComicInfo.xml document.

    Every field starts out "unset": ``""`` for text fields and ``-1`` for
    integer fields. ``ComicInfoXml.parse_comicinfo`` iterates the instance
    ``__dict__`` in insertion order and skips unset values, so the order of
    the assignments in ``__init__`` is the order of the child elements in the
    generated XML. Keep that order when adding or moving fields.
    """

    def __init__(self):
        # Sentinels meaning "no value supplied"; the serializer omits them.
        unset_text = ""
        unset_number = -1

        # --- Identification -------------------------------------------------
        # Title
        self.Title: str = unset_text
        self.Series: str = unset_text
        self.Number: str = unset_text
        self.Count: int = unset_number
        self.Volume: int = unset_number
        self.AlternateSeries: str = unset_text
        self.AlternateNumber: str = unset_text
        self.AlternateCount: int = unset_number

        # --- Free text ------------------------------------------------------
        self.Summary: str = unset_text
        self.Notes: str = unset_text

        # --- Date -----------------------------------------------------------
        self.Year: int = unset_number
        self.Month: int = unset_number
        self.Day: int = unset_number

        # --- Credits --------------------------------------------------------
        self.Writer: str = unset_text
        self.Penciller: str = unset_text
        self.Inker: str = unset_text
        self.Colorist: str = unset_text
        self.Letterer: str = unset_text
        self.CoverArtist: str = unset_text
        self.Editor: str = unset_text
        self.Publisher: str = unset_text
        self.Imprint: str = unset_text

        # --- Classification -------------------------------------------------
        self.Genre: str = unset_text
        self.Tags: str = unset_text
        self.Web: str = unset_text
        self.PageCount: int = unset_number
        self.LanguageISO: str = unset_text
        self.Format: str = unset_text
        self.BlackAndWhite: str = unset_text
        self.Manga: str = unset_text

        # --- Story details --------------------------------------------------
        self.Characters: str = unset_text
        self.Teams: str = unset_text
        self.Locations: str = unset_text
        self.ScanInformation: str = unset_text
        self.StoryArc: str = unset_text
        self.SeriesGroup: str = unset_text
        self.AgeRating: str = unset_text

        # --- Pages ----------------------------------------------------------
        # A fresh list per instance; entries are ComicPageInfo objects.
        self.Pages: List[ComicPageInfo] = []
|
|
|
|
class ComicPageInfo:
    """Per-page metadata for one image of a comic.

    Integer fields default to ``-1`` and text fields to ``""``; both values
    are treated as "not set" by :meth:`toString`.
    """

    def __init__(self):
        self.Image: int = -1
        self.Type: str = "Story"
        self.DoublePage: bool = False
        self.ImageSize: int = -1
        self.Key: str = ""
        self.Bookmark: str = ""
        self.ImageWidth: int = -1
        self.ImageHeight: int = -1

    def toString(self):
        """Return the page's XML attributes as a ``{name: text}`` dict.

        Despite the name, the result is a dict, not a string. Only ``Image``,
        ``ImageSize``, ``ImageWidth`` and ``ImageHeight`` are considered, in
        that order; a field whose value is ``-1`` or ``""`` is left out. All
        other values are converted with ``str()``.
        """
        candidates = (
            ("Image", self.Image),
            ("ImageSize", self.ImageSize),
            ("ImageWidth", self.ImageWidth),
            ("ImageHeight", self.ImageHeight),
        )
        return {
            name: str(raw)
            for name, raw in candidates
            if raw != -1 and raw != ""
        }
|
|
|
|
class ComicInfoXml:
    """Serialize ComicInfo objects to XML files and validate them against an XSD."""

    def save_xml_to_file(self, xml_string, filename):
        """Write *xml_string* to *filename* as UTF-8.

        Missing parent directories are created first.

        Args:
            xml_string: Text content to write.
            filename: Destination path; may be a bare file name.
        """
        base_dir = os.path.dirname(filename)
        # dirname() is "" for a bare file name and os.makedirs("") raises,
        # so only create directories when there is a directory component.
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(xml_string)

    def validate_xml_with_xsd_file(self, xml_file, xsd_file, remove=True):
        """Validate *xml_file* against the schema in *xsd_file*.

        The outcome is reported with ``print``; nothing is returned and a
        validation failure is not raised to the caller.

        Args:
            xml_file: Path of the XML document to check.
            xsd_file: Path of the XSD schema, read as UTF-8.
            remove: When true, delete *xml_file* if it fails validation.
        """
        xml_doc = etree.parse(xml_file)
        with open(xsd_file, 'r', encoding="utf-8") as file:
            xsd_doc = etree.XMLSchema(etree.parse(file))
        try:
            xsd_doc.assertValid(xml_doc)
        except etree.DocumentInvalid as e:
            print("XML is not valid:")
            print(e)
            # NOTE(review): the file is deleted only when validation fails;
            # confirm that valid files are indeed meant to be kept on disk.
            if remove:
                os.remove(xml_file)
        else:
            print("XML is valid according to the XSD.")

    def _serialize_comic_info(self, comic):
        """Return *comic* as a pretty-printed XML string with an XML declaration."""
        root = ET.Element('ComicInfo')
        root.set('xmlns:xsd', 'http://www.w3.org/2001/XMLSchema')
        root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')

        # Child elements are emitted in attribute insertion order.
        for attr, value in comic.__dict__.items():
            # -1 and "" mark unset fields; None would otherwise be written
            # out as the literal text "None".
            if value is None or value == -1 or value == '':
                continue
            if attr == 'Pages':
                pages_elem = ET.SubElement(root, 'Pages')
                for page in value:
                    # Only these four attributes are read from the page
                    # object; toString() drops the ones that are unset.
                    cpi = ComicPageInfo()
                    cpi.Image = page.Image
                    cpi.ImageSize = page.ImageSize
                    cpi.ImageWidth = page.ImageWidth
                    cpi.ImageHeight = page.ImageHeight
                    ET.SubElement(pages_elem, 'Page', cpi.toString())
            else:
                ET.SubElement(root, attr).text = str(value)

        # Round-trip through minidom to get indentation and the XML declaration.
        raw_xml = ET.tostring(root, encoding='utf-8', method='xml')
        pretty_bytes = minidom.parseString(raw_xml).toprettyxml(indent=" ", encoding="utf-8")
        return pretty_bytes.decode('utf-8')

    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
        """Serialize *comic* to an XML file and validate that file.

        Args:
            comic (ComicInfo): Metadata to serialize. Fields equal to ``-1``,
                ``""`` or ``None`` are omitted from the output.
            save_dir (str, optional): Directory the XML file is written to.
                Defaults to None, in which case ``xml_filename`` is used as
                the path as given.
            xml_filename (str, optional): Name of the XML file. Defaults to
                "ComicInfo.xml".
            xsd_filename (str, optional): Path of the XSD schema used for
                validation. Defaults to "ComicInfo.xsd".
        """
        serialized_xml = self._serialize_comic_info(comic)

        # Save the XML data to a file, then validate the file that was written.
        if save_dir is not None:
            xml_filename = os.path.join(save_dir, xml_filename)
        self.save_xml_to_file(serialized_xml, xml_filename)
        self.validate_xml_with_xsd_file(xml_filename, xsd_filename)

    def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=COMIC_INFO_XSD_FILE):
        """Build a ComicInfo from scraped *json_data*, write it and validate it.

        Keys read from *json_data* (all optional): ``chapter`` -> Title,
        ``name`` -> Series, ``author`` -> Writer, ``age_rating`` -> AgeRating,
        ``tags`` -> Tags, ``dep`` -> Summary, ``genre`` -> Genre,
        ``index`` -> Number, ``count`` -> PageCount, and ``images``, an
        iterable of image file names that become the page list.

        Args:
            json_data: Mapping with the keys described above.
            save_dir: Directory for the generated XML file; see
                ``parse_comicinfo``.
            xsd_file: Path of the XSD schema used for validation.

        Returns:
            list: The page identifier derived from each image name, in order.
        """
        comic = ComicInfo()
        comic.Title = json_data.get("chapter", "")
        comic.Series = json_data.get("name", "")
        comic.Writer = json_data.get("author", "")
        comic.AgeRating = json_data.get("age_rating", "")
        comic.Tags = json_data.get("tags", "")
        comic.Summary = json_data.get("dep", "")
        comic.Genre = json_data.get("genre", "")
        comic.Number = json_data.get("index", "")
        comic.PageCount = json_data.get("count", "")

        pages = []
        # "or []" also covers an "images" key that is present but None.
        for image_name in json_data.get("images") or []:
            page = ComicPageInfo()
            # Text before the first "." and, within it, after the last "_".
            page.Image = image_name.split(".")[0].split("_")[-1]
            pages.append(page.Image)
            comic.Pages.append(page)

        self.parse_comicinfo(comic, save_dir=save_dir, xsd_filename=xsd_file)
        return pages