# ComicScrapy/Comics/_utils/ComicInfo.py

import xml.etree.ElementTree as ET
from xml.dom import minidom
from typing import List
import json,os
from lxml import etree
from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY

# Define the ComicInfo and ComicPageInfo classes
class ComicInfo:
    """Mutable record holding the metadata fields of a single comic.

    Every text field starts as ``""`` and every numeric field as ``-1``.
    ``Pages`` starts as an empty list that is private to the instance.
    """

    def __init__(self):
        # Attributes are created in exactly this order on purpose:
        # ComicInfoXml iterates the instance ``__dict__`` when it emits the
        # XML child elements, so insertion order becomes element order.
        field_defaults = (
            ("Title", ""),  # title
            ("Series", ""),
            ("Number", ""),
            ("Count", -1),
            ("Volume", -1),
            ("AlternateSeries", ""),
            ("AlternateNumber", ""),
            ("AlternateCount", -1),
            ("Summary", ""),
            ("Notes", ""),
            ("Year", -1),
            ("Month", -1),
            ("Day", -1),
            ("Writer", ""),
            ("Penciller", ""),
            ("Inker", ""),
            ("Colorist", ""),
            ("Letterer", ""),
            ("CoverArtist", ""),
            ("Editor", ""),
            ("Publisher", ""),
            ("Imprint", ""),
            ("Genre", ""),
            ("Tags", ""),
            ("Web", ""),
            ("PageCount", -1),
            ("LanguageISO", ""),
            ("Format", ""),
            ("BlackAndWhite", ""),
            ("Manga", ""),
            ("Characters", ""),
            ("Teams", ""),
            ("Locations", ""),
            ("ScanInformation", ""),
            ("StoryArc", ""),
            ("SeriesGroup", ""),
            ("AgeRating", ""),
        )
        for field_name, default_value in field_defaults:
            setattr(self, field_name, default_value)

        # Per-page records; a fresh list is created for every instance.
        self.Pages: List[ComicPageInfo] = []

class ComicPageInfo:
    """Metadata record for one page of a comic.

    Numeric fields default to ``-1`` and text fields to ``""``, except
    ``Type`` (``"Story"``) and ``DoublePage`` (``False``).
    """

    def __init__(self):
        self.Image: int = -1
        self.Type: str = "Story"
        self.DoublePage: bool = False
        self.ImageSize: int = -1
        self.Key: str = ""
        self.Bookmark: str = ""
        self.ImageWidth: int = -1
        self.ImageHeight: int = -1

    def toString(self):
        """Return the image attributes that have been set, as strings.

        Despite its name this returns a ``dict``, not a ``str``. Only
        ``Image``, ``ImageSize``, ``ImageWidth`` and ``ImageHeight`` are
        considered, in that order; an attribute still equal to ``-1`` or
        ``""`` is left out. Every included value is passed through ``str()``.
        """
        exported_names = ("Image", "ImageSize", "ImageWidth", "ImageHeight")
        attributes = {}
        for name in exported_names:
            current = getattr(self, name)
            if current == -1 or current == "":
                # Still at its "unset" marker: omit it from the result.
                continue
            attributes[name] = str(current)
        return attributes

class ComicInfoXml:
    """Serialize a ``ComicInfo`` to XML, write it to disk and validate it."""

    def save_xml_to_file(self, xml_string, filename):
        """Write *xml_string* to *filename* as UTF-8 text.

        Args:
            xml_string: The XML document as a ``str``.
            filename: Destination path. Missing parent directories are
                created; a bare file name (no directory part) is written
                relative to the current working directory.
        """
        base_dir = os.path.dirname(filename)
        # dirname() is "" for a bare file name and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        # exist_ok avoids the check-then-create race of os.path.exists().
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(xml_string)

    def validate_xml_with_xsd_file(self, xml_file, xsd_file, remove=True):
        """Validate *xml_file* against the schema in *xsd_file*.

        The outcome is reported with ``print``. Errors raised while parsing
        either file are not caught here.

        Args:
            xml_file: Path of the XML document to check.
            xsd_file: Path of the XSD schema.
            remove: When true (the default), delete *xml_file* if it does
                not conform to the schema.
        """
        xml_doc = etree.parse(xml_file)
        # Parse the schema by path (as done for the XML file) rather than via
        # a text-mode handle, so lxml performs the byte decoding itself.
        xsd_doc = etree.XMLSchema(etree.parse(xsd_file))
        try:
            xsd_doc.assertValid(xml_doc)
        except etree.DocumentInvalid as e:
            print("XML is not valid:")
            print(e)
            if remove:
                os.remove(xml_file)
        else:
            print("XML is valid according to the XSD.")

    def _serialize_comic_info(self, comic):
        """Return *comic* as a pretty-printed UTF-8 XML document string."""
        root = ET.Element('ComicInfo')
        root.set('xmlns:xsd', 'http://www.w3.org/2001/XMLSchema')
        root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')

        # Child elements are emitted in attribute-creation order.
        for attr, value in comic.__dict__.items():
            # -1 and "" mark "not set"; None (e.g. a JSON null) is skipped as
            # well so it is never written out as the literal text "None".
            if value is None or not (value != -1 and value != ''):
                continue
            if attr == 'Pages':
                pages_elem = ET.SubElement(root, 'Pages')
                for page in value:
                    # Copy just the four attributes toString() exports, so
                    # any object carrying them can act as a page.
                    cpi = ComicPageInfo()
                    cpi.Image = page.Image
                    cpi.ImageSize = page.ImageSize
                    cpi.ImageWidth = page.ImageWidth
                    cpi.ImageHeight = page.ImageHeight
                    ET.SubElement(pages_elem, 'Page', cpi.toString())
            else:
                ET.SubElement(root, attr).text = str(value)

        # Round-trip through minidom to get indentation and the XML
        # declaration; toprettyxml() returns bytes when an encoding is given.
        raw_xml = ET.tostring(root, encoding='utf-8', method='xml')
        pretty_xml = minidom.parseString(raw_xml).toprettyxml(indent="  ", encoding="utf-8")
        return pretty_xml.decode('utf-8')

    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
        """Serialize *comic* to XML, save it, then validate the saved file.

        Validation uses the default ``remove=True``, so a file that does not
        conform to the schema is deleted again after being written.

        Args:
            comic (ComicInfo): Metadata to serialize. Fields equal to ``-1``,
                ``""`` or ``None`` are omitted from the XML.
            save_dir (str, optional): Directory the XML file is written to.
                When ``None``, *xml_filename* is used as the path as given.
            xml_filename (str, optional): Name of the XML file. Defaults to
                "ComicInfo.xml".
            xsd_filename (str, optional): Path of the XSD schema. Defaults to
                "ComicInfo.xsd".
        """
        serialized_xml = self._serialize_comic_info(comic)

        if save_dir is not None:
            xml_filename = os.path.join(save_dir, xml_filename)
        self.save_xml_to_file(serialized_xml, xml_filename)
        self.validate_xml_with_xsd_file(xml_filename, xsd_filename)

    def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=COMIC_INFO_XSD_FILE):
        """Build a ``ComicInfo`` from a scraped mapping and write its XML.

        Keys read from *json_data* (all optional): ``chapter``, ``name``,
        ``author``, ``age_rating``, ``tags``, ``dep``, ``genre``, ``index``,
        ``count`` and ``images``.

        Args:
            json_data: Mapping with the keys listed above.
            save_dir (str, optional): Directory for the generated XML file.
            xsd_file (str, optional): Path of the XSD used for validation.

        Returns:
            list: One page identifier per entry of ``images``: the text
            before the first "." of the name, reduced to the part after its
            last "_".
        """
        comic = ComicInfo()
        comic.Title = json_data.get("chapter", "")
        comic.Series = json_data.get("name", "")
        comic.Writer = json_data.get("author", "")
        comic.AgeRating = json_data.get("age_rating", "")
        comic.Tags = json_data.get("tags", "")
        comic.Summary = json_data.get("dep", "")
        comic.Genre = json_data.get("genre", "")
        comic.Number = json_data.get("index", "")
        comic.PageCount = json_data.get("count", "")

        # A missing or null "images" entry means "no pages", not a crash.
        image_names = json_data.get("images") or []
        pages = []
        for image_name in image_names:
            page = ComicPageInfo()
            page.Image = image_name.split(".")[0].split("_")[-1]
            pages.append(page.Image)
            comic.Pages.append(page)

        self.parse_comicinfo(comic, save_dir=save_dir, xsd_filename=xsd_file)
        return pages