add update CBZ ComicInfo.xml

This commit is contained in:
caiwx86 2025-02-10 12:18:30 +08:00
parent 0d5e26662f
commit 97449dca0b
2 changed files with 213 additions and 4 deletions

View File

@ -205,6 +205,7 @@ class ImageInfo:
# Define the ComicInfo and ComicPageInfo classes
class ComicInfo:
# ComicInfo.xml 中的选项
def __init__(self):
self.Title: str = ""
"""标题"""
@ -247,6 +248,7 @@ class ComicInfo:
self.Pages: List[ComicPageInfo] = []
class ComicPageInfo:
# ComicInfo.xml 中的<Page>
def __init__(self):
self.Image: int = -1
self.Type: str = "Story"
@ -269,6 +271,9 @@ class ComicPageInfo:
return data
class ComicInfoXml:
"""
生成ComicInfo.xml
"""
def _save_xml_to_file(self, xml_string, filename):
"""
Save the XML string to a file
@ -365,9 +370,14 @@ class ComicInfoXml:
#print(xml_data)
def _required_attributes(self):
    """
    Return the ComicInfo field names that must be non-empty;
    validation reports an error if any of them is missing.
    """
    return ["Title", "Series", "Number", "PageCount", "Writer"]
def _gen_pageinfo(self, image_names, save_dir):
""" 获取PageInfo数据
"""
pages = []
# Adding pages to the comic
for image_name in image_names:
@ -377,8 +387,10 @@ class ComicInfoXml:
# 图像属性 文件名 大小 长
pages.append(page)
return pages
def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=XSD_FILE):
""" 根据Json数据生成ComicInfo.xml
"""
comic = ComicInfo()
comic.Title = json_data.get("chapter", "")
comic.Series = json_data.get("name", "")

203
test.py
View File

@ -2,11 +2,20 @@ from src.common.naming import FileNaming
from src.common.ComicInfo import ImageInfo
from zipfile import ZipFile
from datetime import datetime
import os
import os,hashlib
import xml.etree.ElementTree as ET
from PIL import Image
from io import BytesIO
from tempfile import NamedTemporaryFile
from xml.dom import minidom
class test:
def clean_cbz(self):
def clean_min_cbz(self):
"""
清理3KB以下CBZ文件
"""
dir_path = "/mnt/Comics/CBZ/rm_comic"
for dir in os.listdir(dir_path):
c_dir = os.path.join(dir_path, dir)
@ -49,6 +58,194 @@ class test:
files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
for file in files:
self._clean_old_cbz(file)
class comicInfo:
    """Read, regenerate and write back the ComicInfo.xml inside CBZ archives."""

    def find_actual_path(self, zip_ref, target_path):
        """Case-insensitively look up *target_path* among the archive members.

        Returns the member's actual (case-preserved) name, or None if absent.
        """
        target_lower = target_path.lower()
        for name in zip_ref.namelist():
            if name.lower() == target_lower:
                return name
        return None

    def process_cbz(self, cbz_path):
        """Read metadata and per-page image info from a CBZ file.

        Returns {"metadata": dict, "pages": list or None}, or None when the
        archive cannot be processed or has no ComicInfo.xml.
        """
        try:
            with ZipFile(cbz_path, 'r') as cbz:
                # Locate ComicInfo.xml at the archive root (case-insensitive).
                xml_files = [f for f in cbz.namelist() if f.lower() == 'comicinfo.xml']
                if not xml_files:
                    print("未找到 ComicInfo.xml")
                    return None
                xml_file_name = xml_files[0]

                # Parse the XML once; the original re-opened and re-parsed the
                # same member a second time for the <Pages> section.
                with cbz.open(xml_file_name) as xml_file:
                    root = ET.fromstring(xml_file.read().decode('utf-8'))

                # The XML we generate declares a default namespace; derive the
                # "{uri}" prefix so the lookups below work on both namespaced
                # and plain documents (fixes the broken read-back round-trip).
                ns = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

                # Metadata fields to extract (missing elements map to None).
                metadata_fields = [
                    "Title", "Series", "Number", "Summary", "Writer",
                    "Genre", "PageCount", "AgeRating"
                ]
                metadata = {}
                for field in metadata_fields:
                    element = root.find(ns + field)
                    metadata[field] = element.text if element is not None else None

                # <Pages>/<Page> image entries.
                pages = root.find(ns + 'Pages')
                if pages is None:
                    print("XML 中缺少 Pages 标签")
                    return {"metadata": metadata, "pages": None}
                page_list = pages.findall(ns + 'Page')
                if not page_list:
                    print("Pages 标签下无 Page 元素")
                    return {"metadata": metadata, "pages": None}

                image_paths = [page.get('Image') for page in page_list if page.get('Image')]
                pages_info = []
                for img_path in image_paths:
                    # The Image attribute stores the bare page name; probe
                    # common extensions, ".jpg" first for compatibility with
                    # the original hard-coded behavior.
                    actual_path = None
                    for ext in (".jpg", ".jpeg", ".png", ".webp", ""):
                        actual_path = self.find_actual_path(cbz, img_path + ext)
                        if actual_path:
                            break
                    if not actual_path:
                        print(f"警告:图片 '{img_path}' 不存在于压缩包中")
                        continue
                    with cbz.open(actual_path) as img_file:
                        content = img_file.read()
                    file_md5 = hashlib.md5(content).hexdigest()
                    file_size = len(content)
                    # Dimensions are best-effort: unreadable image data leaves
                    # width/height as None.
                    img_width, img_height = None, None
                    try:
                        with Image.open(BytesIO(content)) as img:
                            img_width, img_height = img.size
                    except Exception as e:
                        print(f"无法读取图片尺寸:{actual_path},错误:{e}")
                    pages_info.append({
                        "name": os.path.basename(actual_path).split(".")[0],
                        "size": file_size,
                        "key": file_md5,
                        "width": img_width,
                        "height": img_height
                    })
                return {"metadata": metadata, "pages": pages_info}
        except Exception as e:
            print(f"处理 CBZ 文件时出错: {e}")
            return None

    def generate_comic_info_xml(self, metadata, pages_info):
        """Build a pretty-printed ComicInfo.xml document string.

        metadata: field-name -> value mapping (None values are skipped).
        pages_info: list of page dicts as produced by process_cbz, or None.
        """
        root = ET.Element("ComicInfo", xmlns="http://comicrack.cyolito.com/comicinfo")
        for field, value in metadata.items():
            if value is not None:
                ET.SubElement(root, field).text = str(value)
        # <Pages> section with one <Page> element per image.
        if pages_info:
            pages_elem = ET.SubElement(root, "Pages")
            for page in pages_info:
                ET.SubElement(pages_elem, "Page", attrib={
                    "Image": page.get("name", ""),
                    "ImageSize": str(page.get("size", 0)),
                    "Key": str(page.get("key", 0)),
                    "ImageWidth": str(page.get("width", 0)),
                    "ImageHeight": str(page.get("height", 0))
                })
        xml_str = ET.tostring(root, encoding='utf-8', method='xml')
        formatted_xml = minidom.parseString(xml_str).toprettyxml(indent=" ", encoding="utf-8")
        return formatted_xml.decode('utf-8')

    def update_cbz_with_new_xml(self, cbz_path, new_xml_content, output_path=None):
        """Write *new_xml_content* as ComicInfo.xml into the CBZ.

        When output_path is None (or equal to cbz_path) the file is replaced
        in place, with a temp-file backup restored on failure.  Bug fixes vs.
        the original: the source file is no longer deleted when a distinct
        output_path is given, and the error path no longer references a
        possibly-unbound temp-file variable.
        """
        if output_path is None:
            output_path = cbz_path
        backup = None
        try:
            if os.path.abspath(output_path) == os.path.abspath(cbz_path):
                # In-place update: move the original aside as a backup, then
                # rebuild the archive at its original path.
                with NamedTemporaryFile(delete=False) as tmp:
                    backup = tmp.name
                os.replace(cbz_path, backup)
                source = backup
            else:
                source = cbz_path
            with ZipFile(source, 'r') as source_zip, ZipFile(output_path, 'w') as new_zip:
                # Copy every member except the old ComicInfo.xml, then append
                # the freshly generated one.
                for item in source_zip.infolist():
                    if item.filename.lower() != 'comicinfo.xml':
                        new_zip.writestr(item, source_zip.read(item.filename))
                new_zip.writestr("ComicInfo.xml", new_xml_content)
            if backup is not None:
                os.remove(backup)
            return True
        except Exception as e:
            print(f"更新 CBZ 文件失败: {e}")
            if backup is not None and os.path.exists(backup):
                os.replace(backup, cbz_path)  # restore the original file
            return False

    def update_comicinfo_cbz(self, cbz_path):
        """Regenerate the ComicInfo.xml of *cbz_path* and write it back in place.

        Returns True on success, False otherwise.  Bug fixes vs. the original:
        it ignored *cbz_path* and updated a hard-coded "example.cbz" through a
        fresh instance, crashed with a TypeError when process_cbz returned
        None, and dumped a debug NewComicInfo.xml into the working directory.
        """
        data = self.process_cbz(cbz_path)
        if data is None:
            return False
        new_xml = self.generate_comic_info_xml(data["metadata"], data["pages"])
        return self.update_cbz_with_new_xml(cbz_path, new_xml)
if __name__ == "__main__":
    # Maintenance entry point.
    # clean_old_cbz is defined earlier in this file (not fully visible here);
    # presumably it removes outdated CBZ archives — TODO confirm.
    test().clean_old_cbz()
    # Remove CBZ files smaller than 3 KB
    test().clean_min_cbz()