add update CBZ ComicInfo.xml

This commit is contained in:
caiwx86 2025-02-10 12:18:30 +08:00
parent 0d5e26662f
commit 97449dca0b
2 changed files with 213 additions and 4 deletions

View File

@ -205,6 +205,7 @@ class ImageInfo:
# Define the ComicInfo and ComicPageInfo classes
class ComicInfo:
# ComicInfo.xml 中的选项
def __init__(self):
self.Title: str = ""
"""标题"""
@ -247,6 +248,7 @@ class ComicInfo:
self.Pages: List[ComicPageInfo] = []
class ComicPageInfo:
# ComicInfo.xml 中的<Page>
def __init__(self):
self.Image: int = -1
self.Type: str = "Story"
@ -269,6 +271,9 @@ class ComicPageInfo:
return data
class ComicInfoXml:
"""
生成ComicInfo.xml
"""
def _save_xml_to_file(self, xml_string, filename):
"""
Save the XML string to a file
@ -365,9 +370,14 @@ class ComicInfoXml:
#print(xml_data)
def _required_attributes(self):
    """
    Return the ComicInfo field names that must be non-empty;
    validation reports an error if any of them is missing.
    """
    return ["Title", "Series", "Number", "PageCount", "Writer"]
def _gen_pageinfo(self, image_names, save_dir):
""" 获取PageInfo数据
"""
pages = []
# Adding pages to the comic
for image_name in image_names:
@ -377,8 +387,10 @@ class ComicInfoXml:
# 图像属性 文件名 大小 长
pages.append(page)
return pages
def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=XSD_FILE):
""" 根据Json数据生成ComicInfo.xml
"""
comic = ComicInfo()
comic.Title = json_data.get("chapter", "")
comic.Series = json_data.get("name", "")

203
test.py
View File

@ -2,11 +2,20 @@ from src.common.naming import FileNaming
from src.common.ComicInfo import ImageInfo
from zipfile import ZipFile
from datetime import datetime
import os
import os,hashlib
import xml.etree.ElementTree as ET
from PIL import Image
from io import BytesIO
from tempfile import NamedTemporaryFile
from xml.dom import minidom
class test:
def clean_cbz(self):
def clean_min_cbz(self):
"""
清理3KB以下CBZ文件
"""
dir_path = "/mnt/Comics/CBZ/rm_comic"
for dir in os.listdir(dir_path):
c_dir = os.path.join(dir_path, dir)
@ -49,6 +58,194 @@ class test:
files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
for file in files:
self._clean_old_cbz(file)
class comicInfo:
    """Read, regenerate and write back the ComicInfo.xml inside CBZ archives."""

    def find_actual_path(self, zip_ref, target_path):
        """Case-insensitively look up *target_path* among the archive members.

        Returns the member's actual (case-preserved) name, or None if absent.
        """
        target_lower = target_path.lower()
        for name in zip_ref.namelist():
            if name.lower() == target_lower:
                return name
        return None

    def process_cbz(self, cbz_path):
        """Read metadata and per-page image info from a CBZ file.

        Returns {"metadata": dict, "pages": list or None}, or None when the
        archive cannot be processed or has no ComicInfo.xml.
        """
        try:
            with ZipFile(cbz_path, 'r') as cbz:
                # Locate ComicInfo.xml at the archive root (case-insensitive).
                xml_files = [f for f in cbz.namelist() if f.lower() == 'comicinfo.xml']
                if not xml_files:
                    print("未找到 ComicInfo.xml")
                    return None
                xml_file_name = xml_files[0]

                # Parse the XML once; the original re-opened and re-parsed the
                # same member a second time for the <Pages> section.
                with cbz.open(xml_file_name) as xml_file:
                    root = ET.fromstring(xml_file.read().decode('utf-8'))

                # The XML we generate declares a default namespace; derive the
                # "{uri}" prefix so the lookups below work on both namespaced
                # and plain documents (fixes the broken read-back round-trip).
                ns = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

                # Metadata fields to extract (missing elements map to None).
                metadata_fields = [
                    "Title", "Series", "Number", "Summary", "Writer",
                    "Genre", "PageCount", "AgeRating"
                ]
                metadata = {}
                for field in metadata_fields:
                    element = root.find(ns + field)
                    metadata[field] = element.text if element is not None else None

                # <Pages>/<Page> image entries.
                pages = root.find(ns + 'Pages')
                if pages is None:
                    print("XML 中缺少 Pages 标签")
                    return {"metadata": metadata, "pages": None}
                page_list = pages.findall(ns + 'Page')
                if not page_list:
                    print("Pages 标签下无 Page 元素")
                    return {"metadata": metadata, "pages": None}

                image_paths = [page.get('Image') for page in page_list if page.get('Image')]
                pages_info = []
                for img_path in image_paths:
                    # The Image attribute stores the bare page name; probe
                    # common extensions, ".jpg" first for compatibility with
                    # the original hard-coded behavior.
                    actual_path = None
                    for ext in (".jpg", ".jpeg", ".png", ".webp", ""):
                        actual_path = self.find_actual_path(cbz, img_path + ext)
                        if actual_path:
                            break
                    if not actual_path:
                        print(f"警告:图片 '{img_path}' 不存在于压缩包中")
                        continue
                    with cbz.open(actual_path) as img_file:
                        content = img_file.read()
                    file_md5 = hashlib.md5(content).hexdigest()
                    file_size = len(content)
                    # Dimensions are best-effort: unreadable image data leaves
                    # width/height as None.
                    img_width, img_height = None, None
                    try:
                        with Image.open(BytesIO(content)) as img:
                            img_width, img_height = img.size
                    except Exception as e:
                        print(f"无法读取图片尺寸:{actual_path},错误:{e}")
                    pages_info.append({
                        "name": os.path.basename(actual_path).split(".")[0],
                        "size": file_size,
                        "key": file_md5,
                        "width": img_width,
                        "height": img_height
                    })
                return {"metadata": metadata, "pages": pages_info}
        except Exception as e:
            print(f"处理 CBZ 文件时出错: {e}")
            return None

    def generate_comic_info_xml(self, metadata, pages_info):
        """Build a pretty-printed ComicInfo.xml document string.

        metadata: field-name -> value mapping (None values are skipped).
        pages_info: list of page dicts as produced by process_cbz, or None.
        """
        root = ET.Element("ComicInfo", xmlns="http://comicrack.cyolito.com/comicinfo")
        for field, value in metadata.items():
            if value is not None:
                ET.SubElement(root, field).text = str(value)
        # <Pages> section with one <Page> element per image.
        if pages_info:
            pages_elem = ET.SubElement(root, "Pages")
            for page in pages_info:
                ET.SubElement(pages_elem, "Page", attrib={
                    "Image": page.get("name", ""),
                    "ImageSize": str(page.get("size", 0)),
                    "Key": str(page.get("key", 0)),
                    "ImageWidth": str(page.get("width", 0)),
                    "ImageHeight": str(page.get("height", 0))
                })
        xml_str = ET.tostring(root, encoding='utf-8', method='xml')
        formatted_xml = minidom.parseString(xml_str).toprettyxml(indent=" ", encoding="utf-8")
        return formatted_xml.decode('utf-8')

    def update_cbz_with_new_xml(self, cbz_path, new_xml_content, output_path=None):
        """Write *new_xml_content* as ComicInfo.xml into the CBZ.

        When output_path is None (or equal to cbz_path) the file is replaced
        in place, with a temp-file backup restored on failure.  Bug fixes vs.
        the original: the source file is no longer deleted when a distinct
        output_path is given, and the error path no longer references a
        possibly-unbound temp-file variable.
        """
        if output_path is None:
            output_path = cbz_path
        backup = None
        try:
            if os.path.abspath(output_path) == os.path.abspath(cbz_path):
                # In-place update: move the original aside as a backup, then
                # rebuild the archive at its original path.
                with NamedTemporaryFile(delete=False) as tmp:
                    backup = tmp.name
                os.replace(cbz_path, backup)
                source = backup
            else:
                source = cbz_path
            with ZipFile(source, 'r') as source_zip, ZipFile(output_path, 'w') as new_zip:
                # Copy every member except the old ComicInfo.xml, then append
                # the freshly generated one.
                for item in source_zip.infolist():
                    if item.filename.lower() != 'comicinfo.xml':
                        new_zip.writestr(item, source_zip.read(item.filename))
                new_zip.writestr("ComicInfo.xml", new_xml_content)
            if backup is not None:
                os.remove(backup)
            return True
        except Exception as e:
            print(f"更新 CBZ 文件失败: {e}")
            if backup is not None and os.path.exists(backup):
                os.replace(backup, cbz_path)  # restore the original file
            return False

    def update_comicinfo_cbz(self, cbz_path):
        """Regenerate the ComicInfo.xml of *cbz_path* and write it back in place.

        Returns True on success, False otherwise.  Bug fixes vs. the original:
        it ignored *cbz_path* and updated a hard-coded "example.cbz" through a
        fresh instance, crashed with a TypeError when process_cbz returned
        None, and dumped a debug NewComicInfo.xml into the working directory.
        """
        data = self.process_cbz(cbz_path)
        if data is None:
            return False
        new_xml = self.generate_comic_info_xml(data["metadata"], data["pages"])
        return self.update_cbz_with_new_xml(cbz_path, new_xml)
if __name__ == "__main__":
    # Maintenance entry point.
    # clean_old_cbz is defined earlier in this file (not fully visible here);
    # presumably it removes outdated CBZ archives — TODO confirm.
    test().clean_old_cbz()
    # Remove CBZ files smaller than 3 KB
    test().clean_min_cbz()