diff --git a/src/common/ComicInfo.py b/src/common/ComicInfo.py index 1e8d10c..fba47b6 100644 --- a/src/common/ComicInfo.py +++ b/src/common/ComicInfo.py @@ -205,6 +205,7 @@ class ImageInfo: # Define the ComicInfo and ComicPageInfo classes class ComicInfo: + # ComicInfo.xml 中的选项 def __init__(self): self.Title: str = "" """标题""" @@ -247,6 +248,7 @@ class ComicInfo: self.Pages: List[ComicPageInfo] = [] class ComicPageInfo: + # ComicInfo.xml 中的 def __init__(self): self.Image: int = -1 self.Type: str = "Story" @@ -269,6 +271,9 @@ class ComicPageInfo: return data class ComicInfoXml: + """ + 生成ComicInfo.xml + """ def _save_xml_to_file(self, xml_string, filename): """ Save the XML string to a file @@ -365,9 +370,14 @@ class ComicInfoXml: #print(xml_data) def _required_attributes(self): + """ + 必需值,如果为空刚报错 + """ return ["Title", "Series", "Number", "PageCount", "Writer"] def _gen_pageinfo(self, image_names, save_dir): + """ 获取PageInfo数据 + """ pages = [] # Adding pages to the comic for image_name in image_names: @@ -377,8 +387,10 @@ class ComicInfoXml: # 图像属性 文件名 大小 长 pages.append(page) return pages - + def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=XSD_FILE): + """ 根据Json数据生成ComicInfo.xml + """ comic = ComicInfo() comic.Title = json_data.get("chapter", "") comic.Series = json_data.get("name", "") diff --git a/test.py b/test.py index 1d7187d..6ed3faa 100644 --- a/test.py +++ b/test.py @@ -2,11 +2,20 @@ from src.common.naming import FileNaming from src.common.ComicInfo import ImageInfo from zipfile import ZipFile from datetime import datetime -import os +import os,hashlib +import xml.etree.ElementTree as ET +from PIL import Image +from io import BytesIO +from tempfile import NamedTemporaryFile +from xml.dom import minidom + class test: - def clean_cbz(self): + def clean_min_cbz(self): + """ + 清理3KB以下CBZ文件 + """ dir_path = "/mnt/Comics/CBZ/rm_comic" for dir in os.listdir(dir_path): c_dir = os.path.join(dir_path, dir) @@ -49,6 +58,194 @@ class test: files = 
class comicInfo:
    """Read, regenerate, and rewrite the ComicInfo.xml metadata inside a CBZ archive."""

    # Extensions tried when a <Page Image="..."> entry omits the file suffix.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp")

    def find_actual_path(self, zip_ref, target_path):
        """Case-insensitively resolve *target_path* to the real member name in *zip_ref*.

        Returns the stored member name, or None when no member matches.
        """
        target_lower = target_path.lower()
        for name in zip_ref.namelist():
            if name.lower() == target_lower:
                return name
        return None

    def _find_image_path(self, zip_ref, image_name):
        """Resolve a Page 'Image' value to an archive member.

        Tries the name as-is first, then with common image extensions appended
        (the original hard-coded ".jpg" only).
        """
        candidates = (image_name,) + tuple(image_name + ext for ext in self._IMAGE_EXTENSIONS)
        for candidate in candidates:
            actual = self.find_actual_path(zip_ref, candidate)
            if actual:
                return actual
        return None

    @staticmethod
    def _local_name(tag):
        """Strip an XML namespace ('{uri}Tag' -> 'Tag').

        Lets lookups work whether or not the document declares a default
        xmlns — generate_comic_info_xml emits one, so a round-trip re-read
        must tolerate it.
        """
        return tag.rsplit('}', 1)[-1]

    def _find_child(self, parent, name):
        """Namespace-agnostic replacement for Element.find(name)."""
        for child in parent:
            if self._local_name(child.tag) == name:
                return child
        return None

    def process_cbz(self, cbz_path):
        """Extract metadata fields and per-page image info from a CBZ file.

        Returns a dict {"metadata": {...}, "pages": [...] or None}, or None
        when the archive has no ComicInfo.xml or an error occurs.
        """
        try:
            with ZipFile(cbz_path, 'r') as cbz:
                xml_name = self.find_actual_path(cbz, 'ComicInfo.xml')
                if xml_name is None:
                    print("未找到 ComicInfo.xml")
                    return None

                # Parse the XML once (the original opened and parsed it twice).
                with cbz.open(xml_name) as xml_file:
                    root = ET.fromstring(xml_file.read().decode('utf-8'))

                # Metadata fields to extract.
                metadata_fields = [
                    "Title", "Series", "Number", "Summary", "Writer",
                    "Genre", "PageCount", "AgeRating",
                ]
                metadata = {}
                for field in metadata_fields:
                    element = self._find_child(root, field)
                    metadata[field] = element.text if element is not None else None

                # Per-page image information from the <Pages> element.
                pages = self._find_child(root, 'Pages')
                if pages is None:
                    print("XML 中缺少 Pages 标签")
                    return {"metadata": metadata, "pages": None}
                page_list = [el for el in pages if self._local_name(el.tag) == 'Page']
                if not page_list:
                    print("Pages 标签下无 Page 元素")
                    return {"metadata": metadata, "pages": None}

                pages_info = []
                for img_name in (p.get('Image') for p in page_list):
                    if not img_name:
                        continue
                    actual_path = self._find_image_path(cbz, img_name)
                    if not actual_path:
                        print(f"警告:图片 '{img_name}' 不存在于压缩包中")
                        continue
                    with cbz.open(actual_path) as img_file:
                        content = img_file.read()

                    # Image dimensions are best-effort: PIL may be missing or
                    # the data may not decode — both leave width/height None.
                    img_width = img_height = None
                    try:
                        with Image.open(BytesIO(content)) as img:
                            img_width, img_height = img.size
                    except Exception as e:
                        print(f"无法读取图片尺寸:{actual_path},错误:{e}")

                    pages_info.append({
                        "name": os.path.basename(actual_path).split(".")[0],
                        "size": len(content),
                        "key": hashlib.md5(content).hexdigest(),
                        "width": img_width,
                        "height": img_height,
                    })

                return {"metadata": metadata, "pages": pages_info}
        except Exception as e:
            print(f"处理 CBZ 文件时出错: {e}")
            return None

    def generate_comic_info_xml(self, metadata, pages_info):
        """Build a pretty-printed ComicInfo.xml document string from
        *metadata* (field -> value) and *pages_info* (list of page dicts)."""
        root = ET.Element("ComicInfo", xmlns="http://comicrack.cyolito.com/comicinfo")

        # Metadata elements; None values are omitted entirely.
        for field, value in metadata.items():
            if value is not None:
                ET.SubElement(root, field).text = str(value)

        # <Pages> element with one <Page> per image.
        if pages_info:
            pages_elem = ET.SubElement(root, "Pages")
            for page in pages_info:
                ET.SubElement(pages_elem, "Page", attrib={
                    "Image": page.get("name", ""),
                    "ImageSize": str(page.get("size", 0)),
                    "Key": str(page.get("key", 0)),
                    "ImageWidth": str(page.get("width", 0)),
                    "ImageHeight": str(page.get("height", 0)),
                })

        # Pretty-print via minidom; toprettyxml(encoding=...) returns bytes.
        xml_bytes = ET.tostring(root, encoding='utf-8', method='xml')
        formatted = minidom.parseString(xml_bytes).toprettyxml(indent=" ", encoding="utf-8")
        return formatted.decode('utf-8')

    def update_cbz_with_new_xml(self, cbz_path, new_xml_content, output_path=None):
        """Write *new_xml_content* as ComicInfo.xml into the archive.

        Writes to *output_path* (defaults to in-place over *cbz_path*).
        Returns True on success, False on failure (original file restored).

        BUG FIX: the original implementation moved the source file to a temp
        location and then unconditionally deleted it — destroying the source
        whenever output_path differed from cbz_path. It also raised NameError
        in the except handler when the temp file was never created.
        """
        backup = None
        try:
            if output_path is None:
                output_path = cbz_path

            # Create the backup in the same directory as the source so that
            # os.replace never has to cross filesystems.
            with NamedTemporaryFile(delete=False,
                                    dir=os.path.dirname(os.path.abspath(cbz_path))) as tmp:
                backup = tmp.name
            os.replace(cbz_path, backup)

            # Copy every member except the old ComicInfo.xml, then add the new one.
            with ZipFile(backup, 'r') as source_zip, ZipFile(output_path, 'w') as new_zip:
                for item in source_zip.infolist():
                    if item.filename.lower() != 'comicinfo.xml':
                        new_zip.writestr(item, source_zip.read(item.filename))
                new_zip.writestr("ComicInfo.xml", new_xml_content)

            if os.path.abspath(output_path) == os.path.abspath(cbz_path):
                os.remove(backup)  # in-place update: discard the backup
            else:
                os.replace(backup, cbz_path)  # keep the original source intact
            return True
        except Exception as e:
            print(f"更新 CBZ 文件失败: {e}")
            if backup and os.path.exists(backup):
                os.replace(backup, cbz_path)  # restore the original
            return False

    def update_comicinfo_cbz(self, cbz_path):
        """Regenerate the ComicInfo.xml of *cbz_path* in place.

        Returns True on success, False otherwise.
        """
        data = self.process_cbz(cbz_path)
        if data is None:  # guard: original crashed on a None result
            return False
        new_xml = self.generate_comic_info_xml(data["metadata"], data["pages"])
        # Debug aid: dump the generated XML next to the working directory.
        with open("NewComicInfo.xml", "w", encoding="utf-8") as f:
            f.write(new_xml)
        print("已生成 NewComicInfo.xml")
        # BUG FIX: the original updated hard-coded "example.cbz" instead of
        # the cbz_path argument, and built a fresh comicInfo() instead of self.
        return self.update_cbz_with_new_xml(cbz_path, new_xml)


if __name__ == "__main__":
    # Remove CBZ files smaller than 3 KB.
    test().clean_min_cbz()