From 4874600a0723c5c74645e6f334f7cfaa67e907c7 Mon Sep 17 00:00:00 2001
From: caiwx86
Date: Tue, 8 Jul 2025 13:06:54 +0800
Subject: [PATCH] Add test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test.py | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 119 insertions(+), 5 deletions(-)

diff --git a/test.py b/test.py
index ba13ad7..fd1004f 100644
--- a/test.py
+++ b/test.py
@@ -2,7 +2,8 @@ from src.common.naming import FileNaming
 from src.common.ComicInfo import ImageInfo
 from zipfile import ZipFile
 from datetime import datetime
-import time, shutil
+import time, shutil, re, xxhash, json
+from typing import Any
 import os,hashlib
 import xml.etree.ElementTree as ET
 from PIL import Image
@@ -10,7 +11,10 @@ from io import BytesIO
 from tempfile import NamedTemporaryFile
 from xml.dom import minidom
 from src.common.ComicInfo import ComicInfoXml
-
+from lxml import etree
+from collections import defaultdict
+from typing import List, Dict, Tuple
+
 class test:
 
     def clean_min_cbz(self):
@@ -283,8 +287,92 @@ class comicInfo:
             #os.remove(cbz_path)
         except:
             raise exit(f"ver_comicinfo_xml 错误")
-
-if __name__ == "__main__":
+
+
+    def _comic_info_xml_pages(self, zip_file):
+        """Read the <Page> tag attributes from ComicInfo.xml inside a CBZ archive."""
+        # data = { "file" : os.path.basename(zip_file), "path" : str(zip_file) }
+        # data = { "file" : os.path.basename(zip_file)}
+        data = { "file" : str(zip_file) }
+        list_page = []
+        # Open the ZIP archive
+        with ZipFile(str(zip_file), 'r') as z:
+            try:
+                # ComicInfo.xml sits at the archive root
+                with z.open('ComicInfo.xml', 'r') as file:
+                    # Parse the XML data from the file stream
+                    file_string = file.read().decode("utf-8")
+                    # lxml wants bytes here because the XML declaration names an encoding
+                    root = etree.fromstring(file_string.encode())
+                    # Collect the attributes of every Pages/Page element
+                    for page in root.findall('Pages/Page'):
+                        # Copy lxml's attribute proxy into a plain dict so it serializes cleanly
+                        list_page.append(dict(page.attrib))
+                        #image_name = page.get('Image')
+                        #print(f"extracted Image value: {image_name}")
+                        #image_size = page.get('ImageSize')
+                        #print(f"extracted ImageSize value: {image_size}")
+                        #key_value = page.get('Key')
+                        #print(f"extracted Key value: {key_value}")
+                        #image_width = page.get('ImageWidth')
+                        #print(f"extracted ImageWidth value: {image_width}")
+                        #image_height = page.get('ImageHeight')
+                        #print(f"extracted ImageHeight value: {image_height}")
+            except Exception as e:
+                raise SystemExit(f"failed to read the <Page> tag values from ComicInfo.xml: {zip_file}, error: {str(e)}")
+        # data["list_page"] = list_page
+        data["list_hash"] = self.generate_xxhash(list_page)
+        return data
+
+    def generate_xxhash(self, data: Any) -> str:
+        """
+        Generate a fast hash with xxhash.
+
+        Highlights:
+        - 2-5x faster than MD5
+        - produces a 64- or 128-bit hash
+        """
+        # Create the hash object
+        hasher = xxhash.xxh64()
+
+        # Serialize with JSON; sort_keys keeps the digest independent of key order
+        serialized = json.dumps(data, sort_keys=True, ensure_ascii=False, default=str)
+
+        # Feed the hash
+        hasher.update(serialized.encode('utf-8'))
+
+        # Return the hex digest
+        return hasher.hexdigest()
+
+    def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
+        """
+        Extract the file names that share a duplicate list_hash.
+
+        Args:
+            data: a list of dicts, each with 'file' and 'list_hash' keys
+
+        Returns:
+            dict: {duplicated list_hash: [list of duplicate file names]}
+        """
+        # Step 1: create the hash map
+        hash_map = defaultdict(list)
+
+        # Step 2: fill the map (O(n) time)
+        for item in data:
+            file_name = item['file']
+            list_hash = item['list_hash']
+            hash_map[list_hash].append(file_name)
+
+        # Step 3: keep only the duplicated entries (O(m) time, m = number of unique hashes)
+        duplicates = {
+            hash_val: files
+            for hash_val, files in hash_map.items()
+            if len(files) > 1
+        }
+
+        return duplicates
+
+if __name__ == "__main1__":
     # 清除3KB以下CBZ文件
     # comicInfo().update_comicinfo_cbz("")
     #cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
@@ -309,4 +397,30 @@ if __name__ == "__main__":
         comicInfo().ver_comicinfo_xml(file)
         #if size < 3000:
         #    os.remove(file)
-        #    print(f"已删除{file}")
\ No newline at end of file
+        #    print(f"已删除{file}")
+if __name__ == "__main__":
+    # Batch-delete duplicate image chapters under each comic directory
+    # comicInfo()._comic_info_xml_pages("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第36话 36.CBZ")
+    dir_path = "CBZ/rm_comic"
+    #dir_path = "/mnt/Comics/CBZ/rm_comic"
+    for sub_dir in os.listdir(dir_path):
+        c_dir = os.path.join(dir_path, sub_dir)
+        if os.path.isdir(c_dir):
+            comic_pages = []
+            files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
+            for file in files:
+                page_data = comicInfo()._comic_info_xml_pages(file)
+                comic_pages.append(page_data)
+                #print(page_data)
+            # finished reading one comic
+            #print(comic_pages)
+            duplicates = comicInfo().extract_duplicate_files(comic_pages)
+            for hash_val, delete_files in duplicates.items():
+                # Delete the duplicates but keep the first file in each group
+                for file_path in delete_files[1:]:
+                    try:
+                        # os.remove(file_path)
+                        print(f"Deleted: {file_path}")
+                    except Exception as e:
+                        print(f"Failed to delete {file_path}: {e}")
+
\ No newline at end of file