Add test

caiwx86 2025-07-08 13:06:54 +08:00
parent b02850681f
commit 4874600a07

test.py (118 lines changed)

@@ -2,7 +2,8 @@ from src.common.naming import FileNaming
from src.common.ComicInfo import ImageInfo
from zipfile import ZipFile
from datetime import datetime
import time, shutil
import time, shutil, re, xxhash, json
from typing import Any
import os,hashlib
import xml.etree.ElementTree as ET
from PIL import Image
@@ -10,6 +11,9 @@ from io import BytesIO
from tempfile import NamedTemporaryFile
from xml.dom import minidom
from src.common.ComicInfo import ComicInfoXml
from lxml import etree
from collections import defaultdict
from typing import List, Dict, Tuple
class test:
@@ -284,7 +288,91 @@ class comicInfo:
        except:
            raise SystemExit("ver_comicinfo_xml error")
if __name__ == "__main__":
    def _comic_info_xml_pages(self, zip_file):
        """Collect the attributes of every <Page> element in ComicInfo.xml."""
        # data = {"file": os.path.basename(zip_file), "path": str(zip_file)}
        # data = {"file": os.path.basename(zip_file)}
        data = {"file": str(zip_file)}
        list_page = []
        # Open the ZIP archive
        with ZipFile(str(zip_file), 'r') as z:
            try:
                # Read ComicInfo.xml from the archive
                with z.open('ComicInfo.xml', 'r') as file:
                    # Parse the XML data from the file stream
                    file_string = file.read().decode("utf-8")
                    # lxml wants bytes when the document carries an encoding declaration
                    root = etree.fromstring(file_string.encode())
                    # Iterate over all Page elements (there may be many)
                    for page in root.findall('Pages/Page'):
                        # Copy into a plain dict so the attributes stay JSON-serializable
                        page_attrib = dict(page.attrib)
                        list_page.append(page_attrib)
                        # Individual values are available via page.get('Image'),
                        # page.get('ImageSize'), page.get('Key'),
                        # page.get('ImageWidth') and page.get('ImageHeight')
            except Exception as e:
                raise SystemExit(f"Failed to read the <Page> entries of ComicInfo.xml: {zip_file}, error: {str(e)}")
        # data["list_page"] = list_page
        data["list_hash"] = self.generate_xxhash(list_page)
        return data
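    # The returned record feeds extract_duplicate_files below; for one CBZ it
    # looks roughly like this (path and digest are illustrative only):
    #
    #   {"file": "CBZ/rm_comic/some_comic/ch1.CBZ", "list_hash": "9c5ad0c1a2b3d4e5"}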
    def generate_xxhash(self, data: Any) -> str:
        """
        Generate a faster hash value with xxhash.
        Features:
        - roughly 2-5x faster than MD5
        - produces a 64-bit or 128-bit hash
        """
        # Create the hash object
        hasher = xxhash.xxh64()
        # JSON-serialize the data; sort_keys makes the digest independent of
        # dict ordering, and default=str covers non-JSON-native values
        serialized = json.dumps(data, sort_keys=True, ensure_ascii=False, default=str)
        # Feed the serialized bytes into the hasher
        hasher.update(serialized.encode('utf-8'))
        # Return the hexadecimal digest
        return hasher.hexdigest()
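    # A minimal sketch of why this works for duplicate detection (assuming the
    # xxhash package is installed): identical page lists serialize to the same
    # JSON string and therefore yield the same xxh64 digest.
    #
    #   a = [{"Image": "0", "ImageSize": "1234"}]
    #   b = [{"Image": "0", "ImageSize": "1234"}]
    #   assert comicInfo().generate_xxhash(a) == comicInfo().generate_xxhash(b)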
    def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
        """
        Extract the file names that share a duplicate list_hash.
        Args:
            data: a list of dicts, each with 'file' and 'list_hash' keys
        Returns:
            dict: {duplicated list_hash: [list of duplicated file names]}
        """
        # Step 1: create the hash lookup table
        hash_map = defaultdict(list)
        # Step 2: fill the table (O(n) time, n = number of records)
        for item in data:
            file_name = item['file']
            list_hash = item['list_hash']
            hash_map[list_hash].append(file_name)
        # Step 3: keep only the duplicates (O(m) time, m = number of unique hashes)
        duplicates = {
            hash_val: files
            for hash_val, files in hash_map.items()
            if len(files) > 1
        }
        return duplicates
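    # A small worked example of the grouping above (file names are hypothetical):
    #
    #   data = [
    #       {"file": "ch1.CBZ", "list_hash": "aaa"},
    #       {"file": "ch2.CBZ", "list_hash": "aaa"},
    #       {"file": "ch3.CBZ", "list_hash": "bbb"},
    #   ]
    #   comicInfo().extract_duplicate_files(data)
    #   # -> {'aaa': ['ch1.CBZ', 'ch2.CBZ']}  ('bbb' is dropped as unique)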
if __name__ == "__main1__":
# 清除3KB以下CBZ文件
# comicInfo().update_comicinfo_cbz("")
#cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
@@ -310,3 +398,29 @@ if __name__ == "__main__":
        #if size < 3000:
        #    os.remove(file)
        #    print(f"Deleted {file}")
if __name__ == "__main__":
# 批量删除漫画下的重复图片章节
# comicInfo()._comic_info_xml_pages("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第36话 36.CBZ")
dir_path = "CBZ/rm_comic"
#dir_path = "/mnt/Comics/CBZ/rm_comic"
for dir in os.listdir(dir_path):
c_dir = os.path.join(dir_path, dir)
if os.path.isdir(c_dir):
comic_pages = []
files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
for file in files:
page_data = comicInfo()._comic_info_xml_pages(file)
comic_pages.append(page_data)
#print(page_data)
# 一本漫画读取完毕
#print(comic_pages)
duplicates = comicInfo().extract_duplicate_files(comic_pages)
for hash_val, delete_files in duplicates.items():
# 删除重复文件
for file_path in delete_files:
try:
# os.remove(file_path)
print(f"已删除: {file_path}")
except Exception as e:
print(f"删除失败 {file_path}: {e}")