添加test
This commit is contained in:
parent
b02850681f
commit
4874600a07
118
test.py
118
test.py
@ -2,7 +2,8 @@ from src.common.naming import FileNaming
|
|||||||
from src.common.ComicInfo import ImageInfo
|
from src.common.ComicInfo import ImageInfo
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import time, shutil
|
import time, shutil,re, xxhash, json
|
||||||
|
from typing import Any
|
||||||
import os,hashlib
|
import os,hashlib
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -10,6 +11,9 @@ from io import BytesIO
|
|||||||
from tempfile import NamedTemporaryFile
|
from tempfile import NamedTemporaryFile
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from src.common.ComicInfo import ComicInfoXml
|
from src.common.ComicInfo import ComicInfoXml
|
||||||
|
from lxml import etree
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import List, Dict, Tuple
|
||||||
|
|
||||||
class test:
|
class test:
|
||||||
|
|
||||||
@ -284,7 +288,91 @@ class comicInfo:
|
|||||||
except:
|
except:
|
||||||
raise exit(f"ver_comicinfo_xml 错误")
|
raise exit(f"ver_comicinfo_xml 错误")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
|
def _comic_info_xml_pages(self, zip_file):
    """Collect the ``<Pages>/<Page>`` attributes from a CBZ's ComicInfo.xml.

    Opens *zip_file* (a .CBZ / zip archive path), parses the embedded
    ``ComicInfo.xml`` and gathers the attribute dict of every
    ``<Pages>/<Page>`` element, then hashes that list so archives with
    identical page metadata can later be detected as duplicates.

    Args:
        zip_file: path to the CBZ archive (anything ``str()``-able).

    Returns:
        dict with keys ``"file"`` (the archive path as a string) and
        ``"list_hash"`` (xxhash digest of the page-attribute list).

    Raises:
        SystemExit: when the archive has no readable ComicInfo.xml or
            the XML cannot be parsed.
    """
    data = {"file": str(zip_file)}
    list_page = []
    with ZipFile(str(zip_file), 'r') as z:
        try:
            with z.open('ComicInfo.xml', 'r') as file:
                # lxml accepts raw bytes directly; the previous
                # decode()/encode() round-trip was wasted work and can
                # fail on XML carrying an explicit encoding declaration.
                root = etree.fromstring(file.read())
                # Collect the attribute mapping of every Page element.
                for page in root.findall('Pages/Page'):
                    list_page.append(page.attrib)
        except Exception as e:
            # Was `raise exit(...)`: the site helper exit() raises
            # SystemExit itself, so the `raise` never executed. Raise
            # SystemExit directly and chain the cause for the traceback.
            raise SystemExit(f"获取 ComicInfo.xml 文件中的 <PageCount> 标签值失败: {zip_file},错误: {str(e)}") from e
    data["list_hash"] = self.generate_xxhash(list_page)
    return data
||||||
|
|
||||||
|
def generate_xxhash(self, data: Any) -> str:
    """Return a fast, deterministic xxhash64 hex digest of *data*.

    Characteristics:
    - 2-5x faster than MD5 (non-cryptographic).
    - Produces a 64-bit digest rendered as hex.

    Args:
        data: any JSON-representable structure (non-JSON values are
            stringified via ``default=str``).

    Returns:
        16-character hexadecimal xxh64 digest.
    """
    hasher = xxhash.xxh64()
    # Serialize the structure itself rather than str(data):
    # sort_keys only canonicalizes real JSON objects, so the previous
    # json.dumps(str(data), sort_keys=True) made sort_keys a no-op and
    # let dict key order leak into the digest. default=str keeps
    # non-JSON types (paths, attrib maps) hashable.
    serialized = json.dumps(data, sort_keys=True, ensure_ascii=False, default=str)
    hasher.update(serialized.encode('utf-8'))
    return hasher.hexdigest()
|
||||||
|
|
||||||
|
def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
    """Group file names that share an identical ``list_hash``.

    Args:
        data: records, each a dict carrying 'file' and 'list_hash' keys.

    Returns:
        Mapping of each duplicated hash to the list of file names that
        produced it; hashes seen only once are omitted.
    """
    # Bucket file names by their hash in a single O(n) pass.
    grouped: Dict[str, List[str]] = defaultdict(list)
    for record in data:
        grouped[record['list_hash']].append(record['file'])

    # Keep only the buckets holding more than one file (O(m) over
    # unique hashes).
    return {
        digest: names
        for digest, names in grouped.items()
        if len(names) > 1
    }
|
||||||
|
|
||||||
|
if __name__ == "__main1__":
|
||||||
# 清除3KB以下CBZ文件
|
# 清除3KB以下CBZ文件
|
||||||
# comicInfo().update_comicinfo_cbz("")
|
# comicInfo().update_comicinfo_cbz("")
|
||||||
#cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
|
#cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
|
||||||
@ -310,3 +398,29 @@ if __name__ == "__main__":
|
|||||||
#if size < 3000:
|
#if size < 3000:
|
||||||
# os.remove(file)
|
# os.remove(file)
|
||||||
# print(f"已删除{file}")
|
# print(f"已删除{file}")
|
||||||
|
if __name__ == "__main__":
    # Batch-remove duplicate chapters: CBZ files whose ComicInfo.xml page
    # metadata hashes identically within the same comic directory.
    dir_path = "CBZ/rm_comic"
    #dir_path = "/mnt/Comics/CBZ/rm_comic"
    info = comicInfo()
    for entry in os.listdir(dir_path):  # renamed from `dir` (shadowed builtin)
        c_dir = os.path.join(dir_path, entry)
        if not os.path.isdir(c_dir):
            continue
        # Hash every CBZ in this comic's directory.
        comic_pages = []
        for cbz_path in FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']):
            comic_pages.append(info._comic_info_xml_pages(cbz_path))
        # One comic fully scanned: find hash groups with more than one file.
        duplicates = info.extract_duplicate_files(comic_pages)
        for hash_val, dup_files in duplicates.items():
            # Keep the first copy and remove only the remaining duplicates.
            # The original looped over the whole group, which would have
            # deleted EVERY copy once os.remove is re-enabled.
            for file_path in dup_files[1:]:
                try:
                    # os.remove(file_path)  # dry run: uncomment to delete
                    print(f"已删除: {file_path}")
                except Exception as e:
                    print(f"删除失败 {file_path}: {e}")
||||||
Loading…
Reference in New Issue
Block a user