添加test
This commit is contained in:
parent
b02850681f
commit
4874600a07
124
test.py
124
test.py
@ -2,7 +2,8 @@ from src.common.naming import FileNaming
|
||||
from src.common.ComicInfo import ImageInfo
|
||||
from zipfile import ZipFile
|
||||
from datetime import datetime
|
||||
import time, shutil
|
||||
import time, shutil,re, xxhash, json
|
||||
from typing import Any
|
||||
import os,hashlib
|
||||
import xml.etree.ElementTree as ET
|
||||
from PIL import Image
|
||||
@ -10,7 +11,10 @@ from io import BytesIO
|
||||
from tempfile import NamedTemporaryFile
|
||||
from xml.dom import minidom
|
||||
from src.common.ComicInfo import ComicInfoXml
|
||||
|
||||
from lxml import etree
|
||||
from collections import defaultdict
|
||||
from typing import List, Dict, Tuple
|
||||
|
||||
class test:
|
||||
|
||||
def clean_min_cbz(self):
|
||||
@ -283,8 +287,92 @@ class comicInfo:
|
||||
#os.remove(cbz_path)
|
||||
except:
|
||||
raise exit(f"ver_comicinfo_xml 错误")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
def _comic_info_xml_pages(self, zip_file):
    """Read ComicInfo.xml inside a CBZ (zip) archive and fingerprint its page list.

    Parameters:
        zip_file: path to a .CBZ archive expected to contain a ComicInfo.xml.

    Returns:
        dict with:
            "file"      - the archive path as a string
            "list_hash" - digest of the <Pages>/<Page> attribute dicts,
                          used later to detect duplicate chapters.

    Raises:
        SystemExit: when the archive has no ComicInfo.xml or it cannot be parsed.
    """
    data = {"file": str(zip_file)}
    list_page = []
    with ZipFile(str(zip_file), 'r') as z:
        try:
            with z.open('ComicInfo.xml', 'r') as file:
                file_string = file.read().decode("utf-8")
                # lxml needs bytes when the document carries an XML declaration.
                root = etree.fromstring(file_string.encode())
                # Collect the raw attribute dict of every <Page> element
                # (Image, ImageSize, ImageWidth, ImageHeight, Key, ...).
                for page in root.findall('Pages/Page'):
                    list_page.append(page.attrib)
        except Exception as e:
            # Original used `raise exit(...)`: exit() already raises
            # SystemExit, so that outer `raise` was dead code. Raise
            # SystemExit explicitly and chain the original cause.
            raise SystemExit(
                f"获取 ComicInfo.xml 文件中的 <PageCount> 标签值失败: {zip_file},错误: {str(e)}"
            ) from e
    data["list_hash"] = self.generate_xxhash(list_page)
    return data
|
||||
|
||||
def generate_xxhash(self, data: Any) -> str:
    """Return a fast 64-bit xxhash hex digest for *data*.

    Notes:
        - xxh64 is typically 2-5x faster than MD5 and yields a 64-bit hash.
        - *data* is stringified before JSON encoding, so the digest depends
          only on str(data); `sort_keys` is therefore a no-op here, but it
          is kept to preserve the exact serialized form (and digest).
    """
    payload = json.dumps(str(data), sort_keys=True, ensure_ascii=False)
    hasher = xxhash.xxh64()
    hasher.update(payload.encode('utf-8'))
    return hasher.hexdigest()
|
||||
|
||||
def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
    """Group file names that share the same 'list_hash'.

    Args:
        data: list of dicts, each carrying 'file' and 'list_hash' keys.

    Returns:
        {list_hash: [file, ...]} restricted to hashes that occur more
        than once (i.e. actual duplicates).
    """
    # Bucket file names by their page-list hash in a single O(n) pass.
    grouped: Dict[str, List[str]] = defaultdict(list)
    for record in data:
        grouped[record['list_hash']].append(record['file'])

    # Keep only the buckets that actually collide.
    return {digest: names for digest, names in grouped.items() if len(names) > 1}
|
||||
|
||||
if __name__ == "__main1__":
|
||||
# 清除3KB以下CBZ文件
|
||||
# comicInfo().update_comicinfo_cbz("")
|
||||
#cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
|
||||
@ -309,4 +397,30 @@ if __name__ == "__main__":
|
||||
comicInfo().ver_comicinfo_xml(file)
|
||||
#if size < 3000:
|
||||
# os.remove(file)
|
||||
# print(f"已删除{file}")
|
||||
# print(f"已删除{file}")
|
||||
if __name__ == "__main__":
    # Batch-remove duplicate image chapters under each comic directory.
    # comicInfo()._comic_info_xml_pages("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第36话 36.CBZ")
    dir_path = "CBZ/rm_comic"
    #dir_path = "/mnt/Comics/CBZ/rm_comic"
    # NOTE(review): loop variable `dir` shadows the builtin of the same name.
    for dir in os.listdir(dir_path):
        c_dir = os.path.join(dir_path, dir)
        if os.path.isdir(c_dir):
            comic_pages = []
            files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
            # Fingerprint every CBZ chapter of this comic.
            for file in files:
                page_data = comicInfo()._comic_info_xml_pages(file)
                comic_pages.append(page_data)
                #print(page_data)
            # One comic fully scanned.
            #print(comic_pages)
            duplicates = comicInfo().extract_duplicate_files(comic_pages)
            for hash_val, delete_files in duplicates.items():
                # Delete the duplicate files. os.remove is commented out, so
                # this currently only prints (dry run).
                # NOTE(review): this iterates EVERY file in a duplicate bucket,
                # including the first one — no "original" is kept. Confirm
                # that deleting all copies is intended before re-enabling
                # os.remove.
                for file_path in delete_files:
                    try:
                        # os.remove(file_path)
                        print(f"已删除: {file_path}")
                    except Exception as e:
                        print(f"删除失败 {file_path}: {e}")
||||
|
||||
Loading…
Reference in New Issue
Block a user