添加test
This commit is contained in:
parent
b02850681f
commit
4874600a07
118
test.py
118
test.py
@ -2,7 +2,8 @@ from src.common.naming import FileNaming
|
|||||||
from src.common.ComicInfo import ImageInfo
|
from src.common.ComicInfo import ImageInfo
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import time, shutil
|
import time, shutil,re, xxhash, json
|
||||||
|
from typing import Any
|
||||||
import os,hashlib
|
import os,hashlib
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -10,6 +11,9 @@ from io import BytesIO
|
|||||||
from tempfile import NamedTemporaryFile
|
from tempfile import NamedTemporaryFile
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from src.common.ComicInfo import ComicInfoXml
|
from src.common.ComicInfo import ComicInfoXml
|
||||||
|
from lxml import etree
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import List, Dict, Tuple
|
||||||
|
|
||||||
class test:
|
class test:
|
||||||
|
|
||||||
@ -284,7 +288,91 @@ class comicInfo:
|
|||||||
except:
|
except:
|
||||||
raise exit(f"ver_comicinfo_xml 错误")
|
raise exit(f"ver_comicinfo_xml 错误")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
|
def _comic_info_xml_pages(self, zip_file):
    """Collect the ``<Pages>/<Page>`` attributes from a CBZ's ComicInfo.xml.

    Opens *zip_file* (a .CBZ / zip archive path), parses the embedded
    ``ComicInfo.xml`` and gathers the attribute dict of every
    ``<Pages>/<Page>`` element, then hashes that list so archives with
    identical page metadata can later be detected as duplicates.

    Args:
        zip_file: path to the CBZ archive (anything ``str()``-able).

    Returns:
        dict with keys ``"file"`` (the archive path as a string) and
        ``"list_hash"`` (xxhash digest of the page-attribute list).

    Raises:
        SystemExit: when the archive has no readable ComicInfo.xml or
            the XML cannot be parsed.
    """
    data = {"file": str(zip_file)}
    list_page = []
    with ZipFile(str(zip_file), 'r') as z:
        try:
            with z.open('ComicInfo.xml', 'r') as file:
                # lxml accepts raw bytes directly; the previous
                # decode()/encode() round-trip was wasted work and can
                # fail on XML carrying an explicit encoding declaration.
                root = etree.fromstring(file.read())
                # Collect the attribute mapping of every Page element.
                for page in root.findall('Pages/Page'):
                    list_page.append(page.attrib)
        except Exception as e:
            # Was `raise exit(...)`: the site helper exit() raises
            # SystemExit itself, so the `raise` never executed. Raise
            # SystemExit directly and chain the cause for the traceback.
            raise SystemExit(f"获取 ComicInfo.xml 文件中的 <PageCount> 标签值失败: {zip_file},错误: {str(e)}") from e
    data["list_hash"] = self.generate_xxhash(list_page)
    return data
||||||
|
|
||||||
|
def generate_xxhash(self, data: Any) -> str:
    """Return a fast, deterministic xxhash64 hex digest of *data*.

    Characteristics:
    - 2-5x faster than MD5 (non-cryptographic).
    - Produces a 64-bit digest rendered as hex.

    Args:
        data: any JSON-representable structure (non-JSON values are
            stringified via ``default=str``).

    Returns:
        16-character hexadecimal xxh64 digest.
    """
    hasher = xxhash.xxh64()
    # Serialize the structure itself rather than str(data):
    # sort_keys only canonicalizes real JSON objects, so the previous
    # json.dumps(str(data), sort_keys=True) made sort_keys a no-op and
    # let dict key order leak into the digest. default=str keeps
    # non-JSON types (paths, attrib maps) hashable.
    serialized = json.dumps(data, sort_keys=True, ensure_ascii=False, default=str)
    hasher.update(serialized.encode('utf-8'))
    return hasher.hexdigest()
|
||||||
|
|
||||||
|
def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
    """Group file names that share an identical ``list_hash``.

    Args:
        data: records, each a dict carrying 'file' and 'list_hash' keys.

    Returns:
        Mapping of each duplicated hash to the list of file names that
        produced it; hashes seen only once are omitted.
    """
    # Bucket file names by their hash in a single O(n) pass.
    grouped: Dict[str, List[str]] = defaultdict(list)
    for record in data:
        grouped[record['list_hash']].append(record['file'])

    # Keep only the buckets holding more than one file (O(m) over
    # unique hashes).
    return {
        digest: names
        for digest, names in grouped.items()
        if len(names) > 1
    }
|
||||||
|
|
||||||
|
if __name__ == "__main1__":
|
||||||
# 清除3KB以下CBZ文件
|
# 清除3KB以下CBZ文件
|
||||||
# comicInfo().update_comicinfo_cbz("")
|
# comicInfo().update_comicinfo_cbz("")
|
||||||
#cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
|
#cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ"
|
||||||
@ -310,3 +398,29 @@ if __name__ == "__main__":
|
|||||||
#if size < 3000:
|
#if size < 3000:
|
||||||
# os.remove(file)
|
# os.remove(file)
|
||||||
# print(f"已删除{file}")
|
# print(f"已删除{file}")
|
||||||
|
if __name__ == "__main__":
    # Batch-remove duplicate chapters: CBZ files whose ComicInfo.xml page
    # metadata hashes identically within the same comic directory.
    dir_path = "CBZ/rm_comic"
    #dir_path = "/mnt/Comics/CBZ/rm_comic"
    info = comicInfo()
    for entry in os.listdir(dir_path):  # renamed from `dir` (shadowed builtin)
        c_dir = os.path.join(dir_path, entry)
        if not os.path.isdir(c_dir):
            continue
        # Hash every CBZ in this comic's directory.
        comic_pages = []
        for cbz_path in FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']):
            comic_pages.append(info._comic_info_xml_pages(cbz_path))
        # One comic fully scanned: find hash groups with more than one file.
        duplicates = info.extract_duplicate_files(comic_pages)
        for hash_val, dup_files in duplicates.items():
            # Keep the first copy and remove only the remaining duplicates.
            # The original looped over the whole group, which would have
            # deleted EVERY copy once os.remove is re-enabled.
            for file_path in dup_files[1:]:
                try:
                    # os.remove(file_path)  # dry run: uncomment to delete
                    print(f"已删除: {file_path}")
                except Exception as e:
                    print(f"删除失败 {file_path}: {e}")
||||||
Loading…
Reference in New Issue
Block a user