285 lines
12 KiB
Python
285 lines
12 KiB
Python
from src.common.naming import FileNaming
|
||
from src.common.ComicInfo import ImageInfo
|
||
from zipfile import ZipFile
|
||
from datetime import datetime
|
||
import time
|
||
import os,hashlib
|
||
import xml.etree.ElementTree as ET
|
||
from PIL import Image
|
||
from io import BytesIO
|
||
from tempfile import NamedTemporaryFile
|
||
from xml.dom import minidom
|
||
from src.common.ComicInfo import ComicInfoXml
|
||
|
||
class test:
    """Maintenance helpers that scan a comics root directory for CBZ archives.

    The root directory is expected to contain one sub-directory per comic;
    only those sub-directories are searched for ``.CBZ`` files.
    """

    def clean_min_cbz(self, dir_path="/mnt/Comics/CBZ/rm_comic", min_size=3000):
        """Delete CBZ files that are smaller than ``min_size`` bytes.

        Args:
            dir_path: Root directory whose immediate sub-directories are scanned.
            min_size: Size threshold in bytes; files strictly smaller are removed.
        """
        for entry in os.listdir(dir_path):
            comic_dir = os.path.join(dir_path, entry)
            if not os.path.isdir(comic_dir):
                continue
            # Materialise the listing first: files are deleted while we iterate.
            cbz_files = list(FileNaming.get_filenames_optimized(comic_dir, ext_filter=['.CBZ']))
            for cbz_file in cbz_files:
                if os.path.getsize(cbz_file) < min_size:
                    os.remove(cbz_file)
                    print(f"已删除{cbz_file}")

    def _clean_old_cbz(self, cbz_path, max_age_days=7):
        """Report a CBZ whose entries are much older than the archive itself.

        The archive's modification date is compared with the date stored for
        each zip entry. If any entry is more than ``max_age_days`` days older
        than the archive, the archive is reported. Deletion is intentionally
        not performed here; the method only prints the candidate.

        Args:
            cbz_path: Path of the CBZ archive to inspect.
            max_age_days: Maximum allowed gap, in days, between an entry's
                date and the archive's modification date.
        """
        archive_date = datetime.fromtimestamp(os.path.getmtime(cbz_path)).date()

        with ZipFile(cbz_path, 'r') as zip_ref:
            # Use real date arithmetic: subtracting YYYYMMDD integers gives
            # nonsense across month/year boundaries (20250201 - 20250131 == 70).
            has_old_entry = any(
                (archive_date - datetime(*info.date_time).date()).days > max_age_days
                for info in zip_ref.infolist()
            )

        if has_old_entry:
            print(f"remove cbz {cbz_path}")

    def clean_old_cbz(self, dir_path="/mnt/Comics/CBZ/rm_comic"):
        """Run :meth:`_clean_old_cbz` on every CBZ below ``dir_path``.

        Args:
            dir_path: Root directory whose immediate sub-directories are scanned.
        """
        for entry in os.listdir(dir_path):
            comic_dir = os.path.join(dir_path, entry)
            if not os.path.isdir(comic_dir):
                continue
            for cbz_file in FileNaming.get_filenames_optimized(comic_dir, ext_filter=['.CBZ']):
                self._clean_old_cbz(cbz_file)
|
||
|
||
class comicInfo:
    """Read, regenerate and rewrite the ``ComicInfo.xml`` stored in CBZ archives."""

    # Metadata elements copied over from the existing ComicInfo.xml.
    METADATA_FIELDS = (
        "Title", "Series", "Number", "Summary", "Writer",
        "Genre", "Tags", "PageCount", "AgeRating",
    )

    def find_actual_path(self, zip_ref, target_path):
        """Return the archive member matching ``target_path`` case-insensitively.

        Args:
            zip_ref: An open ``ZipFile``.
            target_path: Member name to look for.

        Returns:
            The first member name that equals ``target_path`` ignoring case,
            or ``None`` when there is no such member.
        """
        target_lower = target_path.lower()
        for name in zip_ref.namelist():
            if name.lower() == target_lower:
                return name
        return None

    def _describe_image(self, cbz, actual_path):
        """Build the page record (name, size, MD5 key, dimensions) for one image."""
        content = cbz.read(actual_path)

        # Dimensions are best-effort: an unreadable image keeps None for both.
        img_width, img_height = None, None
        try:
            with Image.open(BytesIO(content)) as img:
                img_width, img_height = img.size
        except Exception as e:
            print(f"无法读取图片尺寸:{actual_path},错误:{e}")

        return {
            # splitext keeps dots inside the stem ("a.b.jpg" -> "a.b").
            "name": os.path.splitext(os.path.basename(actual_path))[0],
            "size": len(content),
            "key": hashlib.md5(content).hexdigest(),
            "width": img_width,
            "height": img_height,
        }

    def process_cbz(self, cbz_path):
        """Extract metadata and per-page image information from a CBZ.

        Args:
            cbz_path: Path of the CBZ archive.

        Returns:
            ``None`` when the archive has no ``ComicInfo.xml``; otherwise a
            dict ``{"metadata": {...}, "pages": [...]}``. ``"pages"`` is
            ``None`` when the XML has no ``Pages`` element or no ``Page``
            children. Each page entry has the keys ``name``, ``size``,
            ``key`` (MD5 hex digest), ``width`` and ``height``.

        Raises:
            SystemExit: On any error while reading the archive (the error is
                printed first).
        """
        try:
            with ZipFile(cbz_path, 'r') as cbz:
                member_names = cbz.namelist()
                xml_file_name = next(
                    (name for name in member_names if name.lower() == 'comicinfo.xml'),
                    None,
                )
                if xml_file_name is None:
                    print("未找到 ComicInfo.xml")
                    return None

                # Parse the document once and reuse the tree for both parts.
                root = ET.fromstring(cbz.read(xml_file_name).decode('utf-8'))

                # Part 1: scalar metadata fields.
                metadata = {}
                for field in self.METADATA_FIELDS:
                    element = root.find(field)
                    metadata[field] = element.text if element is not None else None

                # Part 2: image information referenced by the Page elements.
                pages = root.find('Pages')
                if pages is None:
                    print("XML 中缺少 Pages 标签")
                    return {"metadata": metadata, "pages": None}

                page_list = pages.findall('Page')
                if not page_list:
                    print("Pages 标签下无 Page 元素")
                    return {"metadata": metadata, "pages": None}

                # Lowercase lookup table built once; setdefault keeps the
                # first member on a case-insensitive collision.
                actual_names = {}
                for name in member_names:
                    actual_names.setdefault(name.lower(), name)

                pages_info = []
                for page in page_list:
                    img_path = page.get('Image')
                    if not img_path:
                        continue
                    # The Image attribute stores the file stem; pages are JPEGs.
                    actual_path = actual_names.get((img_path + ".jpg").lower())
                    if not actual_path:
                        print(f"警告:图片 '{img_path}' 不存在于压缩包中")
                        continue
                    pages_info.append(self._describe_image(cbz, actual_path))

                return {
                    "metadata": metadata,
                    "pages": pages_info
                }
        except Exception as e:
            print(f"处理 CBZ 文件时出错: {e}")
            # Raise SystemExit directly instead of relying on the site-provided
            # ``exit`` helper; the exit message is unchanged.
            raise SystemExit("处理CBZ出错") from e

    def generate_comic_info_xml(self, metadata, pages_info):
        """Render a pretty-printed ``ComicInfo.xml`` document.

        Args:
            metadata: Mapping of element name to value; ``None`` values are
                skipped.
            pages_info: Iterable of page dicts as produced by
                :meth:`process_cbz`, or a falsy value for no ``Pages`` element.

        Returns:
            The XML document as a ``str`` (with an XML declaration).
        """
        root = ET.Element('ComicInfo')
        root.set('xmlns:xsd', 'http://www.w3.org/2001/XMLSchema')
        root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')

        for field, value in (metadata or {}).items():
            if value is not None:
                ET.SubElement(root, field).text = str(value)

        if pages_info:
            pages_elem = ET.SubElement(root, "Pages")
            optional_attrs = (
                ("ImageSize", "size"),
                ("Key", "key"),
                ("ImageWidth", "width"),
                ("ImageHeight", "height"),
            )
            for page in pages_info:
                attrib = {"Image": page.get("name", "")}
                for attr_name, key in optional_attrs:
                    value = page.get(key, 0)
                    # Omit unknown values instead of writing the text "None",
                    # which is not a valid number for these attributes.
                    if value is not None:
                        attrib[attr_name] = str(value)
                ET.SubElement(pages_elem, "Page", attrib=attrib)

        xml_bytes = ET.tostring(root, encoding='utf-8', method='xml')
        formatted_xml = minidom.parseString(xml_bytes).toprettyxml(indent=" ", encoding="utf-8")
        return formatted_xml.decode('utf-8')

    def update_cbz_with_new_xml(self, cbz_path, new_xml_content, output_path=None):
        """Write a copy of the CBZ whose ``ComicInfo.xml`` is ``new_xml_content``.

        The new archive is built in a temporary file next to ``output_path``
        and moved into place only after it has been written completely, so a
        failure never damages the existing archive.

        Args:
            cbz_path: Source CBZ archive.
            new_xml_content: New ``ComicInfo.xml`` content.
            output_path: Destination path; defaults to ``cbz_path`` (in-place
                update). When it differs from ``cbz_path`` the source archive
                is left untouched.

        Returns:
            ``True`` on success, ``False`` on failure (the error is printed).
        """
        if output_path is None:
            output_path = cbz_path

        # Sibling temp file: same filesystem as the destination, so the final
        # os.replace cannot fail with a cross-device error.
        tmp_path = f"{os.fspath(output_path)}.{os.getpid()}.tmp"
        try:
            with ZipFile(cbz_path, 'r') as source_zip, ZipFile(tmp_path, 'w') as new_zip:
                # Copy every member except the old XML.
                for item in source_zip.infolist():
                    if item.filename.lower() != 'comicinfo.xml':
                        new_zip.writestr(item, source_zip.read(item))
                new_zip.writestr("ComicInfo.xml", new_xml_content)

            os.replace(tmp_path, output_path)
            return True
        except Exception as e:
            print(f"更新 CBZ 文件失败: {e}")
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            return False

    def update_comicinfo_cbz(self, cbz_path):
        """Regenerate and replace the ``ComicInfo.xml`` inside a CBZ.

        The Writer field is normalised (``&`` and whitespace act as
        separators, duplicates are dropped, names are joined with ``,``)
        and page information is recomputed from the images in the archive.
        The generated XML is written to a scratch file and passed to the
        project's XSD validation helper before the archive is updated.

        Args:
            cbz_path: Path of the CBZ archive to update in place.

        Returns:
            ``True`` when the archive was rewritten, ``False`` when it has no
            ``ComicInfo.xml`` or the rewrite failed.
        """
        data = self.process_cbz(cbz_path)
        if data is None:
            # process_cbz already reported the missing ComicInfo.xml.
            return False

        metadata = data["metadata"]
        author = metadata.get("Writer")
        if author is not None:
            # dict.fromkeys de-duplicates while keeping the original order;
            # split() without arguments also drops empty tokens.
            unique_names = dict.fromkeys(str(author).replace("&", " ").split())
            metadata["Writer"] = FileNaming.chinese_file_name(",".join(unique_names))

        new_xml = self.generate_comic_info_xml(metadata, data["pages"])

        xml_file = "NewComicInfo.xml"
        try:
            # The validator works on a file path, so write a scratch copy.
            with open(xml_file, "w", encoding="utf-8") as f:
                f.write(new_xml)
            print(f"已生成 {xml_file}")
            ComicInfoXml()._validate_xml_with_xsd_file(xml_file=xml_file, xsd_file="src/assets/ComicInfo_2.1.xsd")
            return self.update_cbz_with_new_xml(cbz_path, new_xml)
        finally:
            # Always clean up the scratch file, even when validation raises.
            if os.path.exists(xml_file):
                os.remove(xml_file)
|
||
|
||
if __name__ == "__main__":
    # Batch job: refresh ComicInfo.xml in every CBZ whose ctime-derived date
    # is earlier than the cutoff below.
    # NOTE: os.path.getctime is the creation time on Windows but the last
    # metadata change time on Unix.
    # Alternative root used on the server: "/mnt/Comics/CBZ/rm_comic"
    dir_path = "CBZ/rm_comic"

    for entry in os.listdir(dir_path):
        c_dir = os.path.join(dir_path, entry)
        if not os.path.isdir(c_dir):
            continue

        cbz_files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
        for cbz_file in cbz_files:
            ctime_struct = time.localtime(os.path.getctime(cbz_file))
            stamp = time.strftime('%Y%m%d', ctime_struct)
            # Archives stamped on or after the cutoff are left alone.
            if int(stamp) >= 20250204:
                continue
            print(f"{cbz_file} 文件创建时间:", stamp)
            comicInfo().update_comicinfo_cbz(cbz_file)