From 3a45a11a651ae359ad52ab65995ec0af5e7412a2 Mon Sep 17 00:00:00 2001
From: caiwx86
Date: Fri, 11 Jul 2025 09:04:51 +0800
Subject: [PATCH] update

---
 src/common/ComicInfo.py        | 135 +++++++++++++++++++++++++++++----
 src/common/item.py             |  14 ++++
 src/common/utils.py            |  32 +++++++-
 src/sites/base.py              |  19 ++++-
 src/sites/configs/rouman.yml   |   3 +
 test.py                        | 122 ++++++++++++++++-------------
 tests/common/test_ComicInfo.py |  97 +++++++++++++++++++++++
 7 files changed, 352 insertions(+), 70 deletions(-)
 create mode 100644 tests/common/test_ComicInfo.py

diff --git a/src/common/ComicInfo.py b/src/common/ComicInfo.py
index fba47b6..1f524f1 100644
--- a/src/common/ComicInfo.py
+++ b/src/common/ComicInfo.py
@@ -1,20 +1,16 @@
+import os, re, requests, hashlib
 import xml.etree.ElementTree as ET
 from xml.dom import minidom
 from typing import List
-import os
 from lxml import etree
 from src.config import XSD_FILE
 from src.common.logging import setup_logging
-import logging
 from zipfile import ZipFile
 from pathlib import Path
-import re
-import requests
 from urllib.parse import urlparse
 from PIL import Image
-from concurrent.futures import ThreadPoolExecutor
-import hashlib
 from io import BytesIO
+from dataclasses import dataclass
 
 logger = setup_logging(__name__)
 
@@ -207,8 +203,12 @@ class ComicInfo:
     # ComicInfo.xml 中的选项
     def __init__(self):
+        self._init_default_values()
+
+    def _init_default_values(self):
+        """初始化默认值"""
         self.Title: str = ""
-        """标题"""
+        """
+        标题
""" self.Series: str = "" self.Number: str = "" self.Count: int = -1 @@ -245,11 +245,20 @@ class ComicInfo: self.StoryArc: str = "" self.SeriesGroup: str = "" self.AgeRating: str = "" - self.Pages: List[ComicPageInfo] = [] - + self.Pages: List[ComicPageInfo] = [] + + def __init__(self, **kwargs): + values = kwargs.values() + self._init_default_values() + if len(values) > 0: + for key, value in kwargs.items(): + if hasattr(self, key): + setattr(self, key, value) +@dataclass class ComicPageInfo: # ComicInfo.xml 中的 - def __init__(self): + + def _init_default_values(self): self.Image: int = -1 self.Type: str = "Story" self.DoublePage: bool = False @@ -257,8 +266,19 @@ class ComicPageInfo: self.Key: str = "" self.Bookmark: str = "" self.ImageWidth: int = -1 - self.ImageHeight: int = -1 + self.ImageHeight: int = -1 + def __init__(self): + self._init_default_values() + + def __init__(self, **kwargs): + values = kwargs.values() + self._init_default_values() + if len(values) > 0: + for key, value in kwargs.items(): + if hasattr(self, key): + setattr(self, key, value) + def toString(self): data = {} def add(key, value): @@ -314,7 +334,7 @@ class ComicInfoXml: logger.debug(f"zip_file={zip_file} PageCount: {page_count}") return page_count - def _parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"): + def _parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename=XSD_FILE) -> Path: """_summary_ Args: @@ -368,12 +388,13 @@ class ComicInfoXml: self._validate_xml_with_xsd_file(xml_filename, xsd_filename) # 将 JSON 转换为 XML #xml_data = json_to_xml_with_declaration(json_data) #print(xml_data) + return Path(xml_filename) def _required_attributes(self): """ 必需值,如果为空刚报错 """ - return ["Title", "Series", "Number", "PageCount", "Writer"] + return ["Title", "Series", "Number", "Writer", "PageCount", "Pages" ] def _gen_pageinfo(self, image_names, save_dir): """ 获取PageInfo数据 @@ -387,7 +408,92 @@ class ComicInfoXml: # 图像属性 文件名 大小 长 pages.append(page) return pages - + + def _xml_file_to_comicinfo(self, cbz_path=None, xml_file=None) -> ComicInfo: + """ 读取 CBZ 文件或 XML 文件中的 ComicInfo.xml 元数据,返回 ComicInfo 对象 + Args: + 以下参数任意一个都可以 + cbz_path (_type_, optional): 任选择CBZ文件路径或XML文件路径. Defaults to None. + xml_file (_type_, optional): XML文件路径. 
Defaults to None + Returns: + ci: returns a ComicInfo object with the updated page count + """ + + def xml_parse(xml_str) -> ComicInfo: + """ 解析 XML 字符串并提取指定字段的值 + + Args: + xml_str (_type_): xml文件内容字符串 + keys (_type_): 需要提取的字段列表 + + Returns: + ci: returns a ComicInfo object with the extracted values + """ + + import xmltodict + xml_dict = xmltodict.parse(xml_str) + # 解析 XML 元数据 + metadata = {} + # 获取comicinfo.xml 字段 + keys = ComicInfo().__dict__.keys() + for key in keys: + key_element = xml_dict.get("ComicInfo", {}).get(key, "") + if key == "Pages": + pages = [] + page_list_element = xml_dict.get("ComicInfo", {}).get("Pages", []).get("Page", "") + for page_element in page_list_element: + pages.append(ComicPageInfo(**{ "Image": page_element['@Image'], + "ImageSize": int(page_element['@ImageSize']), + "Key": page_element['@Key'], + "ImageWidth": int(page_element['@ImageWidth']), + "ImageHeight": int(page_element['@ImageHeight'])})) + key_element = pages + if key_element is not None: + metadata[key] = key_element if key_element else "" + else: + metadata[key] = "" + return ComicInfo(**metadata) + + def read_zip_file(zip_file_path): + """读取 ZIP 文件并返回其内容""" + try: + with ZipFile(zip_file_path, 'r') as zip_ref: + # 获取 ZIP 文件中的所有文件名 + file_list = zip_ref.namelist() + # 读取 ComicInfo.xml 文件内容 + if 'ComicInfo.xml' in file_list: + with zip_ref.open('ComicInfo.xml') as xml_file: + return xml_file.read().decode('utf-8') + else: + raise FileNotFoundError("ComicInfo.xml not found in the ZIP file.") + except Exception as e: + print(f"处理 CBZ 文件时出错: {e}") + raise exit(f"处理CBZ出错") + + if cbz_path is not None: + xml_content = read_zip_file(cbz_path) + elif xml_file is not None: + with open(xml_file, 'r', encoding='utf-8') as f: + xml_content = f.read() + else: + raise ValueError("请提供 cbz_path 或 xml_file 参数, 否则无法处理 XML 文件") + return xml_parse(xml_content) + + def update_comicinfo_count(self, count, cbz_path: Path) -> Path: + """ 更新 ComicInfo.xml 中的 PageCount 字段 + Args: + cbz_path (Path): CBZ 文件路径 + xml_filename (str, optional): XML 文件名. Defaults to "ComicInfo.xml". + xsd_filename (str, optional): XSD 文件名. Defaults to "ComicInfo.xsd". 
+ Returns: + pages: 返回更新后的页面信息列表 + """ + # 读取 ComicInfo.xml 文件 + comicinfo = self._xml_file_to_comicinfo(cbz_path=cbz_path) + comicinfo.Count = count + # 保存更新后的 ComicInfo.xml 文件 + return self._parse_comicinfo(comicinfo, save_dir=os.path.dirname(cbz_path)) + def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=XSD_FILE): """ 根据Json数据生成ComicInfo.xml """ @@ -395,6 +501,7 @@ class ComicInfoXml: comic.Title = json_data.get("chapter", "") comic.Series = json_data.get("name", "") comic.Writer = json_data.get("author", "") + comic.Count = json_data.get("count", -1) comic.AgeRating = json_data.get("age_rating", "") comic.Tags = json_data.get("tags", "") comic.Summary = json_data.get("description", "") diff --git a/src/common/item.py b/src/common/item.py index 40b4357..ec03fd2 100644 --- a/src/common/item.py +++ b/src/common/item.py @@ -122,6 +122,14 @@ class MangaInfo(BaseModel): list_value.append(val) return FileNaming.chinese_file_name(",".join(list_value)) + status: str + """漫画状态""" + @field_validator('status', mode='before') + def validate_status(cls, v): + if isinstance(v, str): + return FileNaming.chinese_file_name(v) + return v + description: Optional[str] = None """漫画描述""" @field_validator('description', mode='before') @@ -235,11 +243,17 @@ class MangaItem(BaseModel): filename_list = [] for image in cls.chapter_images: filename_list.append(image.filename) + count = -1 + if cls.info.status == "已完结" and len(cls.chapters) > 1: + # 本章节为最终章节刚添加Count字段 + if cls.number > 0 and cls.number == len(cls.chapters): + count = len(cls.chapters) return { "name": cls.info.title, "chapter": cls.chapter.title, "author": cls.info.author, + "count" : count, "tags": cls.info.tags, "images": filename_list, "description": cls.info.description, diff --git a/src/common/utils.py b/src/common/utils.py index b4c75d6..8c4cfeb 100644 --- a/src/common/utils.py +++ b/src/common/utils.py @@ -468,7 +468,37 @@ class CBZUtils: if old_img > 0: #os.remove(cbz_path) print(f"remove cbz {cbz_path}") - + + def update_cbz_with_new_xml(self, cbz_path, new_xml_content, output_path=None): + """将新生成的 ComicInfo.xml 更新到 CBZ 文件中""" + try: + # 默认输出路径为原文件路径(覆盖原文件) + if output_path is None: + output_path = cbz_path + + # 创建临时文件处理覆盖操作 + with NamedTemporaryFile(delete=False) as tmp: + tmp.close() + shutil.move(cbz_path, tmp.name) + # 读取原文件并替换 ComicInfo.xml + with ZipFile(tmp.name, 'r') as source_zip: + with ZipFile(output_path, 'w') as new_zip: + # 复制原文件(跳过旧 XML) + for item in source_zip.infolist(): + if item.filename.lower() != 'comicinfo.xml': + new_zip.writestr(item, source_zip.read(item.filename)) + + # 添加新 XML + new_zip.writestr("ComicInfo.xml", new_xml_content) + + os.remove(tmp.name) # 清理临时文件 + return True + except Exception as e: + print(f"更新 CBZ 文件失败: {e}") + if os.path.exists(tmp.name): + shutil.move(tmp.name, cbz_path) # 恢复备份 + raise exit(f"更新失败") + class ImageUtils: @classmethod diff --git a/src/sites/base.py b/src/sites/base.py index 583af79..7c0766c 100644 --- a/src/sites/base.py +++ b/src/sites/base.py @@ -11,7 +11,7 @@ from src.common.item import Chapter, MangaItem, MangaInfo,CoverItem from src.common.exceptions import SiteError, NetworkError, ParseError from src.common.logging import setup_logging from src.common.naming import DirectoryNaming,FileNaming -from src.common.ComicInfo import ComicInfo, ImageInfo +from src.common.ComicInfo import ComicInfo, ImageInfo, ComicInfoXml logger = setup_logging(__name__) @@ -88,10 +88,27 @@ class BaseSite(ABC): try: # result_type list[Chapter] list_chapter = 
manga_info.get_list_chapter() + # 临时添加begin + # 获取最新章节 + last_chapter = list_chapter[-1] if list_chapter else [] + # 临时添加end down_chapter = [] for chapter in list_chapter: cbz_path = FileNaming.chapter_cbz(manga_info=manga_info,chapter=chapter) if os.path.exists(cbz_path): + # 临时添加begin + if chapter.title == last_chapter.title and manga_info.status == "已完结": + # 如果是最新章节且漫画已完结,则不再下 + ci = ComicInfoXml()._xml_file_to_comicinfo(cbz_path=cbz_path) + if ci.Count == "": + # 生成ComicInfo.xml + xml_path = ComicInfoXml().update_comicinfo_count(count=len(list_chapter), cbz_path=cbz_path) + # 更新ComicInfo.xml至CBZ文件中 + CBZUtils().update_cbz_with_new_xml(cbz_path, xml_path.read_text(encoding="utf-8")) + # 更新完成后删除临时生成的ComicInfo.xml + xml_path.unlink() + logger.debug(f"更新 {cbz_path} 的 ComicInfo.xml Count完成") + # 临时添加end logger.debug(f"{chapter.title} 章节已存在") chapter.status = "downloaded" down_chapter.append(chapter) diff --git a/src/sites/configs/rouman.yml b/src/sites/configs/rouman.yml index e2c4d60..fbfc523 100644 --- a/src/sites/configs/rouman.yml +++ b/src/sites/configs/rouman.yml @@ -13,6 +13,9 @@ selectors: author: selector: '//div[@class="basis-3/5 text-sm sm:text-base"]//span[@class="text-foreground"]/text()' index: 0 + status: + selector: '//div[@class="basis-3/5 text-sm sm:text-base"]//span[@class="text-foreground"]/text()' + index: 1 description: selector: '//div[@class="my-2 text-foreground text-sm sm:text-base"]/p/text()' index: 1 diff --git a/test.py b/test.py index fd1004f..57c4a19 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,5 @@ from src.common.naming import FileNaming -from src.common.ComicInfo import ImageInfo +from src.common.ComicInfo import ImageInfo, ComicInfo as ci, ComicPageInfo from zipfile import ZipFile from datetime import datetime import time, shutil,re, xxhash, json @@ -288,6 +288,33 @@ class comicInfo: except: raise exit(f"ver_comicinfo_xml 错误") + def clear_cbz(self): + # 清除3KB以下CBZ文件 + # comicInfo().update_comicinfo_cbz("") + #cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ" + + dir_path = "CBZ/rm_comic" + #dir_path = "/mnt/Comics/CBZ/rm_comic" + for dir in os.listdir(dir_path): + c_dir = os.path.join(dir_path, dir) + if os.path.isdir(c_dir): + files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ'])) + for file in files: + # 获取文件的创建时间(仅在Linux/MacOS中可用) + # 修改时间 + create_time = time.localtime(os.utime(file)) # 注意:st_birthtime 在Linux/MacOS中可用,但不是所有系统都支持 + # 格式化时间 + formatted_time = time.strftime('%Y%m%d%H', create_time) + if int(formatted_time) > 2025020401: + print(f"{file} 文件创建时间:", formatted_time) + # 更新ComicInfoxml + # comicInfo().update_comicinfo_cbz(file) + # 检查CBZ是否存在ComicInfo.xml + comicInfo().ver_comicinfo_xml(file) + #if size < 3000: + # os.remove(file) + # print(f"已删除{file}") + def _comic_info_xml_pages(self, zip_file): """获取 ComicInfo.xml 文件中的 标签值""" @@ -324,7 +351,7 @@ class comicInfo: data["list_hash"] = self.generate_xxhash(list_page) return data - def generate_xxhash(self, data: Any) -> str: + def _generate_xxhash(self, data: Any) -> str: """ 使用 xxhash 生成更快的哈希值 @@ -344,7 +371,7 @@ class comicInfo: # 返回十六进制摘要 return hasher.hexdigest() - def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]: + def _extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]: """ 提取具有重复 list_hash 的文件名 @@ -371,56 +398,43 @@ class comicInfo: } return duplicates - -if __name__ == "__main1__": - # 清除3KB以下CBZ文件 - # 
comicInfo().update_comicinfo_cbz("") - #cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ" - dir_path = "CBZ/rm_comic" - #dir_path = "/mnt/Comics/CBZ/rm_comic" - for dir in os.listdir(dir_path): - c_dir = os.path.join(dir_path, dir) - if os.path.isdir(c_dir): - files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ'])) - for file in files: - # 获取文件的创建时间(仅在Linux/MacOS中可用) - # 修改时间 - create_time = time.localtime(os.utime(file)) # 注意:st_birthtime 在Linux/MacOS中可用,但不是所有系统都支持 - # 格式化时间 - formatted_time = time.strftime('%Y%m%d%H', create_time) - if int(formatted_time) > 2025020401: - print(f"{file} 文件创建时间:", formatted_time) - # 更新ComicInfoxml - # comicInfo().update_comicinfo_cbz(file) - # 检查CBZ是否存在ComicInfo.xml - comicInfo().ver_comicinfo_xml(file) - #if size < 3000: - # os.remove(file) - # print(f"已删除{file}") + + def delete_repeat_file(self, cbz_path) -> None: + """ 删除 CBZ 文件中的重复图片章节 + + Args: + cbz_path (_type_): _description_ + """ + # 批量删除漫画下的重复图片章节 + + + dir_path = "CBZ/rm_comic" + #dir_path = "/mnt/Comics/CBZ/rm_comic" + for dir in os.listdir(dir_path): + c_dir = os.path.join(dir_path, dir) + if os.path.isdir(c_dir): + comic_pages = [] + files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ'])) + for file in files: + page_data = self._comic_info_xml_pages(file) + comic_pages.append(page_data) + # 一本漫画读取完毕 + duplicates = comicInfo()._extract_duplicate_files(comic_pages) + for hash_val, delete_files in duplicates.items(): + # 删除重复文件 + for file_path in delete_files: + try: + # os.remove(file_path) + print(f"已删除: {file_path}") + except Exception as e: + print(f"删除失败 {file_path}: {e}") + + if __name__ == "__main__": - # 批量删除漫画下的重复图片章节 - # comicInfo()._comic_info_xml_pages("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第36话 36.CBZ") - dir_path = "CBZ/rm_comic" - #dir_path = "/mnt/Comics/CBZ/rm_comic" - for dir in os.listdir(dir_path): - c_dir = os.path.join(dir_path, dir) - if os.path.isdir(c_dir): - comic_pages = [] - files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ'])) - for file in files: - page_data = comicInfo()._comic_info_xml_pages(file) - comic_pages.append(page_data) - #print(page_data) - # 一本漫画读取完毕 - #print(comic_pages) - duplicates = comicInfo().extract_duplicate_files(comic_pages) - for hash_val, delete_files in duplicates.items(): - # 删除重复文件 - for file_path in delete_files: - try: - # os.remove(file_path) - print(f"已删除: {file_path}") - except Exception as e: - print(f"删除失败 {file_path}: {e}") - \ No newline at end of file + print("开始处理") + # ComicInfoXml()._xml_file_to_comicinfo("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第37话 37.CBZ") + xml_path = ComicInfoXml().update_comicinfo_count(37,"/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第37话 37.CBZ") + comicInfo().update_cbz_with_new_xml("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第37话 37.CBZ", xml_path.read_text(encoding="utf-8")) + #items = ci().__dict__.keys() + #print(items) \ No newline at end of file diff --git a/tests/common/test_ComicInfo.py b/tests/common/test_ComicInfo.py new file mode 100644 index 0000000..e7ba230 --- /dev/null +++ b/tests/common/test_ComicInfo.py @@ -0,0 +1,97 @@ +# module_b.py +import sys +import os + +# 获取当前文件所在目录 +# current_dir = os.path.dirname(os.path.abspath(__file__)) +current_dir = 
"/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader" +# 添加父目录到 sys.path +# parent_dir = os.path.join(current_dir, '..') +# sys.path.insert(0, parent_dir) +sys.path.insert(0, current_dir) + +# 现在可以使用绝对导入 +from src.common.ComicInfo import ComicInfo, ComicPageInfo, ImageInfo, ComicInfoXml +from os import path as Path + +class test_ImageInfo: + + def test_get_image_size(self): + print(ImageInfo().get_image_size("photo.jpg")) + + def test_get_image_hash(self): + print(ImageInfo().get_image_hash_advanced("photo.jpg")) + + def test_get_image_metadata(self): + """获取图片信息""" + page = self.get_image_metadata("photo.jpg") + print(page) + + def test_get_image_metadata_from_zip(self): + """从ZIP文件中获取图片信息""" + pages = ImageInfo().get_image_metadata_from_zip("test.zip") + print(pages) + +# Define the ComicInfo and ComicPageInfo classes +class test_ComicInfo: + # ComicInfo.xml 中的选项 + + def test_ToString(self): + """测试ComicInfo的字符串表示""" + comic = ComicInfo() + comic.Title = "Test Comic" + comic.Series = "Test Series" + comic.Number = "1" + comic.PageCount = 10 + comic.Writer = "Test Writer" + comic.Pages.append(ComicPageInfo()) + print(comic.toString()) + +class test_ComicPageInfo: + # ComicInfo.xml 中的 + def test_ToString(self): + """测试ComicPageInfo的字符串表示""" + page = ComicPageInfo() + page.Image = "test_image.jpg" + page.ImageSize = 123456 + +class test_ComicInfoXml: + + def test_get_page_count(self): + """测试获取ComicInfo.xml中的PageCount""" + zip_file = Path("test.zip") + page_count = self.get_page_count(zip_file) + print(f"zip_file={zip_file} PageCount: {page_count}") + + def test_scrapy_xml_by_json(self): + """ 根据Json数据生成ComicInfo.xml + """ + json_data = { + "name": "选手村母猪调教", + "chapter": "第2话-总教练最「疼爱」的选手", + "author": "沃林,蜜果实", + "tags": "凌辱,调教,报仇,选手村,体操,硬调色情,新作", + "images": [ + "001.jpg", "scramble=6_002.jpg", "scramble=5_003.jpg", + "004.jpg", "005.jpg", "scramble=5_006.jpg", + "007.jpg", "008.jpg", "scramble=7_009.jpg", + "scramble=9_010.jpg", "011.jpg", "012.jpg", + "scramble=6_013.jpg", "014.jpg", "015.jpg", + "scramble=7_016.jpg", "017.jpg", "018.jpg", + "019.jpg" + ], + "description": ("「总教练,我愿意用身体换取机会…」在腐败的选手村里,总教练握有绝对的权力," + + "选手们只能任凭摆布。人们对成功的渴望不断滋长,却也因为过度的欲望濒临崩溃…"), + "genre": "韩漫", + "age_rating": "R18+", + "series": "选手村母猪调教", + "number": 2, + 'page_count': 286 + } + save_dir = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/output/rm_comic/images/选手村母猪调教/第2话-总教练最「疼爱」的选手" + xsd_file = "ComicInfo_2.1.xsd" + pages = ComicInfoXml().scrapy_xml_by_json(json_data, save_dir=save_dir, xsd_file=xsd_file) + print(f"Generated pages: {pages}") + +if __name__ == "__main__": + test_ComicInfoXml().test_scrapy_xml_by_json() \ No newline at end of file