This commit is contained in:
caiwx86 2025-07-11 09:04:51 +08:00
parent 4874600a07
commit 3a45a11a65
7 changed files with 352 additions and 70 deletions

View File

@ -1,20 +1,16 @@
import os, re, requests, hashlib
import xml.etree.ElementTree as ET
from xml.dom import minidom
from typing import List
import os
from lxml import etree
from src.config import XSD_FILE
from src.common.logging import setup_logging
import logging
from zipfile import ZipFile
from pathlib import Path
import re
import requests
from urllib.parse import urlparse
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import hashlib
from io import BytesIO
from dataclasses import dataclass
logger = setup_logging(__name__)
@ -207,8 +203,12 @@ class ImageInfo:
class ComicInfo:
# ComicInfo.xml 中的选项
def __init__(self):
self._init_default_values()
def _init_default_values(self):
"""初始化默认值"""
self.Title: str = ""
"""标题"""
"""<h2>标题</h2>"""
self.Series: str = ""
self.Number: str = ""
self.Count: int = -1
@ -247,9 +247,18 @@ class ComicInfo:
self.AgeRating: str = ""
self.Pages: List[ComicPageInfo] = []
def __init__(self, **kwargs):
values = kwargs.values()
self._init_default_values()
if len(values) > 0:
for key, value in kwargs.items():
if hasattr(self, key):
setattr(self, key, value)
@dataclass
class ComicPageInfo:
# ComicInfo.xml 中的<Page>
def __init__(self):
def _init_default_values(self):
self.Image: int = -1
self.Type: str = "Story"
self.DoublePage: bool = False
@ -259,6 +268,17 @@ class ComicPageInfo:
self.ImageWidth: int = -1
self.ImageHeight: int = -1
def __init__(self):
self._init_default_values()
def __init__(self, **kwargs):
values = kwargs.values()
self._init_default_values()
if len(values) > 0:
for key, value in kwargs.items():
if hasattr(self, key):
setattr(self, key, value)
def toString(self):
data = {}
def add(key, value):
@ -314,7 +334,7 @@ class ComicInfoXml:
logger.debug(f"zip_file={zip_file} PageCount: {page_count}")
return page_count
def _parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
def _parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename=XSD_FILE) -> Path:
"""_summary_
Args:
@ -368,12 +388,13 @@ class ComicInfoXml:
self._validate_xml_with_xsd_file(xml_filename, xsd_filename) # 将 JSON 转换为 XML
#xml_data = json_to_xml_with_declaration(json_data)
#print(xml_data)
return Path(xml_filename)
def _required_attributes(self):
"""
必需值如果为空刚报错
"""
return ["Title", "Series", "Number", "PageCount", "Writer"]
return ["Title", "Series", "Number", "Writer", "PageCount", "Pages" ]
def _gen_pageinfo(self, image_names, save_dir):
""" 获取PageInfo数据
@ -388,6 +409,91 @@ class ComicInfoXml:
pages.append(page)
return pages
def _xml_file_to_comicinfo(self, cbz_path=None, xml_file=None) -> ComicInfo:
    """Read ComicInfo.xml metadata from a CBZ archive or a standalone XML file.

    Args:
        cbz_path (optional): path to a CBZ (zip) archive containing ComicInfo.xml.
        xml_file (optional): path to a ComicInfo.xml file on disk.
        Provide exactly one of the two.

    Returns:
        ComicInfo: object populated from the XML metadata; missing/empty
        fields are normalized to "" so callers can compare against "".

    Raises:
        ValueError: if neither cbz_path nor xml_file is given.
        FileNotFoundError: if the CBZ contains no ComicInfo.xml.
    """
    def xml_parse(xml_str) -> ComicInfo:
        """Parse an XML string and extract every ComicInfo field."""
        import xmltodict
        xml_dict = xmltodict.parse(xml_str)
        metadata = {}
        # Iterate over the attribute names a default ComicInfo exposes.
        keys = ComicInfo().__dict__.keys()
        comicinfo_element = xml_dict.get("ComicInfo", {})
        for key in keys:
            key_element = comicinfo_element.get(key, "")
            if key == "Pages":
                pages = []
                # xmltodict yields a dict for a single <Page> and a list for
                # several; `or {}` also guards a missing/empty <Pages> element,
                # which previously crashed with AttributeError on a list.
                page_list_element = (comicinfo_element.get("Pages") or {}).get("Page", [])
                if isinstance(page_list_element, dict):
                    page_list_element = [page_list_element]
                for page_element in page_list_element:
                    pages.append(ComicPageInfo(**{
                        # Cast @Image like the other numeric attributes;
                        # ComicPageInfo declares Image as int.
                        "Image": int(page_element['@Image']),
                        "ImageSize": int(page_element['@ImageSize']),
                        "Key": page_element['@Key'],
                        "ImageWidth": int(page_element['@ImageWidth']),
                        "ImageHeight": int(page_element['@ImageHeight'])}))
                key_element = pages
            # Normalize falsy values (None, "", []) to "".
            metadata[key] = key_element if key_element else ""
        return ComicInfo(**metadata)

    def read_zip_file(zip_file_path):
        """Return the ComicInfo.xml text stored inside a ZIP/CBZ archive."""
        try:
            with ZipFile(zip_file_path, 'r') as zip_ref:
                if 'ComicInfo.xml' in zip_ref.namelist():
                    with zip_ref.open('ComicInfo.xml') as zipped_xml:
                        return zipped_xml.read().decode('utf-8')
                raise FileNotFoundError("ComicInfo.xml not found in the ZIP file.")
        except Exception as e:
            # Log and re-raise instead of the original `raise exit(...)`,
            # which aborted the whole interpreter with SystemExit.
            logger.error(f"处理 CBZ 文件时出错: {e}")
            raise

    if cbz_path is not None:
        xml_content = read_zip_file(cbz_path)
    elif xml_file is not None:
        with open(xml_file, 'r', encoding='utf-8') as f:
            xml_content = f.read()
    else:
        raise ValueError("请提供 cbz_path 或 xml_file 参数, 否则无法处理 XML 文件")
    return xml_parse(xml_content)
def update_comicinfo_count(self, count, cbz_path: Path) -> Path:
    """Update the Count field of the ComicInfo.xml stored in a CBZ file.

    The metadata is read out of the CBZ, Count is overwritten, and a fresh
    (XSD-validated) ComicInfo.xml is written next to the CBZ file.

    Args:
        count: total number of chapters in the series.
        cbz_path (Path): path to the CBZ file whose metadata is updated.

    Returns:
        Path: path of the regenerated ComicInfo.xml file.
    """
    # Read the existing ComicInfo.xml from inside the archive.
    comicinfo = self._xml_file_to_comicinfo(cbz_path=cbz_path)
    comicinfo.Count = count
    # Regenerate the XML file in the directory containing the CBZ.
    return self._parse_comicinfo(comicinfo, save_dir=os.path.dirname(cbz_path))
def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=XSD_FILE):
""" 根据Json数据生成ComicInfo.xml
"""
@ -395,6 +501,7 @@ class ComicInfoXml:
comic.Title = json_data.get("chapter", "")
comic.Series = json_data.get("name", "")
comic.Writer = json_data.get("author", "")
comic.Count = json_data.get("count", -1)
comic.AgeRating = json_data.get("age_rating", "")
comic.Tags = json_data.get("tags", "")
comic.Summary = json_data.get("description", "")

View File

@ -122,6 +122,14 @@ class MangaInfo(BaseModel):
list_value.append(val)
return FileNaming.chinese_file_name(",".join(list_value))
status: str
"""Manga status (compared elsewhere against values such as "已完结")."""
@field_validator('status', mode='before')
def validate_status(cls, v):
    # Normalize string statuses through the project's file-name helper;
    # non-string values are passed through unchanged.
    if isinstance(v, str):
        return FileNaming.chinese_file_name(v)
    return v
description: Optional[str] = None
"""漫画描述"""
@field_validator('description', mode='before')
@ -235,11 +243,17 @@ class MangaItem(BaseModel):
filename_list = []
for image in cls.chapter_images:
filename_list.append(image.filename)
count = -1
if cls.info.status == "已完结" and len(cls.chapters) > 1:
# 本章节为最终章节刚添加Count字段
if cls.number > 0 and cls.number == len(cls.chapters):
count = len(cls.chapters)
return {
"name": cls.info.title,
"chapter": cls.chapter.title,
"author": cls.info.author,
"count" : count,
"tags": cls.info.tags,
"images": filename_list,
"description": cls.info.description,

View File

@ -469,6 +469,36 @@ class CBZUtils:
#os.remove(cbz_path)
print(f"remove cbz {cbz_path}")
def update_cbz_with_new_xml(self, cbz_path, new_xml_content, output_path=None):
    """Replace the ComicInfo.xml inside a CBZ archive with new content.

    Args:
        cbz_path: path of the CBZ file to update.
        new_xml_content: full text of the new ComicInfo.xml.
        output_path: destination path; defaults to cbz_path (in-place rewrite).

    Returns:
        bool: True on success.

    Raises:
        Re-raises any failure after restoring the original archive.
    """
    tmp_name = None  # guards the except-branch NameError when tempfile creation fails
    try:
        # Default output path is the original file (overwrite in place).
        if output_path is None:
            output_path = cbz_path
        # Move the original aside so output_path can be rewritten safely.
        with NamedTemporaryFile(delete=False) as tmp:
            tmp_name = tmp.name
        shutil.move(cbz_path, tmp_name)
        with ZipFile(tmp_name, 'r') as source_zip:
            with ZipFile(output_path, 'w') as new_zip:
                # Copy every entry except the old ComicInfo.xml.
                for item in source_zip.infolist():
                    if item.filename.lower() != 'comicinfo.xml':
                        new_zip.writestr(item, source_zip.read(item.filename))
                # Append the new XML.
                new_zip.writestr("ComicInfo.xml", new_xml_content)
        os.remove(tmp_name)  # clean up the temporary backup
        return True
    except Exception as e:
        print(f"更新 CBZ 文件失败: {e}")
        # Restore the backup only if the original was already moved aside.
        if tmp_name and os.path.exists(tmp_name):
            shutil.move(tmp_name, cbz_path)
        # Re-raise instead of the original `raise exit(...)` (SystemExit).
        raise
class ImageUtils:
@classmethod

View File

@ -11,7 +11,7 @@ from src.common.item import Chapter, MangaItem, MangaInfo,CoverItem
from src.common.exceptions import SiteError, NetworkError, ParseError
from src.common.logging import setup_logging
from src.common.naming import DirectoryNaming,FileNaming
from src.common.ComicInfo import ComicInfo, ImageInfo
from src.common.ComicInfo import ComicInfo, ImageInfo, ComicInfoXml
logger = setup_logging(__name__)
@ -88,10 +88,27 @@ class BaseSite(ABC):
try:
# result_type list[Chapter]
list_chapter = manga_info.get_list_chapter()
# 临时添加begin
# 获取最新章节
last_chapter = list_chapter[-1] if list_chapter else []
# 临时添加end
down_chapter = []
for chapter in list_chapter:
cbz_path = FileNaming.chapter_cbz(manga_info=manga_info,chapter=chapter)
if os.path.exists(cbz_path):
# 临时添加begin
if chapter.title == last_chapter.title and manga_info.status == "已完结":
# 如果是最新章节且漫画已完结,则不再下
ci = ComicInfoXml()._xml_file_to_comicinfo(cbz_path=cbz_path)
if ci.Count == "":
# 生成ComicInfo.xml
xml_path = ComicInfoXml().update_comicinfo_count(count=len(list_chapter), cbz_path=cbz_path)
# 更新ComicInfo.xml至CBZ文件中
CBZUtils().update_cbz_with_new_xml(cbz_path, xml_path.read_text(encoding="utf-8"))
# 更新完成后删除临时生成的ComicInfo.xml
xml_path.unlink()
logger.debug(f"更新 {cbz_path} 的 ComicInfo.xml Count完成")
# 临时添加end
logger.debug(f"{chapter.title} 章节已存在")
chapter.status = "downloaded"
down_chapter.append(chapter)

View File

@ -13,6 +13,9 @@ selectors:
author:
selector: '//div[@class="basis-3/5 text-sm sm:text-base"]//span[@class="text-foreground"]/text()'
index: 0
status:
selector: '//div[@class="basis-3/5 text-sm sm:text-base"]//span[@class="text-foreground"]/text()'
index: 1
description:
selector: '//div[@class="my-2 text-foreground text-sm sm:text-base"]/p/text()'
index: 1

82
test.py
View File

@ -1,5 +1,5 @@
from src.common.naming import FileNaming
from src.common.ComicInfo import ImageInfo
from src.common.ComicInfo import ImageInfo, ComicInfo as ci, ComicPageInfo
from zipfile import ZipFile
from datetime import datetime
import time, shutil,re, xxhash, json
@ -288,6 +288,33 @@ class comicInfo:
except:
raise exit(f"ver_comicinfo_xml 错误")
def clear_cbz(self):
    """Verify ComicInfo.xml in recently modified CBZ files under dir_path.

    Walks one level of comic directories, collects .CBZ files, and runs
    ver_comicinfo_xml on files whose modification time is after 2025-02-04 01h.
    (The small-file deletion step remains commented out.)
    """
    dir_path = "CBZ/rm_comic"
    #dir_path = "/mnt/Comics/CBZ/rm_comic"
    for dir in os.listdir(dir_path):
        c_dir = os.path.join(dir_path, dir)
        if os.path.isdir(c_dir):
            files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
            for file in files:
                # Bug fix: the original used time.localtime(os.utime(file)).
                # os.utime(file) RESETS the file's atime/mtime to "now" and
                # returns None, so the comparison was always against the
                # current time and the timestamps were silently corrupted.
                create_time = time.localtime(os.path.getmtime(file))
                # Format as YYYYMMDDHH for a simple integer comparison.
                formatted_time = time.strftime('%Y%m%d%H', create_time)
                if int(formatted_time) > 2025020401:
                    print(f"{file} 文件创建时间:", formatted_time)
                    # Check that the CBZ contains a valid ComicInfo.xml.
                    comicInfo().ver_comicinfo_xml(file)
                #if size < 3000:
                #    os.remove(file)
                #    print(f"已删除{file}")
def _comic_info_xml_pages(self, zip_file):
"""获取 ComicInfo.xml 文件中的 <PageCount> 标签值"""
@ -324,7 +351,7 @@ class comicInfo:
data["list_hash"] = self.generate_xxhash(list_page)
return data
def generate_xxhash(self, data: Any) -> str:
def _generate_xxhash(self, data: Any) -> str:
"""
使用 xxhash 生成更快的哈希值
@ -344,7 +371,7 @@ class comicInfo:
# 返回十六进制摘要
return hasher.hexdigest()
def extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
def _extract_duplicate_files(self, data: List[Dict[str, str]]) -> Dict[str, List[str]]:
"""
提取具有重复 list_hash 的文件名
@ -372,35 +399,16 @@ class comicInfo:
return duplicates
if __name__ == "__main1__":
    # Disabled maintenance script (note the deliberate "__main1__" guard):
    # verifies ComicInfo.xml in CBZ files modified after 2025-02-04 01h.
    dir_path = "CBZ/rm_comic"
    #dir_path = "/mnt/Comics/CBZ/rm_comic"
    for dir in os.listdir(dir_path):
        c_dir = os.path.join(dir_path, dir)
        if os.path.isdir(c_dir):
            files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
            for file in files:
                # Bug fix: os.path.getmtime instead of os.utime(file) —
                # utime(file) resets the timestamps to "now" and returns
                # None, making the comparison below meaningless.
                create_time = time.localtime(os.path.getmtime(file))
                formatted_time = time.strftime('%Y%m%d%H', create_time)
                if int(formatted_time) > 2025020401:
                    print(f"{file} 文件创建时间:", formatted_time)
                    # Check that the CBZ contains a valid ComicInfo.xml.
                    comicInfo().ver_comicinfo_xml(file)
                #if size < 3000:
                #    os.remove(file)
                #    print(f"已删除{file}")
if __name__ == "__main__":
def delete_repeat_file(self, cbz_path) -> None:
""" 删除 CBZ 文件中的重复图片章节
Args:
cbz_path (_type_): _description_
"""
# 批量删除漫画下的重复图片章节
# comicInfo()._comic_info_xml_pages("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第36话 36.CBZ")
dir_path = "CBZ/rm_comic"
#dir_path = "/mnt/Comics/CBZ/rm_comic"
for dir in os.listdir(dir_path):
@ -409,12 +417,10 @@ if __name__ == "__main__":
comic_pages = []
files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ']))
for file in files:
page_data = comicInfo()._comic_info_xml_pages(file)
page_data = self._comic_info_xml_pages(file)
comic_pages.append(page_data)
#print(page_data)
# 一本漫画读取完毕
#print(comic_pages)
duplicates = comicInfo().extract_duplicate_files(comic_pages)
duplicates = comicInfo()._extract_duplicate_files(comic_pages)
for hash_val, delete_files in duplicates.items():
# 删除重复文件
for file_path in delete_files:
@ -424,3 +430,11 @@ if __name__ == "__main__":
except Exception as e:
print(f"删除失败 {file_path}: {e}")
if __name__ == "__main__":
    # Manual driver: rewrites the Count field of one hard-coded CBZ file.
    print("开始处理")
    # ComicInfoXml()._xml_file_to_comicinfo("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第37话 37.CBZ")
    # NOTE(review): ComicInfoXml is not among this module's visible imports —
    # confirm it is imported, otherwise this raises NameError at runtime.
    xml_path = ComicInfoXml().update_comicinfo_count(37,"/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第37话 37.CBZ")
    # NOTE(review): update_cbz_with_new_xml appears to be defined on CBZUtils
    # elsewhere — verify the local comicInfo class also provides it.
    comicInfo().update_cbz_with_new_xml("/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/和朋友的妈妈做朋友/第37话 37.CBZ", xml_path.read_text(encoding="utf-8"))
    #items = ci().__dict__.keys()
    #print(items)

View File

@ -0,0 +1,97 @@
# module_b.py
import sys
import os
# 获取当前文件所在目录
# current_dir = os.path.dirname(os.path.abspath(__file__))
current_dir = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader"
# 添加父目录到 sys.path
# parent_dir = os.path.join(current_dir, '..')
# sys.path.insert(0, parent_dir)
sys.path.insert(0, current_dir)
# 现在可以使用绝对导入
from src.common.ComicInfo import ComicInfo, ComicPageInfo, ImageInfo, ComicInfoXml
from os import path as Path
class test_ImageInfo:
    """Print-based smoke tests for the ImageInfo helpers."""

    def test_get_image_size(self):
        print(ImageInfo().get_image_size("photo.jpg"))

    def test_get_image_hash(self):
        print(ImageInfo().get_image_hash_advanced("photo.jpg"))

    def test_get_image_metadata(self):
        """Get metadata for a single image file."""
        # Bug fix: the original called self.get_image_metadata, which does
        # not exist on this test class — the method lives on ImageInfo.
        page = ImageInfo().get_image_metadata("photo.jpg")
        print(page)

    def test_get_image_metadata_from_zip(self):
        """Get image metadata for every entry in a ZIP file."""
        pages = ImageInfo().get_image_metadata_from_zip("test.zip")
        print(pages)
# Define the ComicInfo and ComicPageInfo classes
class test_ComicInfo:
    # Options stored in ComicInfo.xml
    def test_ToString(self):
        """Exercise the string representation of ComicInfo."""
        comic = ComicInfo()
        # Populate the required metadata fields, then attach one page.
        sample_fields = {
            "Title": "Test Comic",
            "Series": "Test Series",
            "Number": "1",
            "PageCount": 10,
            "Writer": "Test Writer",
        }
        for attr_name, attr_value in sample_fields.items():
            setattr(comic, attr_name, attr_value)
        comic.Pages.append(ComicPageInfo())
        print(comic.toString())
class test_ComicPageInfo:
    # <Page> entries inside ComicInfo.xml
    def test_ToString(self):
        """Exercise a populated ComicPageInfo instance."""
        page = ComicPageInfo()
        page.Image, page.ImageSize = "test_image.jpg", 123456
class test_ComicInfoXml:
    def test_get_page_count(self):
        """Read the PageCount from a ComicInfo.xml inside a ZIP file."""
        # Bug fixes: get_page_count lives on ComicInfoXml, not this test
        # class; and "Path" in this module is the os.path MODULE (aliased by
        # `from os import path as Path`), which is not callable — use a
        # plain string path instead.
        zip_file = "test.zip"
        page_count = ComicInfoXml().get_page_count(zip_file)
        print(f"zip_file={zip_file} PageCount: {page_count}")

    def test_scrapy_xml_by_json(self):
        """Generate a ComicInfo.xml from scraped JSON chapter data."""
        json_data = {
            "name": "选手村母猪调教",
            "chapter": "第2话-总教练最「疼爱」的选手",
            "author": "沃林,蜜果实",
            "tags": "凌辱,调教,报仇,选手村,体操,硬调色情,新作",
            "images": [
                "001.jpg", "scramble=6_002.jpg", "scramble=5_003.jpg",
                "004.jpg", "005.jpg", "scramble=5_006.jpg",
                "007.jpg", "008.jpg", "scramble=7_009.jpg",
                "scramble=9_010.jpg", "011.jpg", "012.jpg",
                "scramble=6_013.jpg", "014.jpg", "015.jpg",
                "scramble=7_016.jpg", "017.jpg", "018.jpg",
                "019.jpg"
            ],
            "description": ("「总教练,我愿意用身体换取机会…」在腐败的选手村里,总教练握有绝对的权力,"
                + "选手们只能任凭摆布。人们对成功的渴望不断滋长,却也因为过度的欲望濒临崩溃…"),
            "genre": "韩漫",
            "age_rating": "R18+",
            "series": "选手村母猪调教",
            "number": 2,
            'page_count': 286
        }
        save_dir = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/output/rm_comic/images/选手村母猪调教/第2话-总教练最「疼爱」的选手"
        xsd_file = "ComicInfo_2.1.xsd"
        pages = ComicInfoXml().scrapy_xml_by_json(json_data, save_dir=save_dir, xsd_file=xsd_file)
        print(f"Generated pages: {pages}")
if __name__ == "__main__":
    # Manual entry point: runs only the XML-generation smoke test.
    test_ComicInfoXml().test_scrapy_xml_by_json()