diff --git a/run.py b/run.py index 5f18b9e..3536cb3 100644 --- a/run.py +++ b/run.py @@ -1,5 +1,4 @@ import asyncio -from pathlib import Path from src.sites.manager import MangaManager from src.common.logging import setup_logging diff --git a/src/common/extractor.py b/src/common/extractor.py index 607e6ff..3895ded 100644 --- a/src/common/extractor.py +++ b/src/common/extractor.py @@ -69,6 +69,7 @@ class Extractor: """提取漫画信息并返回 MangaInfo 实例""" selectors = self.config.get_selector('manga_list') info_data = {} + info_data['base_url'] = self.config.base_url for key, selector in selectors.items(): if isinstance(selector, str): element = self.processor.select(tree, selector) diff --git a/src/common/item.py b/src/common/item.py index b0328e5..40b4357 100644 --- a/src/common/item.py +++ b/src/common/item.py @@ -72,6 +72,14 @@ class Chapter(BaseModel): #images: List[ImageItem] = [] class ListManga(BaseModel): + + base_url: str + """漫画网站域名""" + @field_validator('base_url', mode='before') + def validate_base_url(cls, v): + cls.base_url = v + return v + title: List[str] url: List[HttpUrl] @@ -81,10 +89,12 @@ class ListManga(BaseModel): list_url = [] for url in v: if isinstance(url, str) and not url.startswith('http'): - list_url.append(HttpUrl("https://rouman5.com" + url)) + list_url.append(HttpUrl(cls.base_url + url)) return list_url - created_at: List[str] = [] + updated_at: List[str] = [] + + last_updated: List[str] = [] class MangaInfo(BaseModel): project: str diff --git a/src/common/utils.py b/src/common/utils.py index c02c003..b4c75d6 100644 --- a/src/common/utils.py +++ b/src/common/utils.py @@ -730,18 +730,22 @@ class MangaUtils: save_data = [] for manga in self.data: - created_at = manga["created_at"] - if isinstance(created_at, datetime): + updated_at = manga["updated_at"] + last_updated = manga["last_updated"] + if isinstance(updated_at, datetime): str_strftime = '%Y%m%d' - created_at = created_at.strftime(str_strftime) - save_data.append({"name" : manga["name"] , 
"created_at" : created_at}) + updated_at = updated_at.strftime(str_strftime) + if isinstance(last_updated, datetime): + str_strftime = '%Y%m%d' + last_updated = last_updated.strftime(str_strftime) + save_data.append({"name" : manga["name"] , "updated_at" : updated_at , "last_updated" : last_updated}) with self.lock: with open(temp_path, 'w', encoding='utf-8') as f: json.dump(save_data, f, indent=2, ensure_ascii=False) temp_path.replace(self.file_path) - def add_manga(self, name: str, created_at: str = None) -> bool: + def add_manga(self, name: str, updated_at: str = None) -> bool: """添加新漫画""" if not self.validate_name(name): raise ValueError("无效的漫画名称") @@ -750,13 +754,14 @@ class MangaUtils: self.delete_manga(name) str_strftime = '%Y%m%d' - time = created_at or datetime.now() - if isinstance(time , datetime): - time = time.strftime(str_strftime) + now_time = datetime.now() + if isinstance(now_time , datetime): + now_time = now_time.strftime(str_strftime) new_manga = { "name": name.strip(), - "created_at": time + "updated_at": updated_at, + "last_updated": now_time } self.data.append(new_manga) @@ -804,7 +809,7 @@ class MangaUtils: if sort_by == "name": return sorted(self.data, key=lambda x: x['name']) elif sort_by == "date": - return sorted(self.data, key=lambda x: x['created_at']) + return sorted(self.data, key=lambda x: x['updated_at']) return self.data.copy() def validate_name(self, name: str) -> bool: @@ -817,7 +822,7 @@ class MangaUtils: """批量导入漫画""" for manga in mangas: if self.validate_name(manga["name"]): - self.add_manga(manga["name"], manga.get("created_at")) + self.add_manga(manga["name"], manga.get("updated_at")) def find_duplicates(self) -> List[str]: """查找可能的重复条目(简单版本)""" diff --git a/src/sites/manager.py b/src/sites/manager.py index 129dafd..7617cc2 100644 --- a/src/sites/manager.py +++ b/src/sites/manager.py @@ -151,21 +151,21 @@ class MangaManager: raise MangaException(f"不支持的网站: {manga_url}") async with list_site_handler() as site: manga_list = 
await site.get_manga_list(manga_url) - for title,url,created_at in zip(manga_list.title, manga_list.url, manga_list.created_at): + for title,url,updated_at in zip(manga_list.title, manga_list.url, manga_list.updated_at): title = FileNaming.chinese_file_name(title) save_manga = MangaUtils().search_manga(title) - created = None - if save_manga != None: created = save_manga.get('created_at', None) - if created != None and created_at == created: - created = save_manga.get('created_at', None) + updated = None + if save_manga != None: updated = save_manga.get('updated_at', None) + if updated != None and updated_at == updated: + updated = save_manga.get('updated_at', None) logger.info(f"{save_manga} 已存在") else: logger.info(f"开始下载 漫画: {title}") logger.info(f"{url}") - await self.download_manga(str(url), title = title, created_at = created_at) + await self.download_manga(str(url), title = title, updated_at = updated_at ) @classmethod - async def download_manga(cls, url: str, title: str = None, created_at: str = None, save_dir: Path = BASE_IMAGES_DIR): + async def download_manga(cls, url: str, title: str = None, updated_at: str = None, save_dir: Path = BASE_IMAGES_DIR): """下载漫画""" manager = MangaManager(save_dir) @@ -204,9 +204,9 @@ class MangaManager: logger.error(f"下载出错: {result['error']}") # 全部下载完成 - if int(total_chapters) == int(success_chapters) and title != None and created_at != None: - MangaUtils().add_manga(title, created_at=created_at) - logger.info(f"全部完成 {title}, {created_at}") + if int(total_chapters) == int(success_chapters) and title != None and updated_at != None: + MangaUtils().add_manga(title, updated_at=updated_at) + logger.info(f"全部完成 {title}, {updated_at}") except MangaException as e: logger.error(f"下载失败: {str(e)}") except Exception as e: diff --git a/test.py b/test.py index 6ed3faa..007d9ac 100644 --- a/test.py +++ b/test.py @@ -2,13 +2,14 @@ from src.common.naming import FileNaming from src.common.ComicInfo import ImageInfo from zipfile import ZipFile 
from datetime import datetime +import time import os,hashlib import xml.etree.ElementTree as ET from PIL import Image from io import BytesIO from tempfile import NamedTemporaryFile from xml.dom import minidom - +from src.common.ComicInfo import ComicInfoXml class test: @@ -90,7 +91,7 @@ class comicInfo: # 定义需要提取的元数据字段(用户自定义的字段列表) metadata_fields = [ "Title", "Series", "Number", "Summary", "Writer", - "Genre", "PageCount", "AgeRating" + "Genre", "Tags", "PageCount", "AgeRating" ] for field in metadata_fields: @@ -156,12 +157,14 @@ class comicInfo: } except Exception as e: print(f"处理 CBZ 文件时出错: {e}") - return None + raise exit(f"处理CBZ出错") def generate_comic_info_xml(self, metadata, pages_info): """根据元数据和页面信息生成 ComicInfo.xml 内容""" # 创建根节点 - root = ET.Element("ComicInfo", xmlns="http://comicrack.cyolito.com/comicinfo") + root = ET.Element('ComicInfo') + root.set('xmlns:xsd', 'http://www.w3.org/2001/XMLSchema') + root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance') # 添加元数据字段 for field, value in metadata.items(): @@ -235,17 +238,50 @@ class comicInfo: cbz_path (_type_): _description_ """ data = self.process_cbz(cbz_path) + metadata = data["metadata"] + author = data["metadata"].get("Writer", "") + tags = data["metadata"].get("Tags", "") + + (list_value, value) = [[], str(author).replace("&", " ")] + for val in set(str(value).split(" ")): + list_value.append(val) + author = FileNaming.chinese_file_name(",".join(list_value)) + data["metadata"]["Writer"] = author # 生成 XML 内容 new_xml = self.generate_comic_info_xml(data["metadata"], data["pages"]) + xml_file = "NewComicInfo.xml" # 测试:保存 XML 到本地查看 - with open("NewComicInfo.xml", "w", encoding="utf-8") as f: + with open(xml_file, "w", encoding="utf-8") as f: f.write(new_xml) - print("已生成 NewComicInfo.xml") + print(f"已生成 {xml_file}") + ComicInfoXml()._validate_xml_with_xsd_file(xml_file=xml_file ,xsd_file="src/assets/ComicInfo_2.1.xsd") # 更新 CBZ 文件(示例路径,实际操作前请备份) - success = 
comicInfo().update_cbz_with_new_xml("example.cbz", new_xml, "example_updated.cbz") + success = comicInfo().update_cbz_with_new_xml(cbz_path, new_xml) # if success: # print("CBZ 文件更新成功") + os.remove(xml_file) if __name__ == "__main__": # 清除3KB以下CBZ文件 - test().clean_min_cbz() \ No newline at end of file + # comicInfo().update_comicinfo_cbz("") + #cbz_path = "/Users/cc/Documents/Dev/WorkSpace/VSCodeProjects/NewComicDownloader/CBZ/rm_comic/福利女姊姊/第1话 福利女姊姊.CBZ" + + dir_path = "/mnt/Comics/CBZ/rm_comic" + for dir in os.listdir(dir_path): + c_dir = os.path.join(dir_path, dir) + if os.path.isdir(c_dir): + files = list(FileNaming.get_filenames_optimized(c_dir, ext_filter=['.CBZ'])) + for file in files: + #size = os.path.getsize(file) + # 获取文件状态信息 + file_stat = os.stat(file) + # 获取文件的创建时间(仅在 macOS/BSD 中可用,Linux 不支持) + create_time = time.localtime(file_stat.st_birthtime) # 注意:st_birthtime 仅在 macOS/FreeBSD(及 Windows 3.12+)可用,Linux 上访问会抛出 AttributeError + # 格式化时间 + formatted_time = time.strftime('%Y%m%d', create_time) + if int(formatted_time) < 20250204: + print(f"{file} 文件创建时间:", formatted_time) + comicInfo().update_comicinfo_cbz(file) + #if size < 3000: + # os.remove(file) + # print(f"已删除{file}") \ No newline at end of file