# NewComicDownloader/src/sites/base.py

from abc import ABC, abstractmethod
from typing import List, Dict, Optional, AsyncGenerator
from pathlib import Path
import aiohttp
import os
import shutil
import asyncio
import logging
from src.config import DEFAULT_HEADERS, TIMEOUT, RETRIES, PROXY_URL, RETRY_PROXY
from lxml import etree
from src.common.utils import Cache, CBZUtils  # import the cache class
from src.common.item import Chapter, MangaItem, MangaInfo, CoverItem
from src.common.exceptions import SiteError, NetworkError, ParseError
from src.common.logging import setup_logging
from src.common.naming import DirectoryNaming, FileNaming
from src.common.ComicInfo import ComicInfo, ImageInfo, ComicInfoXml

logger = setup_logging(__name__)


class BaseSite(ABC):
    """Base class for comic site adapters."""

    def __init__(self):
        self.session: Optional[aiohttp.ClientSession] = None
        self.headers = DEFAULT_HEADERS.copy()
        self.cache = Cache()  # initialize the HTML cache

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    async def _get(self, url: str, retries: int = RETRIES, PROXY: bool = RETRY_PROXY) -> str:
        """Send a GET request, with caching, retries and error handling."""
        # Try to serve the HTML from the cache first
        cached_html = self.cache.get(url)
        if cached_html:
            logger.debug(f"Serving HTML from cache: {url}")
            return cached_html
        for attempt in range(retries):
            try:
                proxy = PROXY_URL if PROXY else None
                async with self.session.get(str(url), proxy=proxy) as response:
                    if response.status == 200:
                        html = await response.text()
                        self.cache.set(url, html)  # store the HTML in the cache
                        return html
                    elif response.status == 404:
                        raise SiteError(f"Page not found: {url}")
                    elif response.status == 403:
                        raise SiteError(f"Access denied: {url}")
                    else:
                        raise NetworkError(f"HTTP error {response.status}: {url}")
            except aiohttp.ClientError as e:
                if attempt == retries - 1:
                    raise NetworkError(f"Network error: {str(e)}")
                logger.info(f"Retrying (attempt {attempt + 2}): {url}")
                await asyncio.sleep(2 * (attempt + 1))

    @abstractmethod
    async def get_chapter_images(self, chapter_url: str) -> List[str]:
        """Return all image URLs for a chapter."""
        pass

    #@abstractmethod
    async def get_manga_info(self, manga_url: str) -> MangaInfo:
        """Fetch and parse the manga's metadata."""
        try:
            html = await self._get(manga_url)
            tree = etree.HTML(html)
            return self.extractor.extract_manga_info(tree)
        except Exception as e:
            if isinstance(e, (ParseError, SiteError)):
                raise
            raise ParseError(f"Failed to parse manga info: {str(e)}")

    #@abstractmethod
    #async def get_chapter_list(self, info: MangaInfo) -> List[Dict[str, str]]:
    #    """Return the manga's chapter list."""
    #    pass

    async def get_chapter_list(self, manga_info: MangaInfo) -> List[Chapter]:
        """Build the chapter list and mark chapters that are already downloaded."""
        try:
            list_chapter = manga_info.get_list_chapter()
            down_chapter = []
            downloaded_chapter = []
            number = 0
            for chapter in list_chapter:
                number += 1
                cbz_path = FileNaming.chapter_cbz(manga_info=manga_info, chapter=chapter)
                if os.path.exists(cbz_path):
                    # Check whether this is the series' final chapter
                    if manga_info.is_chapter_ended(chapter.title):
                        # Final chapter of a completed series: refresh the ComicInfo.xml counts
                        # if ci.Count != "" and int(ci.Count) != 0 and int(ci.Count) != int(number):
                        #     count = number
                        xml_data = ComicInfoXml().update_comicinfo_count_or_number(cbz_path=cbz_path, number=number, count=number)
                        # Write the updated ComicInfo.xml back into the CBZ file
                        if xml_data:
                            CBZUtils().update_cbz_with_new_xml(cbz_path, xml_data)
                            # The temporarily generated ComicInfo.xml is removed after the update
                            logger.info(f"Updated the ComicInfo.xml Number in {cbz_path}")
                        # end of temporary addition
                    logger.debug(f"Chapter {chapter.title} already exists")
                    chapter.status = "downloaded"
                    downloaded_chapter.append({"name": chapter.title, "status": chapter.status, "path": cbz_path})
                down_chapter.append(chapter)
            if manga_info.status == "已完结":
                from src.common.utils import KomgaAPI, MangaUtils
                is_update_komga_ended = MangaUtils("mangas_ended.json").search_manga(name=manga_info.title)
                if is_update_komga_ended is not None:
                    logger.info(f"{manga_info.title} is completed and already on the Komga completed list; no update needed")
                else:
                    if len(downloaded_chapter) == len(list_chapter):
                        logger.info(f"{manga_info.title} is completed and all chapters have been downloaded")
                        KomgaAPI().update_series_ended(manga_info.title)
                        logger.info(f"{manga_info.title} is completed; Komga status updated")
                        MangaUtils("mangas_ended.json").add_manga(name=manga_info.title)
                    else:
                        logger.info(f"{manga_info.title} is completed, but not all chapters are downloaded yet (possibly a network issue)")
            return down_chapter
        except Exception as e:
            if isinstance(e, (ParseError, SiteError)):
                raise
            raise ParseError(f"Failed to parse chapter list: {str(e)}")

    async def update_covers(self, manga_info: MangaInfo) -> List[CoverItem]:
        """Sync the covers in the Icons directory with the cached cover."""
        cache_cover = {'path': str(DirectoryNaming.manga_cover_dir(manga_info, cache=True))}
        cover_img = {'path': str(DirectoryNaming.manga_cover_dir(manga_info, cache=False))}
        cache_cover_item = CoverItem(**cache_cover)
        icons_dir = os.path.dirname(cover_img['path'])
        if not os.path.exists(icons_dir):
            os.makedirs(icons_dir)
        list_cover = []
        is_update = 0
        try:
            for file in os.listdir(icons_dir):
                if file.lower().endswith(".jpg"):
                    file_cover = {'path': os.path.join(icons_dir, file)}
                    f_item = CoverItem(**file_cover)
                    list_cover.append(f_item)
                    if f_item.md5 == cache_cover_item.md5:
                        is_update += 1
            if is_update == 0:
                new_cover = {'path': FileNaming.cover_format_path(cover_img["path"])}
                shutil.copy(cache_cover["path"], new_cover["path"])
                list_cover.append(CoverItem(**new_cover))
        except Exception as e:
            raise SiteError(f"Cover check failed: {str(e)}")
        return list_cover

    async def update_cbz_covers(self, manga_info: MangaInfo):
        """Update the cover images for the manga's CBZ files."""
        cbz_dir = DirectoryNaming().chapter_cbz_dir(manga_info=manga_info)
        list_cbz = list(FileNaming().get_filenames_optimized(cbz_dir, ext_filter=[".CBZ"]))
        list_cover = await self.update_covers(manga_info)
        # Collects every cover .jpg that belongs to a CBZ file
        list_file_img = []
        for cbz_path in list_cbz:
            # Cover path next to the CBZ file (splitext keeps dots elsewhere in the path intact)
            first_cover_path = os.path.splitext(str(cbz_path))[0] + ".jpg"
            if len(list_cover) == 1:
                if FileNaming().file_update_by_date(first_cover_path, day=30) or not os.path.exists(first_cover_path):
                    shutil.copy(list_cover[0].path, first_cover_path)
                    list_file_img.append(first_cover_path)
                    logger.info(f"Copied {list_cover[0].path} ==> {first_cover_path}")
                else:
                    list_file_img.append(first_cover_path)
                continue
            cover_count = 1
            for cover in list_cover:
                cover_path = cover.path
                if os.path.exists(first_cover_path):
                    os.remove(first_cover_path)
                new_cover_path = FileNaming().cover_format_path(os.path.splitext(str(cbz_path))[0] + ".jpg", count=cover_count)
                if FileNaming().file_update_by_date(new_cover_path, day=30) or not os.path.exists(new_cover_path):
                    shutil.copy(cover_path, new_cover_path)
                    list_file_img.append(new_cover_path)
                    logger.info(f"Copied {cover_path} ==> {new_cover_path}")
                else:
                    list_file_img.append(new_cover_path)
                cover_count += 1
        list_cbz_and_img = list(FileNaming().get_filenames_optimized(cbz_dir, ext_filter=[".jpg"]))
        clear_imgs = DirectoryNaming.get_unique_ordered(list_cbz_and_img, list_file_img)
        # Remove stale cover images that no longer correspond to a CBZ
        if len(clear_imgs) > 0:
            try:
                for img in clear_imgs:
                    os.remove(img)
            except Exception:
                logger.error(f"Failed to delete {clear_imgs}")

    async def download_manga(self, manga_url: str) -> AsyncGenerator[Dict, None]:
        """Download an entire manga, yielding progress events as dicts."""
        try:
            # Fetch the manga's metadata
            info = await self.get_manga_info(manga_url)
            yield {'type': 'info', 'data': info, 'item': info}
            # Fetch the chapter list
            chapters = await self.get_chapter_list(info)
            yield {'type': 'chapters', 'data': chapters, 'item': info}
            # Download the cover
            yield {'type': 'cover', 'item': info}
            covers = await self.update_covers(info)
            # Download each chapter
            for chapter in chapters:
                try:
                    if chapter.status == "downloaded":
                        logger.debug(f"Chapter {chapter.title} already downloaded")
                        continue
                    images = await self.get_chapter_images(chapter.url)
                    manga_item = MangaItem(
                        info=info,
                        covers=covers,
                        chapter=chapter,
                        chapter_images=images,
                        chapters=chapters
                    ).get_item()
                    yield {
                        'type': 'chapter',
                        'chapter': str(chapter.title),
                        'images': images,
                        'item': manga_item
                    }
                except Exception as e:
                    yield {
                        'type': 'error',
                        'chapter': chapter,
                        'error': str(e)
                    }
                    continue
            # Run once all chapters have been downloaded
            await self.update_cbz_covers(info)
        except Exception as e:
            yield {'type': 'error', 'error': str(e)}

    async def get_manga_list(self, manga_url: str) -> List[Dict[str, str]]:
        """Fetch and parse the site's manga list."""
        try:
            html = await self._get(manga_url)
            tree = etree.HTML(html)
            return self.extractor.extract_manga_list(tree)
        except Exception as e:
            if isinstance(e, (ParseError, SiteError)):
                raise
            raise ParseError(f"Failed to parse manga list: {str(e)}")