commit c78fa7e47d
parent 2a9820949b
caiwx86 2024-02-20 21:08:13 +08:00
14 changed files with 424 additions and 260 deletions

.gitignore

@@ -1,5 +1,6 @@
 .scrapy/*
 .vscode/*
+.DS_Store
 CBZ/*
 output/*
 /**/__pycache__


Comics/exporters.py

@@ -6,20 +6,10 @@ from scrapy.exporters import JsonItemExporter
 from Comics.items import ComicInfoItem
 from Comics.items import ComicItem
 from Comics.settings import COMIC_INFO_XML_STORE
-from Comics.utils.Constant import ComicPath
+from Comics.utils import ComicPath
 from scrapy.utils.python import is_listlike, to_bytes, to_unicode
 from itemadapter import ItemAdapter
-class ItemImport():
-    def import_obj(self, file):
-        if os.path.exists(file):
-            with open(file, "r", encoding="utf-8") as fs:
-                result = fs.read()
-                fs.close()
-            return result
-        else:
-            return []
 class CommonExporter():
     def getPath(self, file , sufix=None):
         sufix = "."+sufix
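Note: the removed ItemImport.import_obj lives on as fileUtils.read in Comics/utils (same contract: the file's text, or [] when the file is missing), so former call sites such as checkUtils.read now go through it. A minimal sketch of the replacement call; the path is illustrative:

    from Comics.utils import fileUtils
    data = fileUtils.read("output/rm_comic/error_comics.json")  # text, or [] if the file is absent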

Comics/items.py

@@ -4,9 +4,9 @@
 # https://docs.org/en/latest/topics/items.html
 import os,Comics.settings as settings,logging
 from scrapy.item import Item, Field
-from Comics.utils.Constant import ComicPath
-from Comics.utils.FileUtils import imageUtils
-from itemloaders.processors import TakeFirst, MapCompose, Join
+from Comics.utils import ComicPath
+from Comics.utils import imageUtils
+from itemloaders.processors import TakeFirst
 # convert Traditional Chinese to Simplified Chinese
 def serialize_to_chinese(value): return ComicPath.chinese_convert(value)
@@ -86,6 +86,11 @@ class ComicItem(Item):
     image_urls = Field(serializer=serialize_to_image_urls)
     # image names
     images_name = Field()
+    # chapter link
+    chapter_href = Field()
+    # chapter API
+    chapter_api = Field()
     # serializer - author
     def serializer_info_writer(value):

Comics/loader.py

@@ -8,7 +8,8 @@ class ComicLoader(ItemLoader):
         dots = str(exec).split(".")
         if not isinstance(data,dict): data = json.loads(data)
         for dot in dots:
-            data = data.get(dot)
+            if data != None: data = data.get(dot)
+            logging.debug(f"data= {data} dot={dot}")
         return data
     def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, is_null=None, **kw):
@@ -60,8 +61,8 @@ class ComicLoader(ItemLoader):
     def auto_replace_value(self, field_name, value):
         if self.get_output_value(field_name) != None:
             self._replace_value(field_name, value)
             return False
         else: return True
@@ -101,7 +102,30 @@ class ComicLoader(ItemLoader):
     def images(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('images', value, xpath, index, sexec)
     # image URLs
     def image_urls(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('image_urls', value, xpath, index, sexec)
+    def get_output_value(self, field_name):
+        value = super().get_output_value(field_name)
+        try:
+            if isinstance(value, list) and len(value) == 1: value = value[0]
+        except:
+            print(f"get_output_value value={value} type={type(value)}")
+        return value
+    # comic name
+    def get_name(self): return self.get_output_value("name")
+    # comic chapter
+    def get_chapter(self): return self.get_output_value("chapter")
+    # project name
+    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
+    # chapter link
+    def get_chapter_href(self): return self.get_output_value("chapter_href")
+    # all chapters
+    def get_chapters(self): return self.get_output_value("chapters")
+    def get_chapter_api(self): return self.get_output_value("chapter_api")
+    def get_image_urls(self): return self.get_output_value("image_urls")
 class ComicEntity:
     ENTITY = None
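For orientation, get_exec above walks a dotted key path through a JSON payload (split on ".", then dict.get per segment, with None short-circuiting), which is how the spiders address Next.js __NEXT_DATA__ blobs via strings like props.pageProps.books. A standalone sketch of the same traversal, with hypothetical names:

    import json

    def walk(data, path):
        # parse the JSON once, then follow each dotted segment; None short-circuits
        if not isinstance(data, dict): data = json.loads(data)
        for dot in path.split("."):
            if data is not None: data = data.get(dot)
        return data

    walk('{"props": {"pageProps": {"bookName": "X"}}}', "props.pageProps.bookName")  # -> 'X'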

Comics/pipelines.py

@@ -9,11 +9,11 @@ import os,scrapy,logging
 from Comics import settings
 from Comics.items import ComicItem
 from Comics.settings import OUTPUT_DIR
-from Comics.loader import ComicEntity
+from Comics.loader import ComicEntity,ComicLoader
 from Comics.exporters import ComicInfoXmlItemExporter
-from Comics.utils.FileUtils import CBZUtils,fileUtils as fu
-from Comics.utils.Constant import ComicPath
-from Comics.utils.ComicUtils import checkUtils
+from Comics.utils import CBZUtils,fileUtils as fu
+from Comics.utils import ComicPath
+from Comics.utils import checkUtils
 from Comics.exporters import JsonExport,ItemExporter
 from scrapy.pipelines.images import ImagesPipeline
@@ -24,12 +24,14 @@ class ComicsPipeline():
     # item is the object the spider yields
     def process_item(self, item, spider):
         if isinstance(item, ComicItem):
+            # item = ComicEntity(item).item()
             # 'output/rm_comic/json/壞X/第1話 壞X'
-            if fu.exists(ComicPath.path_cbz(item=item)):
-                return ItemExporter().export_obj(item)
+            # the comic's CBZ already exists: hand the item to the converter
+            if fu.exists(ComicPath.path_cbz(item=item)): return ItemExporter().export_obj(item)
             else:
-                file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
-                return JsonExport(file=file).export_json(ComicEntity(item).item(), if_return=True)
+                # no CBZ for this chapter yet
+                #file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
+                return JsonExport(file=ComicPath.getDirJosnComicChapter(item)).export_json(ComicEntity(item).item(), if_return=True)
     # image parsing
     def close_spider(self, spider):
@@ -102,6 +104,11 @@ class ImgDownloadPipeline(ImagesPipeline):
         # return item
         # pack the CBZ
         cbz_path = self.get_file_path(item, result_type="cbz")
+        success_data = []
+        for result in results:
+            if result[0]: success_data.append(result[1])
+        image_urls = ComicLoader(item=item).get_image_urls()
+        if len(success_data) != len(image_urls): return
         if fu.exists(cbz_path):
             self.update_icon(item)
             self.pack_icon(item)
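For context on the new guard: in Scrapy's ImagesPipeline, item_completed receives results as a list of (success, info) 2-tuples, where info on a success is a dict with 'url', 'path', and 'checksum' keys. The added lines keep only the successful downloads and bail out of packing when their count falls short of the chapter's image_urls, so an incomplete chapter is never zipped. A reduced sketch of that check (method shape per Scrapy's API, ComicLoader.get_image_urls as defined in loader.py):

    def item_completed(self, results, item, info):
        # results: [(True, {'url': ..., 'path': ..., 'checksum': ...}), (False, failure), ...]
        success_data = [ok_info for ok, ok_info in results if ok]
        if len(success_data) != len(ComicLoader(item=item).get_image_urls()):
            return  # at least one image failed: skip CBZ packing for now
        # ... packing continues here ...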

Comics/settings.py

@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = False
 HTTPERROR_ALLOWED_CODES = [ 200 , 403]
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 16
+CONCURRENT_REQUESTS = 8
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -45,7 +45,7 @@ RETRY_HTTP_CODES = [408, 401]
 CONCURRENT_REQUESTS_PER_DOMAIN = 16
 CONCURRENT_REQUESTS_PER_IP = 16
 PROXY_LIST = [
-    "http://127.0.0.1:7890",
+    # "http://127.0.0.1:7890",
     # "http://10.0.10.117:8123",
 ]
 # Disable cookies (enabled by default)

Comics/spiders/rm_comic.py

@@ -1,15 +1,17 @@
 import scrapy,logging,time,os,skip
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.utils.Constant import ComicPath
-from Comics.utils.ComicUtils import checkUtils
+from Comics.utils import ComicPath
+from Comics.utils import checkUtils
+from Comics.utils import Conf
 class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
-    allowed_domains = ['roum1.xyz']
+    allowed_domains = ['roum12.xyz']
     main_url = 'https://'+allowed_domains[0]
     start_urls = main_url+'/books'
+    # walk the site's listing pages
     def start_requests(self):
         for x in range(0,60):
             yield scrapy.Request(self.start_urls+"?&page="+str(x), callback=self.books_comic)
@@ -17,66 +19,45 @@ class RmComicSpider(scrapy.Spider):
     # fetch info for a page of comics
     def books_comic(self, response):
         comics = ComicLoader(item=ComicItem(), response=response)
-        data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-        for book in comics.get_exec(data, str_exec="props.pageProps.books"):
-            comics.add_value('link', self.start_urls+"/"+book['id'])
+        for book in comics.get_exec(comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0], str_exec="props.pageProps.books"):
             if book['name'] not in skip.skip_comic:
                 yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
     # fetch one comic's related data,
     # then move on once all chapter links are collected
     def parse_comic(self, response):
-        comic_item = ComicLoader(item=ComicItem(), response=response)
-        comic_item.project_name(self.name)
-        comic_item.name(xpath='//div[@class="col"]/h5/text()')
-        comic_item.icon(xpath='//img[@class="img-thumbnail"]/@src')
-        comic_item.author(xpath='//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
-        comic_item.tags(xpath='//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
-        comic_item.dep(xpath='//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
-        comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
-        comic_item.genre(value="韩漫")
-        comic_item.age_rating(value="R18+")
-        chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
-                                            '//div[contains(@class,"bookid_chapter")]/a/@href')
-        chapters = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
-                                        '//div[contains(@class,"bookid_chapter")]/a/text()')
-        for chapter, link in zip(chapters, chapter_href):
-            comic_item.chapters(value=chapters)
-            comic_item.chapter(value=chapter)
+        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
+        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
             item = comic_item.load_item()
-            cbz_path = ComicPath.get_file_path(item=item, result_type="cbz", convert=True)
-            if not checkUtils().is_error(item):
-                if os.path.exists(cbz_path):
-                    logging.info(f"comic {cbz_path} already exists, skipping...")
-                    yield item
-                else:
-                    yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
+            cbz_path = ComicPath.get_file_path(item=item, result_type="cbz", convert=True, chapter=chapter)
+            if not checkUtils().is_error(item) and os.path.exists(cbz_path):
+                logging.info(f"comic {cbz_path} already exists, skipping...")
+                yield item
+            else:
+                # request the chapter link and continue in self.parse_chapter
+                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
     # read all images under one chapter
     def parse_chapter(self, response):
         comic_item = ComicLoader(item=response.meta['item'], response=response)
         data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-        str_exec = "props.pageProps."
-        comic_item.name(value=data, sexec=str_exec+"bookName")
-        comic_item.dep(value=data, sexec=str_exec+"description")
-        comic_item.chapter(value=data, sexec=str_exec+"chapterName")
-        comic_item.image_urls(value=data, sexec=str_exec+"images")
-        comic_item.images(value=data, sexec=str_exec+"images")
-        comic = comic_item.load_item()
-        chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath")
-        if chapter_api_url is not None:
-            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
+        item: ComicLoader = Conf().parse_chapter(item=comic_item, value=data)
+        comic = item.load_item()
+        chapter_api_url = item.get_chapter_api()
+        if chapter_api_url is not None and len(chapter_api_url) != 0:
+            try:
+                yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
+            except:
+                logging.warning(f"yield scrapy.Request({self.main_url} + {chapter_api_url}, meta={comic}, callback=self.parse_chapter_api)")
         else:
             yield comic
     # handle the encrypted-data API
     def parse_chapter_api(self, response):
         comic_item = ComicLoader(item=response.meta['item'], response=response)
-        comic_item.chapter(value=response.text, sexec='chapter.name')
-        comic_item.image_urls(value=response.text, sexec='chapter.images')
-        comic_item.images(value=response.text, sexec='chapter.images')
-        yield comic_item.load_item()
+        item: ComicLoader = Conf().parse_chapter(item=comic_item, value=response.text)
+        yield item.load_item()
     def parse(self, response):

Comics/spiders/rm_comic.yml

@@ -0,0 +1,41 @@
+data:
+  name: '//div[@class="col"]/h5/text()'
+  icon: '//img[@class="img-thumbnail"]/@src'
+  author:
+    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()'
+    index: 1
+  tags: '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()'
+  dep:
+    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()'
+    index: 1
+  date:
+    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()'
+    index: 1
+  genre:
+    value: "韩漫"
+  age_rating:
+    value: "R18+"
+  chapter_href: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href'
+  chapters: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()'
+parse_chapter:
+  name:
+    sexec: props.pageProps.bookName
+  dep:
+    sexec: props.pageProps.description
+  chapter:
+    sexec: props.pageProps.chapterName
+  image_urls:
+    sexec: props.pageProps.images
+  images:
+    sexec: props.pageProps.images
+  chapter_api:
+    sexec: props.pageProps.chapterAPIPath
+parse_chapter_api:
+  chapter:
+    sexec: chapter.name
+  image_urls:
+    sexec: chapter.images
+  images:
+    sexec: chapter.images
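Each top-level key of this file drives one loader pass in the new Conf class: 'data' holds XPath rules (with optional index/value) applied to the comic page, while 'parse_chapter' and 'parse_chapter_api' hold dotted sexec paths for JSON payloads. A minimal sketch of how rm_comic.py consumes it in this commit:

    # parse_comic: populate the loader from the 'data' section
    comic_item = Conf().comic('rm_comic', ComicLoader(ComicItem(), response))
    # parse_chapter: populate from the 'parse_chapter' section,
    # with the __NEXT_DATA__ JSON passed as the value
    item = Conf().parse_chapter(item=comic_item, value=data)

Note that both chapter callbacks route through Conf().parse_chapter, which reads the 'parse_chapter' section; nothing references the 'parse_chapter_api' section yet.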


@@ -1,7 +1,7 @@
 import scrapy,logging,time,os
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.utils.Constant import ComicPath
+from Comics.utils import ComicPath
 from Comics.settings import PROJECT_KEY
 import skip

Comics/utils/__init__.py

@@ -1,74 +1,166 @@
-import base64,hashlib,os,shutil
+import base64,hashlib,os,shutil,os.path
 import math,time,json,datetime,logging
+import re,requests,time,xmlschema
+from datetime import date
+from Comics import settings
+from opencc import OpenCC
 from PIL import Image
-from Comics.utils.Constant import ComicPath
 from pathlib import Path
 from zipfile import ZipFile
-from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
-from Comics.utils.Constant import ntfy
+from Comics.settings import COMIC_INFO_XML_FILE,OUTPUT_DIR,PROJECT_KEY
+import yaml
+from Comics.loader import ComicLoader
+# configuration loader
+class Conf():
+    # read the yml config file
+    # @project  read the config <project>.yml for the given project name
+    # @key      return the dict stored under this key (default None)
+    #def init(self, project, key=None):
+    #    data = None
+    #    if project == None: project = "config"
+    #    with open(os.path.join("Comics","spiders", project)+".yml") as f:
+    #        data = yaml.load(f, Loader=yaml.FullLoader)
+    #    if key != None and data != None:
+    #        return data[key]
+    def get_config_value(self, project, key=None):
+        # use Path to handle the file path
+        config_path = Path(os.path.join("Comics","spiders", project)+".yml")
+        #Path("Comics") / "spiders" / project / (project + ".yml")
+        # check that the project config exists
+        if not config_path.is_file():
+            return None
+        # open the file and load the config data
+        try:
+            with config_path.open('r') as f:
+                data = yaml.safe_load(f)
+        except yaml.YAMLError as e:
+            print(f"Error loading YAML file: {e}")
+            return None
+        # check whether the key exists
+        if key is not None and key in data:
+            return data[key]
+        else:
+            return None
+    # feed the loaded config data into a ComicLoader
+    def comic(self, project, item: ComicLoader, child_data='data', val=None):
+        item.project_name(project)
+        data = self.get_config_value(project, child_data)
+        for key, xpath_data in data.items():
+            if isinstance(xpath_data, str): xpath_data = {'xpath': xpath_data}
+            xpath = xpath_data.get('xpath', None)
+            index = xpath_data.get('index', None)
+            value = xpath_data.get('value', None) if val is None else val
+            sexec = xpath_data.get('sexec', None)
+            item.set_properties(name=key, value=value, xpath=xpath, index=index, sexec=sexec)
+        return item
+    def parse_chapter(self, item: ComicLoader, value):
+        return self.comic(item.get_project_name(), item, "parse_chapter", value)
+# file helpers
 class fileUtils:
+    # does the file exist
     @classmethod
     def exists(cls, path): return os.path.exists(path)
+    # join path segments
     @classmethod
     def join(cls, path, *paths): return os.path.join(path, *paths);
+    # directory name
     @classmethod
     def dirname(cls, path): return os.path.dirname(path);
+    # base file name
     @classmethod
     def basename(cls, path): return os.path.basename(path);
+    # save data to a file
     @classmethod
     def save_file(cls,path,data):
         root_dir = os.path.dirname(path)
-        if not os.path.exists(root_dir):
-            os.makedirs(root_dir)
+        if not os.path.exists(root_dir): os.makedirs(root_dir)
         with open(path,'w',encoding='utf-8') as fs:
             fs.write(str(data))
-            fs.close()
+    # return the path after ensuring its directory exists
     @classmethod
     def path(cls, file):
         base_dir = os.path.dirname(file)
         if not os.path.exists(base_dir): os.makedirs(base_dir)
         return file
+    # compare file sizes
     @classmethod
     def compare_size(cls, dst, file):
-        if os.path.exists(dst) and os.path.exists(file):
+        if cls.exists(dst) and cls.exists(file):
             return os.stat(dst).st_size == os.stat(file).st_size
         else:
-            return 0
+            return None
+    # read a file
+    @classmethod
+    def read(cls, file):
+        if os.path.exists(file):
+            with open(file, "r", encoding="utf-8") as fs: return fs.read()
+        else:
+            return []
     """
     image numbering: image-1.jpg
     if image.png exists, return image-1.png; otherwise image.png
     """
     @classmethod
-    def file_check(cls, file, result="file"):
-        temp_file_name = file
-        count = 1
-        files_size = []
-        name, suffix = temp_file_name.split(".")
-        while count:
-            if os.path.exists(temp_file_name):
-                files_size.append(os.stat(temp_file_name).st_size)
-                temp_file_name = name+"-"+str(count)+"."+suffix
+    def file_check(cls, file, result="file", count=0):
+        temp_file_name, files_size, files_name = [file, {}, []]
+        # by default the file name does not exist
+        if not cls.exists(temp_file_name) and temp_file_name == file: count = 1
+        while count or count == 0:
+            temp_file_name = ComicPath().images_icon(file=file, count=count)
+            if cls.exists(temp_file_name):
+                # record the existing file name
+                files_name.append(temp_file_name)
+                file_size = os.path.getsize(temp_file_name)
+                # record the name and size, keyed by size to spot duplicates
+                files_size[file_size] = {"name": temp_file_name, "size": file_size}
+                # format the file name
+                # temp_file_name = ComicPath().images_icon(file=file, count=count)
                 count += 1
             else:
+                # check for duplicate data:
+                # the names to keep, one per unique size
+                diff_names = {value["name"] for value in files_size.values()}
+                # nothing exists: return the original file name
+                if len(diff_names) == 0: return file
+                for file_name in files_name:
+                    if file_name not in diff_names:
+                        logging.info(f"removing file: {file_name}")
+                        os.remove(file_name)
+                # if the original file exists alongside numbered copies
+                if file in diff_names:
+                    move_file = ComicPath().images_icon(file=file, count=count)
+                    logging.info(f"moving file {file} -> {move_file}")
+                    shutil.move(file, move_file)
+                    cls.file_check(file=file,result=result,count=0)
+                # if the deduplicated and existing name counts differ, duplicates remain: run again
+                if len(set(diff_names)) != len(set(files_name)): cls.file_check(file, result=result,count=0)
                 if result == "size":
-                    return files_size
+                    return {value["size"] for value in files_size.values()}
                 else:
                     return temp_file_name
+    # has the file changed
     @classmethod
     def file_update(cls, old_file, new_file):
         is_update = False
-        if os.path.exists(old_file):
-            is_update = os.stat(old_file).st_size not in cls.file_check(new_file, result="size")
+        if os.path.exists(old_file): is_update = os.path.getsize(old_file) not in cls.file_check(new_file, result="size")
         return is_update
     # decide whether the cover needs updating
@@ -81,7 +173,7 @@ class fileUtils:
         logging.info(f"update icon ... {image_path} ===> {cls.file_check(save_path)}")
         shutil.copyfile(image_path, cls.file_check(save_path))
+# common utilities
 class CommonUtils:
     @classmethod
     def parseExec(cls,data,exec):
@@ -92,6 +184,28 @@ class CommonUtils:
             data = data.get(dot)
         return data
+    @classmethod
+    def _validate_xml(cls,xml_file, xsd_file):
+        # load the XSD schema
+        xsd = xmlschema.XMLSchema(xsd_file)
+        # validate the XML
+        is_valid = xsd.is_valid(xml_file)
+        if is_valid:
+            print("XML passed XSD validation")
+        else:
+            print("XML failed XSD validation; errors follow")
+            for error in xsd.iter_errors(xml_file):
+                print(error)
+    @classmethod
+    def validate_comicinfo_xml(cls, xml_file):
+        cls._validate_xml(xml_file, "ComicInfo.xsd")
+# image helpers
 class imageUtils:
     @classmethod
@@ -307,7 +421,7 @@ class imageUtils:
         logging.debug(f"remove {img_path}")
         return save_path
+# CBZ archive helpers
 class CBZUtils:
     @classmethod
@@ -418,4 +532,159 @@ class CBZUtils:
         else:
             os.remove(zip_path)
             logging.error(f"validating fail === {zip_path}")
             return False
+# error-tracking helpers
+class checkUtils:
+    def read(self, item):
+        file = os.path.join(OUTPUT_DIR, ComicLoader(item=item).get_project_name(), "error_comics.json")
+        return fileUtils.read(file)
+    #
+    # check whether a chapter keeps failing
+    def export_error(self, item):
+        if not self.is_error(item):
+            file = os.path.join(OUTPUT_DIR, ComicLoader(item=item).get_project_name(), "error_comics.json")
+            try:
+                error_comic = eval(self.read(item))
+            except:
+                error_comic = []
+            error_comic.append({ "name" : ComicPath.new_file_name(item['name']),
+                                 "chapter" : ComicPath.new_file_name(item['chapter']),
+                                 "date" : ComicPath().getYearMonthDay()})
+            fileUtils.save_file(file, json.dumps(error_comic))
+    def is_error(self, item):
+        try:
+            for error_c in eval(self.read(item)):
+                (name, chatper, date) = [error_c['name'], error_c['chapter'], error_c['date']]
+                if ComicPath.new_file_name(item['name']) == ComicPath.new_file_name(name) and ComicPath.new_file_name(item['chapter']) == ComicPath.new_file_name(chatper):
+                    return True
+            return False
+        except:
+            return False
+# comic path helpers
+class ComicPath:
+    PREFIX_SCRAMBLE = "scramble="
+    @classmethod
+    def getYearMonthDay(cls):
+        today = date.today()
+        # format as year-month-day
+        return today.strftime("%Y%m%d")
+    @classmethod
+    def getDirComicChapter(cls, item, categorize=""):
+        comic = ComicLoader(item=item)
+        return os.path.join(OUTPUT_DIR, comic.get_project_name(), categorize, comic.get_name(), comic.get_chapter())
+    @classmethod
+    def getDirJosnComicChapter(cls, item):
+        return cls.getDirComicChapter(item=item, categorize="json")
+    @classmethod
+    def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
+    @classmethod
+    def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True):
+        file_name = str(file).split("_")[-1]
+        if relative:
+            file_name = os.path.basename(file_name)
+            if relative == "fullpath":
+                file_name = os.path.join(os.path.dirname(file), file_name)
+        if not is_prefix:
+            return file_name.split(".")[0]
+        else:
+            return file_name
+    # Traditional-to-Simplified Chinese conversion
+    @classmethod
+    def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
+    # normalize to a legal file name
+    @classmethod
+    def fix_file_name(cls, filename, replace=None):
+        if not isinstance(filename, str):
+            return filename
+        in_tab = r'[?*/\|.:><]'
+        str_replace = ""
+        if replace is not None:
+            str_replace = replace
+        filename = re.sub(in_tab, str_replace, filename)
+        count = 1
+        while True:
+            str_file = filename[0-count]
+            if str_file == " ":
+                count += 1
+            else:
+                filename = filename[0:len(filename)+1-count]
+                break
+        return filename
+    @classmethod
+    def new_file_name(cls, name): return cls.fix_file_name(cls.chinese_convert(name))
+    @classmethod
+    def get_file_path(cls, item, result_type="image", file=None, convert=False, chapter=None):
+        PROJECT = ComicLoader(item=item).get_project_name()
+        if not convert:
+            name = item['name']
+            if chapter == None: chapter = item['chapter']
+        else:
+            name = cls.fix_file_name(cls.chinese_convert(item['name']))
+            if chapter == None: chapter = cls.fix_file_name(cls.chinese_convert(item['chapter']))
+        if result_type == "image":
+            if os.path.sep not in file:
+                file = os.path.join(PROJECT, "images", name, chapter, file)
+        elif result_type == "comic_info":
+            file = os.path.join(PROJECT, "images", name, chapter)
+        elif result_type == "cbz_icon":
+            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".jpg")
+        elif result_type == "down_icon":
+            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon"))
+        elif result_type == "down_cache_icon":
+            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon_cache"))
+        elif result_type == "icon":
+            file = os.path.join(PROJECT, "icons", name, name+".jpg")
+        elif result_type == "icon_cache":
+            file = os.path.join(PROJECT, "icons", ".cache", name+".jpg")
+        elif result_type == "cbz":
+            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".CBZ")
+        elif result_type == "images_dir":
+            file = os.path.join(settings.IMAGES_STORE, PROJECT, "images", name, chapter)
+        else:
+            raise ValueError(f"Unsupported result_type: {result_type}")
+        return file
+    @classmethod
+    def path_cbz(cls, item):
+        return cls.get_file_path(item, result_type="cbz", convert=True)
+    @classmethod
+    def images_icon(cls, file, count):
+        if count == 0: return file
+        name, suffix = os.path.splitext(file)
+        return name+"-"+str(count)+suffix
+# notification helper
+class ntfy:
+    @classmethod
+    def sendMsg(cls, msg,alert=False,sleep=None,error=None):
+        try:
+            print(f"#ntfy: {msg}")
+            if alert:
+                requests.post("https://ntfy.caiwenxiu.cn/PyComic",
+                              data=msg.encode(encoding='utf-8'))
+        except:
+            print(f"#ntfy error: {msg}")
+        if sleep != None:
+            logging.info(f'waiting {sleep} seconds before the next stage')
+            time.sleep(int(sleep))
+        if error != None:
+            print(f"#ntfy Error: {error}")
+            return False
+        else:
+            return True
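A quick sketch of the expected behavior of the new helpers, assuming opencc's standard t2s tables (outputs illustrative):

    ComicPath.chinese_convert('第1話')            # -> '第1话' (Traditional -> Simplified)
    ComicPath.new_file_name('第1話 壞X? ')         # t2s, strips ?*/\|.:>< and trailing spaces
    ComicPath().images_icon('icon.jpg', 2)        # -> 'icon-2.jpg'; count=0 returns 'icon.jpg'
    fileUtils.join('output', 'rm_comic', 'json')  # -> 'output/rm_comic/json'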

Comics/utils/ComicUtils.py (deleted)

@@ -1,40 +0,0 @@
-import os,json
-from Comics.settings import CBZ_EXPORT_PATH,OUTPUT_DIR,PROJECT_KEY
-from Comics.utils.Constant import ComicPath
-from Comics.exporters import ComicInfoXmlItemExporter,JsonExport,ItemExporter, ItemImport
-from Comics.utils.FileUtils import fileUtils as fu
-from Comics.loader import ComicEntity
-class checkUtils:
-    def read(self, item):
-        file = os.path.join(OUTPUT_DIR, item[PROJECT_KEY][0], "error_comics.json")
-        return ItemImport().import_obj(file)
-    #
-    # check whether a chapter keeps failing
-    def export_error(self, item):
-        if not self.is_error(item):
-            file = os.path.join(OUTPUT_DIR, item[PROJECT_KEY][0], "error_comics.json")
-            try:
-                error_comic = eval(self.read(item))
-            except:
-                error_comic = []
-            error_comic.append({ "name" : ComicPath.new_file_name(item['name']),
-                                 "chapter" : ComicPath.new_file_name(item['chapter']),
-                                 "date" : ComicPath().getYearMonthDay()})
-            fu.save_file(file, json.dumps(error_comic))
-    def is_error(self, item):
-        try:
-            for error_c in eval(self.read(item)):
-                (name, chatper, date) = [error_c['name'], error_c['chapter'], error_c['date']]
-                if ComicPath.new_file_name(item['name']) == ComicPath.new_file_name(name) and ComicPath.new_file_name(item['chapter']) == ComicPath.new_file_name(chatper):
-                    return True
-                else:
-                    return False
-        except:
-            return False

Comics/utils/Constant.py (deleted)

@@ -1,114 +0,0 @@
-import os.path,logging
-import re,requests,time
-from datetime import date
-from Comics import settings
-from opencc import OpenCC
-class ComicPath:
-    PREFIX_SCRAMBLE = "scramble="
-    @classmethod
-    def getYearMonthDay(cls):
-        today = date.today()
-        # format as year-month-day
-        return today.strftime("%Y%m%d")
-    @classmethod
-    def getDirComicChapter(cls):
-        return None
-    @classmethod
-    def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
-    @classmethod
-    def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True):
-        file_name = str(file).split("_")[-1]
-        if relative:
-            file_name = os.path.basename(file_name)
-            if relative == "fullpath":
-                file_name = os.path.join(os.path.dirname(file), file_name)
-        if not is_prefix:
-            return file_name.split(".")[0]
-        else:
-            return file_name
-    # Traditional-to-Simplified Chinese conversion
-    @classmethod
-    def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
-    # normalize to a legal file name
-    @classmethod
-    def fix_file_name(cls, filename, replace=None):
-        if not isinstance(filename, str):
-            return filename
-        in_tab = r'[?*/\|.:><]'
-        str_replace = ""
-        if replace is not None:
-            str_replace = replace
-        filename = re.sub(in_tab, str_replace, filename)
-        count = 1
-        while True:
-            str_file = filename[0-count]
-            if str_file == " ":
-                count += 1
-            else:
-                filename = filename[0:len(filename)+1-count]
-                break
-        return filename
-    @classmethod
-    def new_file_name(cls, name): return cls.fix_file_name(cls.chinese_convert(name))
-    @classmethod
-    def get_file_path(cls, item, result_type="image", file=None, convert=False):
-        PROJECT = item[settings.PROJECT_KEY][0]
-        if not convert:
-            name = item['name']
-            chapter = item['chapter']
-        else:
-            name = cls.fix_file_name(cls.chinese_convert(item['name']))
-            chapter = cls.fix_file_name(cls.chinese_convert(item['chapter']))
-        if result_type == "image":
-            if os.path.sep not in file:
-                file = os.path.join(PROJECT, "images", name, chapter, file)
-        elif result_type == "comic_info":
-            file = os.path.join(PROJECT, "images", name, chapter)
-        elif result_type == "cbz_icon":
-            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".jpg")
-        elif result_type == "down_icon":
-            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon"))
-        elif result_type == "down_cache_icon":
-            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon_cache"))
-        elif result_type == "icon":
-            file = os.path.join(PROJECT, "icons", name, name+".jpg")
-        elif result_type == "icon_cache":
-            file = os.path.join(PROJECT, "icons", ".cache", name+".jpg")
-        elif result_type == "cbz":
-            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".CBZ")
-        elif result_type == "images_dir":
-            file = os.path.join(settings.IMAGES_STORE, PROJECT, "images", name, chapter)
-        return file
-    @classmethod
-    def path_cbz(cls, item):
-        return cls.get_file_path(item, result_type="cbz", convert=True)
-class ntfy:
-    @classmethod
-    def sendMsg(cls, msg,alert=False,sleep=None,error=None):
-        try:
-            print(f"#ntfy: {msg}")
-            if alert:
-                requests.post("https://ntfy.caiwenxiu.cn/PyComic",
-                              data=msg.encode(encoding='utf-8'))
-        except:
-            print(f"#ntfy error: {msg}")
-        if sleep != None:
-            logging.info(f'waiting {sleep} seconds before the next stage')
-            time.sleep(int(sleep))
-        if error != None:
-            print(f"#ntfy Error: {error}")
-            return False
-        else:
-            return True

run.py

@@ -2,4 +2,4 @@
 from scrapy import cmdline
 cmdline.execute("scrapy crawl rm_comic".split())