diff --git a/.gitignore b/.gitignore
index 0eade7f..a5b4aff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .scrapy/*
 .vscode/*
+.DS_Store
 CBZ/*
 output/*
 /**/__pycache__
\ No newline at end of file
diff --git a/Comics/__init__.py b/Comics/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/Comics/exporters.py b/Comics/exporters.py
index f8af21d..97d2176 100644
--- a/Comics/exporters.py
+++ b/Comics/exporters.py
@@ -6,20 +6,10 @@ from scrapy.exporters import JsonItemExporter
 from Comics.items import ComicInfoItem
 from Comics.items import ComicItem
 from Comics.settings import COMIC_INFO_XML_STORE
-from Comics.utils.Constant import ComicPath
+from Comics.utils import ComicPath
 from scrapy.utils.python import is_listlike, to_bytes, to_unicode
 from itemadapter import ItemAdapter
 
-class ItemImport():
-    def import_obj(self, file):
-        if os.path.exists(file):
-            with open(file, "r", encoding="utf-8") as fs:
-                result = fs.read()
-                fs.close()
-            return result
-        else:
-            return []
-
 class CommonExporter():
     def getPath(self, file , sufix=None):
         sufix = "."+sufix
diff --git a/Comics/items.py b/Comics/items.py
index 5098815..cb65687 100644
--- a/Comics/items.py
+++ b/Comics/items.py
@@ -4,9 +4,9 @@
 # https://docs.scrapy.org/en/latest/topics/items.html
 import os,Comics.settings as settings,logging
 from scrapy.item import Item, Field
-from Comics.utils.Constant import ComicPath
-from Comics.utils.FileUtils import imageUtils
-from itemloaders.processors import TakeFirst, MapCompose, Join
+from Comics.utils import ComicPath
+from Comics.utils import imageUtils
+from itemloaders.processors import TakeFirst
 # Convert Traditional Chinese to Simplified Chinese
 def serialize_to_chinese(value):
     return ComicPath.chinese_convert(value)
@@ -86,6 +86,11 @@ class ComicItem(Item):
     image_urls = Field(serializer=serialize_to_image_urls)
     # Image names
     images_name = Field()
+
+    # Chapter links
+    chapter_href = Field()
+    # Chapter API
+    chapter_api = Field()
 
 # Serializer - author
 def serializer_info_writer(value):
diff --git a/Comics/loader.py b/Comics/loader.py
index 4eebf5d..8dddb37 100644
--- a/Comics/loader.py
+++ b/Comics/loader.py
@@ -8,7 +8,8 @@ class ComicLoader(ItemLoader):
         dots = str(exec).split(".")
         if not isinstance(data,dict): data = json.loads(data)
         for dot in dots:
-            data = data.get(dot)
+            if data != None: data = data.get(dot)
+            logging.debug(f"data= {data} dot={dot}")
         return data
 
     def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, is_null=None, **kw):
@@ -60,8 +61,8 @@ class ComicLoader(ItemLoader):
 
     def auto_replace_value(self, field_name, value):
         if self.get_output_value(field_name) != None:
-           self._replace_value(field_name, value)
-           return False
+            self._replace_value(field_name, value)
+            return False
         else:
             return True
 
@@ -101,7 +102,30 @@ class ComicLoader(ItemLoader):
     def images(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('images', value, xpath, index, sexec)
     # Image URLs
     def image_urls(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('image_urls', value, xpath, index, sexec)
-
+
+    def get_output_value(self, field_name):
+        value = super().get_output_value(field_name)
+        try:
+            if isinstance(value, list) and len(value) == 1 : value = value[0]
+        except:
+            print(f"get_output_value value={value} type={type(value)}")
+        return value
+
+    # Comic name
+    def get_name(self): return self.get_output_value("name")
+    # Comic chapter
+    def get_chapter(self): return self.get_output_value("chapter")
+    # Project name
+    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
+    # Chapter links
+    def get_chapter_href(self): return self.get_output_value("chapter_href")
+    # All chapters
+    def get_chapters(self): return self.get_output_value("chapters")
+
+    def get_chapter_api(self): return self.get_output_value("chapter_api")
+
+    def get_image_urls(self): return self.get_output_value("image_urls")
+
 class ComicEntity:
     ENTITY = None
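The patched `get_exec` above walks a dotted key path (e.g. `props.pageProps.books`) through the page's `__NEXT_DATA__` JSON and now tolerates a missing intermediate key instead of raising. A minimal standalone sketch of that traversal, assuming plain dicts along the path (the function and variable names here are illustrative, not the project's API):

```python
import json

# Sketch of the dotted-path lookup ComicLoader.get_exec performs,
# including the None guard this patch adds.
def get_exec(data, str_exec):
    if not isinstance(data, dict):
        data = json.loads(data)   # raw text of <script id="__NEXT_DATA__">
    for dot in str_exec.split("."):
        if data is not None:
            data = data.get(dot)  # a missing key yields None instead of raising
    return data

raw = '{"props": {"pageProps": {"books": [{"id": "1", "name": "demo"}]}}}'
print(get_exec(raw, "props.pageProps.books"))        # [{'id': '1', 'name': 'demo'}]
print(get_exec(raw, "props.pageProps.missing.key"))  # None, no AttributeError
```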
diff --git a/Comics/pipelines.py b/Comics/pipelines.py
index e53b582..338abde 100644
--- a/Comics/pipelines.py
+++ b/Comics/pipelines.py
@@ -9,11 +9,11 @@ import os,scrapy,logging
 from Comics import settings
 from Comics.items import ComicItem
 from Comics.settings import OUTPUT_DIR
-from Comics.loader import ComicEntity
+from Comics.loader import ComicEntity,ComicLoader
 from Comics.exporters import ComicInfoXmlItemExporter
-from Comics.utils.FileUtils import CBZUtils,fileUtils as fu
-from Comics.utils.Constant import ComicPath
-from Comics.utils.ComicUtils import checkUtils
+from Comics.utils import CBZUtils,fileUtils as fu
+from Comics.utils import ComicPath
+from Comics.utils import checkUtils
 from Comics.exporters import JsonExport,ItemExporter
 from scrapy.pipelines.images import ImagesPipeline
 
@@ -24,12 +24,14 @@ class ComicsPipeline():
     # item is the object yielded by the spider
     def process_item(self, item, spider):
         if isinstance(item, ComicItem):
+            # item = ComicEntity(item).item()
             # 'output/rm_comic/json/壞X/第1話 壞X'
-            if fu.exists(ComicPath.path_cbz(item=item)):
-                return ItemExporter().export_obj(item)
+            # The comic's CBZ file already exists: run the conversion
+            if fu.exists(ComicPath.path_cbz(item=item)): return ItemExporter().export_obj(item)
             else:
-                file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
-                return JsonExport(file=file).export_json(ComicEntity(item).item(), if_return=True)
+                # The comic's CBZ file does not exist yet
+                #file = os.path.join(OUTPUT_DIR, spider.name, "json", item['name'], item['chapter'])
+                return JsonExport(file=ComicPath.getDirJosnComicChapter(item)).export_json(ComicEntity(item).item(), if_return=True)
 
     # image parsing
     def close_spider(self, spider):
@@ -102,6 +104,11 @@ class ImgDownloadPipeline(ImagesPipeline):
             # return item
         # Pack
         cbz_path = self.get_file_path(item, result_type="cbz")
+        success_data = []
+        for result in results:
+            if result[0]: success_data.append(result[1])
+        image_urls = ComicLoader(item=item).get_image_urls()
+        if len(success_data) != len(image_urls): return
         if fu.exists(cbz_path):
             self.update_icon(item)
             self.pack_icon(item)
diff --git a/Comics/settings.py b/Comics/settings.py
index 476a6c6..6e96cfe 100644
--- a/Comics/settings.py
+++ b/Comics/settings.py
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = False
 HTTPERROR_ALLOWED_CODES = [ 200 , 403]
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 16
+CONCURRENT_REQUESTS = 8
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -45,7 +45,7 @@ RETRY_HTTP_CODES = [408, 401]
 CONCURRENT_REQUESTS_PER_DOMAIN = 16
 CONCURRENT_REQUESTS_PER_IP = 16
 PROXY_LIST = [
-    "http://127.0.0.1:7890",
+#   "http://127.0.0.1:7890",
 #   "http://10.0.10.117:8123",
 ]
 # Disable cookies (enabled by default)
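The new completeness guard in `ImgDownloadPipeline` above depends on the shape Scrapy hands to `item_completed`: `results` is a list of `(success, info)` 2-tuples, where `info` is a dict (with keys like `url`, `path`, `checksum`) on success and a `Failure` on error. A sketch of the check in isolation (the helper name is illustrative):

```python
# Sketch of the guard added above: only pack a CBZ when every image
# requested for the item downloaded successfully.
def all_images_downloaded(results, image_urls):
    # Scrapy's ImagesPipeline passes results as [(success, info), ...]
    success_data = [info for ok, info in results if ok]
    return len(success_data) == len(image_urls)

results = [
    (True,  {"url": "https://example.invalid/1.jpg", "path": "a/1.jpg"}),
    (False, Exception("download failed")),  # a twisted Failure in real runs
]
urls = ["https://example.invalid/1.jpg", "https://example.invalid/2.jpg"]
print(all_images_downloaded(results, urls))  # False -> skip packing this time
```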
diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py
index d3826cc..7dd9fed 100644
--- a/Comics/spiders/rm_comic.py
+++ b/Comics/spiders/rm_comic.py
@@ -1,15 +1,17 @@
 import scrapy,logging,time,os,skip
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.utils.Constant import ComicPath
-from Comics.utils.ComicUtils import checkUtils
+from Comics.utils import ComicPath
+from Comics.utils import checkUtils
+from Comics.utils import Conf
 
 class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
-    allowed_domains = ['roum1.xyz']
+    allowed_domains = ['roum12.xyz']
     main_url = 'https://'+allowed_domains[0]
     start_urls = main_url+'/books'
 
+    # Iterate over the site's listing pages
     def start_requests(self):
         for x in range(0,60):
             yield scrapy.Request(self.start_urls+"?&page="+str(x), callback=self.books_comic)
@@ -17,66 +19,45 @@ class RmComicSpider(scrapy.Spider):
     # Fetch the info of multiple comics
     def books_comic(self, response):
         comics = ComicLoader(item=ComicItem(), response=response)
-        data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-        for book in comics.get_exec(data, str_exec="props.pageProps.books"):
-            comics.add_value('link', self.start_urls+"/"+book['id'])
+        for book in comics.get_exec(comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0], str_exec="props.pageProps.books"):
             if book['name'] not in skip.skip_comic:
                 yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
 
     # Fetch the data for one comic
     # After collecting the chapter links, move on to the next stage
     def parse_comic(self, response):
-        comic_item = ComicLoader(item=ComicItem(), response=response)
-        comic_item.project_name(self.name)
-        comic_item.name(xpath='//div[@class="col"]/h5/text()')
-        comic_item.icon(xpath='//img[@class="img-thumbnail"]/@src')
-        comic_item.author(xpath='//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
-        comic_item.tags(xpath='//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
-        comic_item.dep(xpath='//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
-        comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
-        comic_item.genre(value="韩漫")
-        comic_item.age_rating(value="R18+")
-        chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
-                                            '//div[contains(@class,"bookid_chapter")]/a/@href')
-        chapters = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
-                                        '//div[contains(@class,"bookid_chapter")]/a/text()')
-        for chapter, link in zip(chapters, chapter_href):
-            comic_item.chapters(value=chapters)
-            comic_item.chapter(value=chapter)
+        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
+        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
             item = comic_item.load_item()
-            cbz_path = ComicPath.get_file_path(item=item, result_type="cbz", convert=True)
-            if not checkUtils().is_error(item):
-                if os.path.exists(cbz_path):
-                    logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
-                    yield item
-                else:
-                    yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
+            cbz_path = ComicPath.get_file_path(item=item, result_type="cbz", convert=True, chapter=chapter)
+            if not checkUtils().is_error(item) and os.path.exists(cbz_path):
+                logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
+                yield item
+            else:
+                # Request the chapter link and hand off to self.parse_chapter
+                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
 
     # Read all images of a chapter
     def parse_chapter(self, response):
         comic_item = ComicLoader(item=response.meta['item'], response=response)
         data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-        str_exec = "props.pageProps."
-        comic_item.name(value=data, sexec=str_exec+"bookName")
-        comic_item.dep(value=data, sexec=str_exec+"description")
-        comic_item.chapter(value=data, sexec=str_exec+"chapterName")
-        comic_item.image_urls(value=data, sexec=str_exec+"images")
-        comic_item.images(value=data, sexec=str_exec+"images")
-        comic = comic_item.load_item()
-        chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath")
-        if chapter_api_url is not None:
-            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
+        item: ComicLoader = Conf().parse_chapter(item=comic_item, value=data)
+        comic = item.load_item()
+        chapter_api_url = item.get_chapter_api()
+        if chapter_api_url is not None and len(chapter_api_url) != 0 :
+            try:
+                yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
+            except:
+                logging.warning(f"yield scrapy.Request({self.main_url} + {chapter_api_url}, meta={comic}, callback=self.parse_chapter_api)")
         else:
             yield comic
 
     # Handle the encrypted-data API
     def parse_chapter_api(self, response):
         comic_item = ComicLoader(item=response.meta['item'], response=response)
-        comic_item.chapter(value=response.text, sexec='chapter.name')
-        comic_item.image_urls(value=response.text, sexec='chapter.images')
-        comic_item.images(value=response.text, sexec='chapter.images')
-        yield comic_item.load_item()
+        item: ComicLoader = Conf().parse_chapter(item=comic_item, value=response.text)
+        yield item.load_item()
 
     def parse(self, response):
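One caveat in `parse_chapter` above: the `try/except` around `yield scrapy.Request(...)` only catches errors raised while constructing the Request (for example a malformed URL); download failures happen later in the engine and never reach that `except`. Scrapy's idiomatic hook for those is the Request's `errback` parameter. A hedged sketch of that alternative:

```python
import logging
import scrapy

# Sketch: route download failures through errback instead of a try/except
# around yield. parse_chapter_api and main_url are the spider's own members.
def parse_chapter_tail(self, comic, chapter_api_url):
    if chapter_api_url:
        yield scrapy.Request(
            self.main_url + chapter_api_url,
            meta={"item": comic},
            callback=self.parse_chapter_api,
            errback=lambda failure: logging.warning(f"chapter API failed: {failure}"),
        )
    else:
        yield comic
```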
diff --git a/Comics/spiders/rm_comic.yml b/Comics/spiders/rm_comic.yml
new file mode 100644
index 0000000..6368b57
--- /dev/null
+++ b/Comics/spiders/rm_comic.yml
@@ -0,0 +1,41 @@
+data:
+  name: '//div[@class="col"]/h5/text()'
+  icon: '//img[@class="img-thumbnail"]/@src'
+  author:
+    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()'
+    index: 1
+  tags: '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()'
+  dep:
+    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()'
+    index: 1
+  date:
+    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()'
+    index: 1
+  genre:
+    value: "韩漫"
+  age_rating:
+    value: "R18+"
+  chapter_href: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href'
+  chapters: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()'
+
+parse_chapter:
+  name:
+    sexec: props.pageProps.bookName
+  dep:
+    sexec: props.pageProps.description
+  chapter:
+    sexec: props.pageProps.chapterName
+  image_urls:
+    sexec: props.pageProps.images
+  images:
+    sexec: props.pageProps.images
+  chapter_api:
+    sexec: props.pageProps.chapterAPIPath
+
+parse_chapter_api:
+  chapter:
+    sexec: chapter.name
+  image_urls:
+    sexec: chapter.images
+  images:
+    sexec: chapter.images
\ No newline at end of file
diff --git a/Comics/spiders/yh_comic.py b/Comics/spiders/yh_comic.py
index 5060b76..57df971 100644
--- a/Comics/spiders/yh_comic.py
+++ b/Comics/spiders/yh_comic.py
@@ -1,7 +1,7 @@
 import scrapy,logging,time,os
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.utils.Constant import ComicPath
+from Comics.utils import ComicPath
 from Comics.settings import PROJECT_KEY
 import skip
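In the new `rm_comic.yml`, each field accepts two shapes: a bare string (shorthand for an `xpath`) or a mapping with optional `xpath`/`index`/`value`/`sexec` keys. A quick round-trip check of that structure with `yaml.safe_load`, run from the repository root:

```python
import yaml

# Sanity-check the structure Conf expects from Comics/spiders/rm_comic.yml.
with open("Comics/spiders/rm_comic.yml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

print(sorted(conf))                     # ['data', 'parse_chapter', 'parse_chapter_api']
print(conf["data"]["author"]["index"])  # 1  (mapping form)
print(conf["data"]["tags"])             # bare xpath string form
print(conf["parse_chapter"]["chapter_api"]["sexec"])  # props.pageProps.chapterAPIPath
```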
diff --git a/Comics/utils/FileUtils.py b/Comics/utils.py
similarity index 54%
rename from Comics/utils/FileUtils.py
rename to Comics/utils.py
index 6d4aecc..bdb970f 100644
--- a/Comics/utils/FileUtils.py
+++ b/Comics/utils.py
@@ -1,74 +1,166 @@
-import base64,hashlib,os,shutil
+import base64,hashlib,os,shutil,os.path
 import math,time,json,datetime,logging
+import re,requests,time,xmlschema
+from datetime import date
+from Comics import settings
+from opencc import OpenCC
 from PIL import Image
-from Comics.utils.Constant import ComicPath
 from pathlib import Path
 from zipfile import ZipFile
-from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
-from Comics.utils.Constant import ntfy
+from Comics.settings import COMIC_INFO_XML_FILE,OUTPUT_DIR,PROJECT_KEY
+import yaml
+from Comics.loader import ComicLoader
+
+# Configuration class
+class Conf():
+    # Read configuration from a yml file
+    # @project  load <project>.yml by project name
+    # @key  return the dict stored under this key (defaults to None)
+    #def init(self, project, key=None):
+    #    data = None
+    #    if project == None: project = "config"
+    #    with open(os.path.join("Comics","spiders", project)+".yml") as f:
+    #        data = yaml.load(f, Loader=yaml.FullLoader)
+    #    if key != None and data != None:
+    #        return data[key]
+    def get_config_value(self, project, key=None):
+        # Use Path to handle the file path
+        config_path = Path(os.path.join("Comics","spiders", project)+".yml")
+        #Path("Comics") / "spiders" / project / (project + ".yml")
+        # Check that the project config exists
+        if not config_path.is_file():
+            return None
+        # Open the file and load the config data
+        try:
+            with config_path.open('r') as f:
+                data = yaml.safe_load(f)
+        except yaml.YAMLError as e:
+            print(f"Error loading YAML file: {e}")
+            return None
+        # Check whether the key exists
+        if key is not None and key in data:
+            return data[key]
+        else:
+            return None
+
+    # Feed the loaded config data into the ComicLoader
+    def comic(self, project, item: ComicLoader, child_data='data', val=None):
+        item.project_name(project)
+        data = self.get_config_value(project, child_data)
+        for key, xpath_data in data.items():
+            if isinstance(xpath_data, str): xpath_data = {'xpath': xpath_data}
+            xpath = xpath_data.get('xpath', None)
+            index = xpath_data.get('index', None)
+            value = xpath_data.get('value', None) if val is None else val
+            sexec = xpath_data.get('sexec', None)
+            item.set_properties(name=key, value=value, xpath=xpath, index=index, sexec=sexec)
+        return item
+
+    def parse_chapter(self,item: ComicLoader, value):
+        return self.comic(item.get_project_name(), item, "parse_chapter", value)
+
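Usage mirrors the calls in `rm_comic.py`: `comic()` fills a loader from the `data` section's xpaths, while `parse_chapter()` reuses the same machinery with a JSON blob as the value source. A sketch, assuming a live `response` and `data` from a spider callback:

```python
# Sketch of how the spiders drive Conf (see rm_comic.py); `response` and
# `data` stand in for a live scrapy Response and the __NEXT_DATA__ JSON text.
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.utils import Conf

loader = Conf().comic("rm_comic", ComicLoader(item=ComicItem(), response=response))
item = loader.load_item()

# Chapter page: same loader machinery, but every field reads from `data`
# via its configured sexec path instead of an xpath.
loader = Conf().parse_chapter(item=ComicLoader(item=item, response=response), value=data)
```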
temp_file_name.split(".") - while count: - if os.path.exists(temp_file_name): - files_size.append(os.stat(temp_file_name).st_size) - temp_file_name = name+"-"+str(count)+"."+suffix + def file_check(cls, file, result="file", count=0): + temp_file_name, files_size, files_name = [file, {}, []] + # 默认文件名不存在 + if not cls.exists(temp_file_name) and temp_file_name == file: count = 1 + while count or count == 0: + temp_file_name = ComicPath().images_icon(file=file, count=count) + if cls.exists(temp_file_name): + # 保存存在的文件名 + files_name.append(temp_file_name) + file_size = os.path.getsize(temp_file_name) + # 保存文件名和大小数据 + files_size[file_size] = {"name": temp_file_name, "size": file_size} + # 格式化文件名 + # temp_file_name = ComicPath().images_icon(file=file, count=count) count += 1 else: + # 检测是否有重复数据 + # 提取重复并需删除的文件名 + diff_names = {value["name"] for value in files_size.values()} + # 不存在则返回原文件名 + if len(diff_names) == 0: return file + for file_name in files_name: + if file_name not in diff_names: + logging.info(f"删除文件:{file_name}") + os.remove(file_name) + + # 判断是否存在初始文件和多个文件名 + if file in diff_names: + move_file = ComicPath().images_icon(file=file, count=count) + logging.info(f"移动文件{file}到 {move_file}") + shutil.move(file, move_file) + cls.file_check(file=file,result=result,count=0) + # 去重后文件名数与存在的文件名数不存在则证明文件存在重复,重新运行本方法 + if len(set(diff_names)) != len(set(files_name)): cls.file_check(file, result=result,count=0) + if result == "size": - return files_size + return {value["size"] for value in files_size.values()} else: - return temp_file_name - + return temp_file_name + + + # 判断文件是否更新 @classmethod def file_update(cls, old_file, new_file): is_update = False - if os.path.exists(old_file): - is_update = os.stat(old_file).st_size not in cls.file_check(new_file, result="size") + if os.path.exists(old_file): is_update = os.path.getsize(old_file) not in cls.file_check(new_file, result="size") return is_update # 判断是否需要更新封面 @@ -81,7 +173,7 @@ class fileUtils: logging.info(f"update icon ... 
@@ -81,7 +173,7 @@ class fileUtils:
             logging.info(f"update icon ... {image_path} ===> {cls.file_check(save_path)}")
             shutil.copyfile(image_path, cls.file_check(save_path))
 
-
+# Common utility class
 class CommonUtils:
     @classmethod
     def parseExec(cls,data,exec):
@@ -92,6 +184,28 @@ class CommonUtils:
             data = data.get(dot)
         return data
 
+    @classmethod
+    def _validate_xml(cls,xml_file, xsd_file):
+        # Load the XSD schema
+        xsd = xmlschema.XMLSchema(xsd_file)
+
+        # Validate the XML
+        is_valid = xsd.is_valid(xml_file)
+
+        if is_valid:
+            print("XML文件通过XSD验证成功!")
+        else:
+            print("XML文件未通过XSD验证。以下是验证错误信息:")
+            validation_errors = xsd.iter_errors(xml_file)
+            for error in validation_errors:
+                print(error)
+
+    @classmethod
+    def validate_comicinfo_xml(cls, xml_file):
+        cls._validate_xml(xml_file, "ComicInfo.xsd")
+
+
+# Image processing class
 class imageUtils:
 
     @classmethod
@@ -307,7 +421,7 @@ class imageUtils:
             logging.debug(f"remove {img_path}")
         return save_path
 
-
+# CBZ archive utility class
 class CBZUtils:
 
     @classmethod
@@ -418,4 +532,159 @@ class CBZUtils:
         else:
             os.remove(zip_path)
             logging.error(f"validating fail === {zip_path}")
-            return False
\ No newline at end of file
+            return False
+
+# Error-check utility class
+class checkUtils:
+
+    def read(self, item):
+        file = os.path.join(OUTPUT_DIR, ComicLoader(item=item).get_project_name(), "error_comics.json")
+        return fileUtils.read(file)
+    #
+    # Check whether a chapter keeps failing
+    def export_error(self, item):
+        if not self.is_error(item):
+            file = os.path.join(OUTPUT_DIR, ComicLoader(item=item).get_project_name(), "error_comics.json")
+            try:
+                error_comic = eval(self.read(item))
+            except:
+                error_comic = []
+            error_comic.append({ "name" : ComicPath.new_file_name(item['name']),
+                                 "chapter" : ComicPath.new_file_name(item['chapter']),
+                                 "date" : ComicPath().getYearMonthDay()})
+            fileUtils.save_file(file, json.dumps(error_comic))
+
+    def is_error(self, item):
+        try:
+            for error_c in eval(self.read(item)):
+                (name, chatper, date) = [error_c['name'], error_c['chapter'], error_c['date']]
+                if ComicPath.new_file_name(item['name']) == ComicPath.new_file_name(name) and ComicPath.new_file_name(item['chapter']) == ComicPath.new_file_name(chatper):
+                    return True
+            return False
+        except:
+            return False
+
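The new `validate_comicinfo_xml` wraps `xmlschema` and hard-codes `ComicInfo.xsd` relative to the working directory. A hedged usage sketch (the XML path below is illustrative):

```python
import xmlschema

# Validate an exported ComicInfo.xml the same way CommonUtils does.
schema = xmlschema.XMLSchema("ComicInfo.xsd")  # schema in the cwd, per the diff
xml_file = "output/rm_comic/images/demo/ch1/ComicInfo.xml"  # illustrative path
if schema.is_valid(xml_file):
    print("valid")
else:
    for error in schema.iter_errors(xml_file):  # same API the class uses
        print(error)
```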
+# Comic path class
+class ComicPath:
+    PREFIX_SCRAMBLE = "scramble="
+
+    @classmethod
+    def getYearMonthDay(cls):
+        today = date.today()
+        # Format as year-month-day
+        return today.strftime("%Y%m%d")
+
+    @classmethod
+    def getDirComicChapter(cls, item, categorize=""):
+        comic = ComicLoader(item=item)
+        return os.path.join(OUTPUT_DIR, comic.get_project_name(), categorize, comic.get_name(), comic.get_chapter())
+
+    @classmethod
+    def getDirJosnComicChapter(cls, item):
+        return cls.getDirComicChapter(item=item, categorize="json")
+
+    @classmethod
+    def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
+
+    @classmethod
+    def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True):
+        file_name = str(file).split("_")[-1]
+        if relative:
+            file_name = os.path.basename(file_name)
+        if relative == "fullpath":
+            file_name = os.path.join(os.path.dirname(file), file_name)
+        if not is_prefix:
+            return file_name.split(".")[0]
+        else:
+            return file_name
+
+    # Convert Traditional Chinese to Simplified Chinese
+    @classmethod
+    def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
+
+    # Normalize into a legal file name
+    @classmethod
+    def fix_file_name(cls, filename, replace=None):
+        if not isinstance(filename, str):
+            return filename
+        in_tab = r'[?*/\|.:><]'
+        str_replace = ""
+        if replace is not None:
+            str_replace = replace
+        filename = re.sub(in_tab, str_replace, filename)
+        count = 1
+        while True:
+            str_file = filename[0-count]
+            if str_file == " ":
+                count += 1
+            else:
+                filename = filename[0:len(filename)+1-count]
+                break
+        return filename
+
+    @classmethod
+    def new_file_name(cls, name): return cls.fix_file_name(cls.chinese_convert(name))
+
+    @classmethod
+    def get_file_path(cls, item, result_type="image", file=None, convert=False, chapter=None):
+        PROJECT = ComicLoader(item=item).get_project_name()
+        if not convert:
+            name = item['name']
+            if chapter == None: chapter = item['chapter']
+        else:
+            name = cls.fix_file_name(cls.chinese_convert(item['name']))
+            if chapter == None: chapter = cls.fix_file_name(cls.chinese_convert(item['chapter']))
+
+        if result_type == "image":
+            if os.path.sep not in file:
+                file = os.path.join(PROJECT, "images", name, chapter, file)
+        elif result_type == "comic_info":
+            file = os.path.join(PROJECT, "images", name, chapter)
+        elif result_type == "cbz_icon":
+            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".jpg")
+        elif result_type == "down_icon":
+            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon"))
+        elif result_type == "down_cache_icon":
+            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon_cache"))
+        elif result_type == "icon":
+            file = os.path.join(PROJECT, "icons", name, name+".jpg")
+        elif result_type == "icon_cache":
+            file = os.path.join(PROJECT, "icons", ".cache", name+".jpg")
+        elif result_type == "cbz":
+            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".CBZ")
+        elif result_type == "images_dir":
+            file = os.path.join(settings.IMAGES_STORE, PROJECT, "images", name, chapter)
+        else:
+            raise ValueError(f"Unsupported result_type: {result_type}")
+        return file
+
+    @classmethod
+    def path_cbz(cls, item):
+        return cls.get_file_path(item, result_type="cbz", convert=True)
+
+    @classmethod
+    def images_icon(cls, file, count):
+        if count == 0: return file
+        name, suffix = os.path.splitext(file)
+        return name+"-"+str(count)+suffix
+
+# Notification class
+class ntfy:
+    @classmethod
+    def sendMsg(cls, msg,alert=False,sleep=None,error=None):
+        try:
+            print(f"#ntfy: {msg}")
+            if alert:
+                requests.post("https://ntfy.caiwenxiu.cn/PyComic",
+                              data=msg.encode(encoding='utf-8'))
+        except:
+            print(f"#ntfy error: {msg}")
+        if sleep != None:
+            logging.info(f'等待{sleep}秒后进入下一阶段')
+            time.sleep(int(sleep))
+        if error != None:
+            print(f"#ntfy Error: {error}")
+            return False
+        else:
+            return True
\ No newline at end of file
diff --git a/Comics/utils/ComicUtils.py b/Comics/utils/ComicUtils.py
deleted file mode 100644
index e502048..0000000
--- a/Comics/utils/ComicUtils.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os,json
-from Comics.settings import CBZ_EXPORT_PATH,OUTPUT_DIR,PROJECT_KEY
-from Comics.utils.Constant import ComicPath
-from Comics.exporters import ComicInfoXmlItemExporter,JsonExport,ItemExporter, ItemImport
-from Comics.utils.FileUtils import fileUtils as fu
-from Comics.loader import ComicEntity
-
-class checkUtils:
-
-    def read(self, item):
-        file = os.path.join(OUTPUT_DIR, item[PROJECT_KEY][0], "error_comics.json")
-        return ItemImport().import_obj(file)
-    #
-    # Check whether a chapter keeps failing
-    def export_error(self, item):
-        if not self.is_error(item):
-            file = os.path.join(OUTPUT_DIR, item[PROJECT_KEY][0], "error_comics.json")
-            try:
-                error_comic = eval(self.read(item))
-            except:
-                error_comic = []
-            error_comic.append({ "name" : ComicPath.new_file_name(item['name']),
-                                 "chapter" : ComicPath.new_file_name(item['chapter']),
-                                 "date" : ComicPath().getYearMonthDay()})
-            fu.save_file(file, json.dumps(error_comic))
-
-    def is_error(self, item):
-        try:
-            for error_c in eval(self.read(item)):
-                (name, chatper, date) = [error_c['name'], error_c['chapter'], error_c['date']]
-                if ComicPath.new_file_name(item['name']) == ComicPath.new_file_name(name) and ComicPath.new_file_name(item['chapter']) == ComicPath.new_file_name(chatper):
-                    return True
-                else:
-                    return False
-        except:
-            return False
-
-
-
\ No newline at end of file
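For orientation, the directory layout `get_file_path` produces, assuming `CBZ_EXPORT_PATH = "CBZ"` and `IMAGES_STORE = "output"` in settings (both assumptions; only the `CBZ/*` and `output/*` entries in `.gitignore` hint at them):

```python
# Layout sketch for ComicPath.get_file_path with project "rm_comic";
# CBZ_EXPORT_PATH="CBZ" and IMAGES_STORE="output" are assumed values.
#
#   result_type="cbz"        -> CBZ/rm_comic/<name>/<chapter>.CBZ
#   result_type="cbz_icon"   -> CBZ/rm_comic/<name>/<chapter>.jpg
#   result_type="icon"       -> rm_comic/icons/<name>/<name>.jpg
#   result_type="images_dir" -> output/rm_comic/images/<name>/<chapter>
#
# With convert=True, name and chapter are first run through OpenCC("t2s")
# and fix_file_name, e.g. "第1話 壞X" -> "第1话 坏X"; path_cbz(item) is the
# convert=True cbz case.
```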
diff --git a/Comics/utils/Constant.py b/Comics/utils/Constant.py
deleted file mode 100644
index 4c76337..0000000
--- a/Comics/utils/Constant.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import os.path,logging
-import re,requests,time
-from datetime import date
-from Comics import settings
-from opencc import OpenCC
-class ComicPath:
-    PREFIX_SCRAMBLE = "scramble="
-
-    @classmethod
-    def getYearMonthDay(cls):
-        today = date.today()
-        # Format as year-month-day
-        return today.strftime("%Y%m%d")
-
-    @classmethod
-    def getDirComicChapter(cls):
-        return None
-
-    @classmethod
-    def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
-
-    @classmethod
-    def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True):
-        file_name = str(file).split("_")[-1]
-        if relative:
-            file_name = os.path.basename(file_name)
-        if relative == "fullpath":
-            file_name = os.path.join(os.path.dirname(file), file_name)
-        if not is_prefix:
-            return file_name.split(".")[0]
-        else:
-            return file_name
-
-    # Convert Traditional Chinese to Simplified Chinese
-    @classmethod
-    def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
-
-    # Normalize into a legal file name
-    @classmethod
-    def fix_file_name(cls, filename, replace=None):
-        if not isinstance(filename, str):
-            return filename
-        in_tab = r'[?*/\|.:><]'
-        str_replace = ""
-        if replace is not None:
-            str_replace = replace
-        filename = re.sub(in_tab, str_replace, filename)
-        count = 1
-        while True:
-            str_file = filename[0-count]
-            if str_file == " ":
-                count += 1
-            else:
-                filename = filename[0:len(filename)+1-count]
-                break
-        return filename
-
-    @classmethod
-    def new_file_name(cls, name): return cls.fix_file_name(cls.chinese_convert(name))
-
-    @classmethod
-    def get_file_path(cls, item, result_type="image", file=None, convert=False):
-        PROJECT = item[settings.PROJECT_KEY][0]
-        if not convert:
-            name = item['name']
-            chapter = item['chapter']
-        else:
-            name = cls.fix_file_name(cls.chinese_convert(item['name']))
-            chapter = cls.fix_file_name(cls.chinese_convert(item['chapter']))
-
-        if result_type == "image":
-            if os.path.sep not in file:
-                file = os.path.join(PROJECT, "images", name, chapter, file)
-        elif result_type == "comic_info":
-            file = os.path.join(PROJECT, "images", name, chapter)
-        elif result_type == "cbz_icon":
-            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".jpg")
-        elif result_type == "down_icon":
-            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon"))
-        elif result_type == "down_cache_icon":
-            file = os.path.join(settings.IMAGES_STORE, cls.get_file_path(item=item, result_type="icon_cache"))
-        elif result_type == "icon":
-            file = os.path.join(PROJECT, "icons", name, name+".jpg")
-        elif result_type == "icon_cache":
-            file = os.path.join(PROJECT, "icons", ".cache", name+".jpg")
-        elif result_type == "cbz":
-            file = os.path.join(settings.CBZ_EXPORT_PATH, PROJECT, name, chapter+".CBZ")
-        elif result_type == "images_dir":
-            file = os.path.join(settings.IMAGES_STORE, PROJECT, "images", name, chapter)
-        return file
-
-    @classmethod
-    def path_cbz(cls, item):
-        return cls.get_file_path(item, result_type="cbz", convert=True)
-
-
-class ntfy:
-    @classmethod
-    def sendMsg(cls, msg,alert=False,sleep=None,error=None):
-        try:
-            print(f"#ntfy: {msg}")
-            if alert:
-                requests.post("https://ntfy.caiwenxiu.cn/PyComic",
-                              data=msg.encode(encoding='utf-8'))
-        except:
-            print(f"#ntfy error: {msg}")
-        if sleep != None:
-            logging.info(f'等待{sleep}秒后进入下一阶段')
-            time.sleep(int(sleep))
-        if error != None:
-            print(f"#ntfy Error: {error}")
-            return False
-        else:
-            return True
\ No newline at end of file
diff --git a/run.py b/run.py
index b6cd60f..c9743c2 100644
--- a/run.py
+++ b/run.py
@@ -2,4 +2,4 @@
 from scrapy import cmdline
 
-cmdline.execute("scrapy crawl rm_comic".split())
\ No newline at end of file
+cmdline.execute("scrapy crawl rm_comic".split())