diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/Comics/exporters.py b/Comics/exporters.py
new file mode 100644
index 0000000..77faca6
--- /dev/null
+++ b/Comics/exporters.py
@@ -0,0 +1,98 @@
+import os.path
+
+from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
+from scrapy.exporters import XmlItemExporter
+from scrapy.exporters import PythonItemExporter
+from Comics.items import ComicInfoItem
+from Comics.items import ComicItem
+from Comics.settings import COMIC_INFO_XML_STORE
+from Comics.utils.Constant import ComicPath
+from scrapy.utils.python import is_listlike
+
+class ItemExporter(PythonItemExporter):
+    def convert(self, data):
+        # recursively decode bytes and rebuild containers so the exported
+        # object is a plain, JSON-friendly structure
+        if isinstance(data, bytes): return data.decode("utf-8")
+        if isinstance(data, dict): return dict(map(self.convert, data.items()))
+        if isinstance(data, tuple): return tuple(map(self.convert, data))
+        if isinstance(data, list): return [self.convert(i) for i in data]
+        return data
+
+    def export_obj(self, obj_item):
+        self.start_exporting()
+        obj_item = self.convert(self.export_item(obj_item))
+        self.finish_exporting()
+        return obj_item
+
+class ComicInfoXmlItemExporter(XmlItemExporter):
+    custom_root_element = "ComicInfo"
+
+    def __init__(self, comic, chapter):
+        file_path = os.path.join(COMIC_INFO_XML_STORE, comic,
+                                 chapter, f"{self.custom_root_element}.xml")
+        dir_path = os.path.dirname(file_path)
+        if not os.path.exists(dir_path): os.makedirs(dir_path)
+        self.xml_file = open(file_path, "wb")
+        super(ComicInfoXmlItemExporter, self).__init__(self.xml_file,
+            root_element=self.custom_root_element,
+            indent=1, fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)
+
+    def serialize_field(self, field, name, value):
+        # convert Traditional to Simplified Chinese during serialization
+        value = ComicPath.chinese_convert(value)
+        return super().serialize_field(field, name, value)
+
+    def start_exporting(self):
+        self.xg.startDocument()
+        self.xg.startElement(self.custom_root_element, {})
+
+    def comic_to_info_item(self, comic_item):
+        # rename ComicItem fields to their ComicInfo counterparts using the
+        # mapping attached by the @setinfo decorator
+        comic_info = {}
+        comic_info_dict = getattr(ComicItem, "data", {})
+        for key, value in ComicItem(comic_item).items():
+            new_key = comic_info_dict.get(key)
+            if new_key is not None:
+                comic_info[new_key] = value
+        return ItemExporter().export_obj(ComicInfoItem(comic_info))
+
+    def export_item(self, item):
+        comic_info = self.comic_to_info_item(item)
+        child_element = "Page"
+        self._beautify_indent(depth=1)
+        self._beautify_newline()
+        for name, value in self._get_serialized_fields(comic_info, default_value=""):
+            if name == "Pages":
+                value = str(value).split(',')
+            if value is not None and value != "":
+                self._export_xml_field(name, value, depth=2, child_element=child_element)
+        self._beautify_indent(depth=1)
+        return comic_info
+
+    def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
+        self._beautify_indent(depth=depth)
+        self.xg.startElement(name, {})
+        if hasattr(serialized_value, "items"):
+            self._beautify_newline()
+            for subname, value in serialized_value.items():
+                self._export_xml_field(subname, value, depth=depth + 1)
+            self._beautify_indent(depth=depth)
+        elif is_listlike(serialized_value):
+            self._beautify_newline()
+            for value in serialized_value:
+                self._export_xml_field(child_element, value, depth=depth + 1)
+            self._beautify_indent(depth=depth)
+        elif isinstance(serialized_value, str):
+            self.xg.characters(serialized_value)
+        else:
+            self.xg.characters(str(serialized_value))
+        self.xg.endElement(name)
+        self._beautify_newline()
+
+    def finish_exporting(self):
+        self.xg.endElement(self.custom_root_element)
+        self.xg.endDocument()
+        self.xml_file.close()
+
+    def export_xml(self, item):
+        self.start_exporting()
+        comic_info = self.export_item(item)
+        self.finish_exporting()
+        return comic_info
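For reference, a minimal usage sketch of the exporter pair above. All field values are invented, and it assumes the package imports cleanly (note that importing it executes settings.py, which builds USER_AGENT via UserAgent() at import time):

```python
# Hypothetical usage; every field value below is made up for illustration.
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.items import ComicItem

item = ComicItem(name="SomeComic", chapter="ch-001", author="A & B",
                 tags="tag1,tag2", dep="summary", genre="Manhwa", index=1,
                 images_name="001,002,003", age_rating="R18+")

# Writes images/SomeComic/ch-001/ComicInfo.xml. comic_to_info_item() renames
# the fields via the @setinfo mapping, and "Pages" is split on commas and
# emitted as nested <Page> elements by _export_xml_field().
ComicInfoXmlItemExporter(comic=item["name"], chapter=item["chapter"]).export_xml(item)
```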
diff --git a/Comics/items.py b/Comics/items.py
index 08bddc2..7700738 100644
--- a/Comics/items.py
+++ b/Comics/items.py
@@ -1,52 +1,93 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/items.html
-
-import scrapy
+from scrapy.item import Item, Field
+from Comics.utils.Constant import ComicPath
+from scrapy.loader.processors import TakeFirst
 
-class ComicsItem(scrapy.Item):
-    # comic title
-    name = scrapy.Field()
-    # link
-    link = scrapy.Field()
+def setinfo(**kwds):
+    # attach the ComicItem -> ComicInfo field-name mapping as a class attribute
+    def decorate(cls):
+        setattr(cls, "data", dict(kwds))
+        return cls
+    return decorate
 
-class ComicItem(scrapy.Item):
-    name = scrapy.Field()
-    chapter = scrapy.Field()
-    list_img = scrapy.Field()
-    author= scrapy.Field()
-    icon = scrapy.Field()
-    tags = scrapy.Field()
-    dep = scrapy.Field()
-    date = scrapy.Field()
-    chapters = scrapy.Field()
-    chapter_href= scrapy.Field()
-    genre = scrapy.Field()
-    age_rating = scrapy.Field()
+def serialize_to_chinese(value):
+    return ComicPath.chinese_convert(value)
 
-class ImageItem(scrapy.Item):
-    image_name = scrapy.Field()
-    image_url = scrapy.Field()
-    image_path = scrapy.Field()
+def serialize_to_fix_file(value):
+    # Simplified-Chinese conversion plus file-name sanitizing
+    file = ComicPath.chinese_convert(value)
+    return ComicPath.fix_file_name(file)
 
-class ComicInfoItem(scrapy.Item):
-    Title= scrapy.Field()        # chapter title, required
-    Series = scrapy.Field()      # comic title, required
-    Number = scrapy.Field()      # number, required
-    SeriesGroup = scrapy.Field() # alias, optional
-    Summary = scrapy.Field()     # summary, required
-    Year = scrapy.Field()        # year, optional
-    Month = scrapy.Field()       # month, optional
-    Day = scrapy.Field()         # day, optional
-    Writer = scrapy.Field()      # author, required
-    Publisher = scrapy.Field()   # publisher, optional
-    Genre = scrapy.Field()       # genre, required
-    Tags = scrapy.Field()        # tags, required
-    Web = scrapy.Field()         # home page, optional
-    PageCount = scrapy.Field()   # total pages, required
-    LanguageISO = scrapy.Field() # language, required
-    AgeRating = scrapy.Field()   # age rating, optional
-    Pages = scrapy.Field()       # page list, required
-    # ComicInfo.xml and ComicChapter.json end
\ No newline at end of file
+class ComicOItem(Item):
+    name = Field()
+    chapterItem = Field()
+
+@setinfo(name="Series", chapter="Title",
+         author="Writer", tags="Tags",
+         dep="Summary", genre="Genre",
+         index="Number", images_name="Pages",
+         age_rating="AgeRating")
+class ComicItem(Item):
+    # number within the comic
+    index = Field()
+    # comic title
+    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
+    # chapter title
+    chapter = Field(serializer=serialize_to_fix_file)
+    # image URLs
+    list_img = Field()
+    # author
+    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
+    # cover URL
+    icon = Field()
+    # tags
+    tags = Field(serializer=serialize_to_chinese)
+    # summary
+    dep = Field(serializer=serialize_to_chinese)
+    # date
+    date = Field()
+    # genre
+    genre = Field()
+    # age rating
+    age_rating = Field()
+
+    images = Field()
+    images_name = Field()
+
+class ImageItem(Item):
+    image_name = Field()
+    image_url = Field()
+    image_path = Field()
+
+def serializer_info_writer(value):
+    # normalize author separators ("&" and spaces) to commas
+    value = str(value).replace("&", " ")
+    return ",".join(v for v in value.split(" ") if v)
+
+class ComicInfoItem(Item):
+    Title = Field()        # chapter title, required
+    Series = Field()       # comic title, required
+    Number = Field()       # number, required
+    SeriesGroup = Field()  # alias, optional
+    Summary = Field()      # summary, required
+    Year = Field()         # year, optional
+    Month = Field()        # month, optional
+    Day = Field()          # day, optional
+    Writer = Field(serializer=serializer_info_writer)  # author, required
+    Publisher = Field()    # publisher, optional
+    Genre = Field()        # genre, required
+    Tags = Field()         # tags, required
+    Web = Field()          # home page, optional
+    PageCount = Field()    # total pages, required
+    LanguageISO = Field()  # language, required
+    AgeRating = Field()    # age rating, optional
+    Pages = Field()        # page list, required
+    Page = Field()
+    # ComicInfo.xml and ComicChapter.json end
\ No newline at end of file
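How the @setinfo mapping drives the rename done by comic_to_info_item, reduced to plain dicts (values invented):

```python
# Standalone illustration; mirrors comic_to_info_item() without Scrapy types.
mapping = {"name": "Series", "chapter": "Title", "author": "Writer",
           "tags": "Tags", "dep": "Summary", "genre": "Genre",
           "index": "Number", "images_name": "Pages", "age_rating": "AgeRating"}
comic = {"name": "SomeComic", "chapter": "ch-001", "author": "A",
         "index": 1, "images_name": "001,002", "date": "2023-01-01"}

info = {mapping[k]: v for k, v in comic.items() if k in mapping}
# info == {'Series': 'SomeComic', 'Title': 'ch-001', 'Writer': 'A',
#          'Number': 1, 'Pages': '001,002'}
# Unmapped keys such as "date" drop out, exactly as in comic_to_info_item().
```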
diff --git a/Comics/pipelines.py b/Comics/pipelines.py
index 208df52..a114f8e 100644
--- a/Comics/pipelines.py
+++ b/Comics/pipelines.py
@@ -8,93 +8,74 @@
 import os,requests,re,scrapy,logging
 from Comics import settings
 from Comics.utils.FileUtils import imageUtils
+from Comics.utils.FileUtils import fileUtils
 from Comics.utils.Constant import ComicPath
 from Comics.items import ComicItem
 from Comics.items import ImageItem
 from scrapy.pipelines.images import ImagesPipeline
-from scrapy.exporters import XmlItemExporter
-from itemadapter import ItemAdapter
-
+from Comics.exporters import ComicInfoXmlItemExporter
+from Comics.exporters import ItemExporter
+from Comics.utils.CBZUtils import CBZUtils
 
 class ComicsPipeline:
-    def open_spider(self,spider):
-        self.fp = open('book.json','w',encoding='utf-8')
-
+    def open_spider(self, spider):
+        pass
     # item is the object yielded by the spider
     def process_item(self, item, spider):
-        self.fp.write(str(item))
+        if isinstance(item, ComicItem):
+            item = ComicItem(ItemExporter().export_obj(item))
+            file = os.path.join("json", item['name'], item['chapter'])
+            fileUtils.save_file(f"{file}.json", item)
         return item
     # image parsing
     def close_spider(self,spider):
-        self.fp.close()
+        pass
 
 class ImageParsePipeline:
     def process_item(self, item, spider):
         if isinstance(item, ComicItem):
-            list_img = item['list_img']
             count = 1
-            scramble_count = 0
-            list_image_item = []
-            for image in list_img:
+            images_item = []
+            for image in item['list_img']:
                 (image_src,scramble) = [image.get("src"),image.get("scramble")]
                 count_image = "{:0>3d}".format(count)
-                image_src_suffix = "."+str(image_src).split(".")[-1]
-                image_file_name = count_image+image_src_suffix
+                suffix = "."+str(image_src).split(".")[-1]
+                image_name = count_image + suffix
                 if scramble:
-                    de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
+                    de_str = str(image_src).split("/")[-1].replace(suffix,"==")
                     blocks_num = imageUtils.encodeImage(de_str)
-                    scramble_image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
-                    scramble_count += 1
-                image_path = os.path.join(item['name'],item['chapter'],scramble_image_file_name)
-                image_path = ComicPath.ChineseConvert(image_path)
-                list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_path))
-                count+=1
-            return list_image_item
+                    image_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=suffix)
+                image_path = os.path.join(item['name'],item['chapter'], image_name)
+                images_item.append(ImageItem(image_name=count_image + suffix,image_url=image_src,image_path=image_path))
+                count += 1
+            item['images'] = images_item
         return item
 
 class ImgDownloadPipeline(ImagesPipeline):
     def file_path(self, request, response=None, info=None, *, item=None):
         image = request.meta['item']
         image_path = image['image_path']
-        en_image_path = os.path.join(os.path.dirname(image_path),image['image_name'])
-        if os.path.exists(en_image_path): return en_image_path
-        else: return image_path
+        en_image_path = os.path.join(os.path.dirname(image_path), image['image_name'])
+        # prefer the plain (already descrambled) name if it exists on disk
+        if os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)):
+            return en_image_path
+        else:
+            return image_path
 
     def get_media_requests(self, item, info):
-        for image in item:
-            host = re.sub(r'(http://|https://)', '', image['image_url']).split('/')[0]
+        for image in item['images']:
             yield scrapy.Request(url=image['image_url'], meta={'item': image})
 
     def item_completed(self, results, item, info):
-        if len(results) == len(item):
-            for image in results:
-                success = image[0]
-                img = image[1]
-                img_path = os.path.join(settings.IMAGES_STORE,img['path'])
-                # descramble the image
-                imageUtils.deScrambleImagesByPath(img_path)
-            return item
-
-class ComicInfoXmlPipeline:
-
-    def open_spider(self, spider):
-        self.xml_exporter = {}
-
-    def close_spider(self, spider):
-        for exporter, xml_file in self.xml_exporter.values():
-            exporter.finish_exporting()
-            xml_file.close()
-
-    def _exporter_for_item(self, item):
-        adapter = ItemAdapter(item)
-        xml_file = open("ComicInfo.xml", "wb")
-        exporter = XmlItemExporter(xml_file)
-        exporter.start_exporting()
-        self.xml_exporter = (exporter, xml_file)
-        return self.xml_exporter
-
-    def process_item(self, item, spider):
-        exporter = self._exporter_for_item(item)
-        exporter.export_item(item)
-        return item
-
+        info_img = []
+        for success, img in results:
+            if not success:
+                continue
+            img_path = os.path.join(settings.IMAGES_STORE, img['path'])
+            # descramble the downloaded image in place
+            img_path = imageUtils.deScrambleImagesByPath(img_path)
+            info_img.append(os.path.basename(img_path).split('.')[0])
+        item['images_name'] = ",".join(info_img)
+        # generate ComicInfo.xml next to the images
+        ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
+        # pack the chapter into a CBZ archive
+        CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], remove=False)
+        return item
\ No newline at end of file
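The naming scheme used by ImageParsePipeline, in isolation (the URL is invented; scrambled sources instead get their name from ComicPath.getFileScrambleImageName):

```python
# Zero-padded page naming as done in ImageParsePipeline (hypothetical URL).
image_src = "https://cdn.example.com/img/abc123.jpg"
count_image = "{:0>3d}".format(1)              # -> "001"
suffix = "." + str(image_src).split(".")[-1]   # -> ".jpg"
image_name = count_image + suffix              # -> "001.jpg"
print(image_name)
```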
diff --git a/Comics/settings.py b/Comics/settings.py
index 51e79ef..4d04dd7 100644
--- a/Comics/settings.py
+++ b/Comics/settings.py
@@ -18,17 +18,17 @@ NEWSPIDER_MODULE = 'Comics.spiders'
 #USER_AGENT = 'Comics (+http://www.yourdomain.com)'
 USER_AGENT = UserAgent().random
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = False
+ROBOTSTXT_OBEY = False
+HTTPERROR_ALLOWED_CODES = [403]
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-IMAGES_URLS_FIELD = "image_url"
-IMAGES_RESULT_FIELD = "image_path"
 IMAGES_STORE = 'images'
+COMIC_INFO_XML_STORE = 'images'
 DOWNLOAD_DELAY = 20
 # retries
 RETRY_ENABLED = True
@@ -66,7 +66,7 @@ COOKIES_ENABLED = False
 DOWNLOADER_MIDDLEWARES = {
 #    'Comics.middlewares.ComicsDownloaderMiddleware': 543,
 #    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
-    'Comics.middlewares.ProxyMiddleware' : 100,
+    'Comics.middlewares.ProxyMiddleware': 100,
     'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
 }
@@ -82,7 +82,6 @@ ITEM_PIPELINES = {
     'Comics.pipelines.ComicsPipeline': 300,
     'Comics.pipelines.ImageParsePipeline': 400,
     'Comics.pipelines.ImgDownloadPipeline': 500,
-    'Comics.pipelines.ComicInfoXmlPipeline': 600,
 }
@@ -103,5 +102,28 @@ AUTOTHROTTLE_DEBUG = False
 HTTPCACHE_ENABLED = True
 HTTPCACHE_EXPIRATION_SECS = 0
 HTTPCACHE_DIR = 'httpcache'
-HTTPCACHE_IGNORE_HTTP_CODES = []
+HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404, 403]
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+CBZ_EXPORT_PATH = "CBZ"
+# ComicInfo export: file name and field order
+COMIC_INFO_XML_FILE = "ComicInfo.xml"
+COMIC_INFO_FIELDS_TO_EXPORT = [
+    "Title",
+    "Series",
+    "Number",
+    "SeriesGroup",
+    "Summary",
+    "Year",
+    "Month",
+    "Day",
+    "Writer",
+    "Publisher",
+    "Genre",
+    "Tags",
+    "Web",
+    "PageCount",
+    "LanguageISO",
+    "AgeRating",
+    "Pages"
+]
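COMIC_INFO_FIELDS_TO_EXPORT is handed to the exporter as fields_to_export, which both filters and fixes the order of the emitted elements. A throwaway, self-contained demonstration of that Scrapy behavior:

```python
# Self-contained demo: fields_to_export controls XML element order.
import io
from scrapy.item import Item, Field
from scrapy.exporters import XmlItemExporter

class Demo(Item):
    b = Field()
    a = Field()

buf = io.BytesIO()
exporter = XmlItemExporter(buf, fields_to_export=["a", "b"])
exporter.start_exporting()
exporter.export_item(Demo(a="1", b="2"))
exporter.finish_exporting()
print(buf.getvalue().decode())  # <a> comes before <b>, per fields_to_export
```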
diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py
index 4c7ba47..86fef07 100644
--- a/Comics/spiders/rm_comic.py
+++ b/Comics/spiders/rm_comic.py
@@ -1,26 +1,9 @@
+import urllib.parse
+
 import scrapy,json,requests
 from Comics.items import ComicItem
 from Comics.utils.FileUtils import CommonUtils
-import threading
-import toml
-
-class ErrorLog:
-    def __init__(self) -> None:
-        self.lock = threading.Lock()
-
-    def err_ls(self, dic):
-        self.lock.acquire()
-        with open('error.toml', 'r+t') as f:
-            data = toml.load('error.toml')
-            f.seek(0, 0)
-            f.truncate()
-            dic_name = f'err_{len(data)}'
-            data[dic_name] = dic
-            _ = toml.dump(data, f)
-        self.lock.release()
-
-
-error_logger = ErrorLog()
+from scrapy.loader import ItemLoader
 
 class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
@@ -29,45 +12,51 @@ class RmComicSpider(scrapy.Spider):
     #start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
 
     def start_requests(self):
-        yield scrapy.Request(self.main_url + '/books/0a7e8bd1-4cfa-481a-b067-1df663fb2017', callback=self.parse_comic)
+        yield scrapy.Request(self.main_url + '/books/306ec1e2-f701-4fda-bb78-041ad6ec4020', callback=self.parse_comic)
 
     def parse_comic(self, response):
         comic = ComicItem()
+#        comic_item = ItemLoader(item=ComicItem(), response=response)
         comic['name'] = response.xpath('//div[@class="col"]/h5/text()').extract_first()
         comic['icon'] = response.xpath('//img[@class="img-thumbnail"]/@src').extract_first()
         comic['author'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[1]/text()').extract()[1]
         comic['tags'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()').extract_first()
         comic['dep'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[4]/text()').extract()[1]
         comic['date'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()').extract()[1]
-        comic['chapters'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()').extract()
-        comic['chapter_href'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href').extract()
-        for link in comic['chapter_href']:
-            yield scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err)
-
-    def err(self):
-        print("Error=====")
-
+        comic['genre'] = "韩漫"
+        comic['age_rating'] = "R18+"
+        chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
+                                  '//div[contains(@class,"bookid_chapter")]/a/text()').extract()
+        chapter_href = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
+                                      '//div[contains(@class,"bookid_chapter")]/a/@href').extract()
+        #for chapter, link in zip(chapters, chapter_href):
+        for i, link in enumerate(chapter_href, start=1):
+            yield scrapy.Request(self.main_url+link, meta={'item': comic, 'number': i}, callback=self.parse_chapter)
 
     def parse_chapter(self, response):
         item = response.meta['item']
+        number = response.meta['number']
         data = response.xpath('//script[@id="__NEXT_DATA__"]/text()').extract_first()
-        str_exec="props.pageProps."
-        comic_name = CommonUtils.parseExec(data,str_exec+"bookName")
-        chapterName = CommonUtils.parseExec(data,str_exec+"chapterName")
-        description = CommonUtils.parseExec(data,str_exec+"description")
-        images = CommonUtils.parseExec(data,str_exec+"images")
-        chapter_api_url = CommonUtils.parseExec(data,str_exec+"chapterAPIPath")
+        str_exec = "props.pageProps."
+        comic_name = CommonUtils.parseExec(data, str_exec+"bookName")
+        chapterName = CommonUtils.parseExec(data, str_exec+"chapterName")
+        description = CommonUtils.parseExec(data, str_exec+"description")
+        images = CommonUtils.parseExec(data, str_exec+"images")
+        chapter_api_url = CommonUtils.parseExec(data, str_exec+"chapterAPIPath")
         item['chapter'] = chapterName
         item['list_img'] = images
+        item['index'] = number
         if chapter_api_url != None:
-            yield scrapy.Request(url=self.main_url+chapter_api_url,meta={'item' : item}, callback=self.parse_chapter_api, errback=self.err)
+            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': item}, callback=self.parse_chapter_api)
         else:
-            item['list_img'] = images
             yield item
-
-    def parse_chapter_api(self,response,item):
-        data = response.meta['item']
-        print(item)
-        return response
-
+
+    def parse_chapter_api(self, response):
+        item = response.meta['item']
+        item['chapter'] = CommonUtils.parseExec(response.text, "chapter.name")
+        item['list_img'] = CommonUtils.parseExec(response.text, "chapter.images")
+        yield item
 
     def parse(self, response):
         raise NotImplementedError
\ No newline at end of file
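parse_chapter reads its fields out of the page's __NEXT_DATA__ JSON through CommonUtils.parseExec dotted paths. parseExec itself lives in FileUtils.py and is not shown in this diff, but the deleted Comic.py below carries an equivalent walker; a self-contained reimplementation under that assumption, with an invented payload:

```python
# Assumed behavior of parseExec: JSON-decode, then walk a dotted key path.
import json

def parse_exec(data, path):
    if not isinstance(data, dict):
        data = json.loads(data)
    for key in path.split("."):
        if data is None:          # guard the walk against missing keys
            return None
        data = data.get(key)
    return data

payload = '{"props": {"pageProps": {"bookName": "SomeComic", "images": []}}}'
print(parse_exec(payload, "props.pageProps.bookName"))  # -> SomeComic
```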
diff --git a/Comics/utils/CBZUtils.py b/Comics/utils/CBZUtils.py
new file mode 100644
index 0000000..3eba3ff
--- /dev/null
+++ b/Comics/utils/CBZUtils.py
@@ -0,0 +1,105 @@
+import os, shutil, time, logging
+from datetime import datetime
+from pathlib import Path
+from zipfile import ZipFile
+from Comics.settings import COMIC_INFO_XML_FILE, CBZ_EXPORT_PATH, IMAGES_STORE
+
+class CBZUtils:
+
+    @classmethod
+    def readDirsOrFiles(cls, dir, type):
+        # list either the files or the sub-directories of dir
+        data = []
+        files = os.listdir(dir)
+        for file in files:
+            path = os.path.join(dir, file)
+            if type == "files" and os.path.isfile(path):
+                data.append(path)
+            if type == "dirs" and os.path.isdir(path):
+                data.append(path)
+        return data
+
+    @classmethod
+    def zip_compression(cls, source_dir=None, target_file=None, remove=True):
+        target_dir = os.path.dirname(target_file)
+        if not os.path.exists(target_dir):
+            os.makedirs(target_dir)
+        if not os.path.exists(target_file) and source_dir is not None:
+            with ZipFile(target_file, mode='w') as zf:
+                for path, dir_names, filenames in os.walk(source_dir):
+                    path = Path(path)
+                    arc_dir = path.relative_to(source_dir)
+                    y = 0
+                    for filename in filenames:
+                        y = y + 1
+                        print("packing: " + str(y) + "/" + str(len(filenames)), path.joinpath(filename))
+                        zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
+            logging.info(f"packed: {target_file}")
+
+    @classmethod
+    def packComicChapterCBZ(cls, comic, chapter, remove=True):
+        images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
+        cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
+        if os.path.exists(images_chapter_path):
+            dirs = os.listdir(images_chapter_path)
+            for file in dirs:
+                # leftover scrambled originals must not end up in the archive
+                if file.startswith("scramble="):
+                    try:
+                        os.remove(os.path.join(images_chapter_path, file))
+                    except OSError:
+                        print(f"failed to remove {file}, aborting pack")
+                        return False
+            cls.zip_compression(images_chapter_path, cbz_chapter_path)
+            time.sleep(0.1)
+            if remove: shutil.rmtree(images_chapter_path)
+            return True
+
+    @classmethod
+    def replaceZip(cls, filepath, unpack_dir=None):
+        if not cls.compareFileDate(filepath): return None
+        if unpack_dir is None:
+            unpack_dir = str(filepath).split(".")[0]
+        with ZipFile(filepath, 'r') as fz:
+            for file in fz.namelist():
+                if file.endswith(".jpg"):
+                    data = fz.read(file)
+                    if len(data) < 500 and os.path.exists(filepath):
+                        os.remove(filepath)
+                        print(f"incomplete data, removed: {filepath}")
+        if cls.compareFileDate(filepath):
+            os.utime(filepath)
+            print(f"updated file mtime: {filepath}")
+        if os.path.exists(unpack_dir):
+            shutil.rmtree(unpack_dir)
+        # delete the main.ftl file
+        #delete_filename = ''
+        #if os.path.exists(delete_filename):
+        #    os.remove(delete_filename)
+        #    time.sleep(60)
+        # shutil.copy(src, dst): copy main.ftl into the directory to compress
+        #cls.zip_compression()
+
+    # True when the file's mtime is older than the hard-coded cutoff
+    @classmethod
+    def compareFileDate(cls, filepath):
+        if os.path.exists(filepath):
+            ctime = os.path.getmtime(filepath)
+            str_ctime = datetime.fromtimestamp(int(ctime))
+            file_ctime = str(str_ctime.year)+"{:0>2d}".format(str_ctime.month)+"{:0>2d}".format(str_ctime.day)+"{:0>2d}".format(str_ctime.hour)
+            c_ctime = 2023011603  # cutoff: 2023-01-16 03:00
+        else:
+            return False
+        if int(file_ctime) < c_ctime:
+            return True
+        return False
+
+    @classmethod
+    def zip_info(cls, path, filter=True):
+        result = None
+        try:
+            with ZipFile(path, "r") as zip_file:
+                result = zip_file.namelist()
+            if filter:
+                result.remove(COMIC_INFO_XML_FILE)
+        except Exception as e:
+            print(e)
+        return result
\ No newline at end of file
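Expected call pattern for the packer; this is what ImgDownloadPipeline.item_completed invokes, with invented comic and chapter names (importing the package also executes settings.py):

```python
# Hypothetical invocation: zips images/SomeComic/ch-001/ into
# CBZ/SomeComic/ch-001.CBZ; remove=False keeps the source images on disk.
from Comics.utils.CBZUtils import CBZUtils

CBZUtils.packComicChapterCBZ(comic="SomeComic", chapter="ch-001", remove=False)
```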
[None,None,"Genre","流派",True] - dict_tags = [None,None,"Tags","标签",True] - dict_homepage = [None,None,"Web","主页",False] - dict_page_count = [None,None,"PageCount","总页数",True] - dict_language = [None,None,"LanguageISO","语言",True] - dict_agerating = [None,None,"AgeRating","年龄分级",False] - dict_pages = [None,None,"Pages","页码",True] - CURRENT_DOWN_LINK = None - # ComicInfo.xml and ComicChapter.json end - dict_icon = [None,None,"Icon","图标",True] - dict_chapter_imgs = [None,None,"ChapterImgs","图像",True] - #主页 - dict_list_chapter = [None,None,"ListChapter","全部章节名",True] - (update_at,current_chapter_img,file_chapter_imgs) = [None,None,None] - - - #繁体中文转简体中文 - @classmethod - def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text)) - #处理成符合规定的文件名 - @classmethod - def fixFileName(cls,filename,replace=None): - if not isinstance(filename,str): return filename - intab = r'[?*/\|.:><]' - str_replace = "" - if replace != None: str_replace = replace - filename = re.sub(intab, str_replace, filename) - count = 1 - while True: - str_file = filename[0-count] - if str_file == " ": count += 1 - else: - filename = filename[0:len(filename)+1-count] - break - return filename - - @classmethod - def setValue(cls,value): - if value != None: value = cls.ChineseConvert(value) - return value - - @classmethod - def setField(cls,field,value,origin=True,convert=True): - if value != None: - if origin: - field[1] = value - if convert: value = cls.ChineseConvert(value) - field[0] = value - return field - - @classmethod - def getFieldValue(cls,field): - if field == None: return None - return field[0] - @classmethod - def setFieldOrigin(cls,filed,origin): - filed[1] = origin - return filed - @classmethod - def getFieldOrigin(cls,filed): return filed[1] - @classmethod - def getFieldNode(cls,filed): return filed[2] - - @classmethod - def getValue(cls,field,exec=None): - if exec != None: return cls.parseExec(field,exec=exec) - return field - #章节名 - @classmethod - def setChapterName(cls,value,exec=None): - value = cls.fixFileName(cls.parseExec(value,exec=exec)) - OldUtils.setOldChapter(value) - cls.dict_chapter = cls.setField(cls.dict_chapter,value) - - @classmethod - def getChapterName(cls): return cls.getFieldValue(cls.dict_chapter) - @classmethod - def getOriginChapterName(cls): return cls.getFieldOrigin(cls.dict_chapter) - - #漫画名 - @classmethod - def setComicName(cls,value,exec=None): - value = cls.fixFileName(cls.parseExec(value,exec=exec)) - OldUtils.setOldComicName(value) - cls.dict_comic_name = cls.setField(cls.dict_comic_name,value) - - @classmethod - def getComicName(cls): return cls.getFieldValue(cls.dict_comic_name) - @classmethod - def getOriginComicName(cls): return cls.getFieldOrigin(cls.dict_comic_name) - #编号 - @classmethod - def setNumber(cls,value): cls.dict_number = cls.setField(cls.dict_number,value) - @classmethod - def getNumber(cls): return cls.getFieldValue(cls.dict_number) - #概述 - @classmethod - def setDep(cls,value,exec=None): - cls.dict_dep = cls.setField(cls.dict_dep,cls.parseExec(value,exec=exec)) - @classmethod - def getDep(cls): return cls.getFieldValue(cls.dict_dep) - #作者 - @classmethod - def setAuthor(cls,value): cls.dict_author = cls.setField(cls.dict_author,value) - @classmethod - def getAuthor(cls): return cls.getFieldValue(cls.dict_author) - #流派 - @classmethod - def setGenre(cls,value): cls.dict_genre = cls.setField(cls.dict_genre,value) - @classmethod - def getGenre(cls): return cls.getFieldValue(cls.dict_genre) - #语言 - @classmethod - def setLanguage(cls,value): 
cls.dict_language = cls.setField(cls.dict_language,value) - @classmethod - def getLanguage(cls): return cls.getFieldValue(cls.dict_language) - #年龄分级 - @classmethod - def setAgeRating(cls,value): cls.dict_agerating = cls.setField(cls.dict_agerating,value) - @classmethod - def getAgeRating(cls): return cls.getFieldValue(cls.dict_agerating) - #标签 - @classmethod - def setTags(cls,value): cls.dict_tags = cls.setField(cls.dict_tags,value) - @classmethod - def getTags(cls): return cls.getFieldValue(cls.dict_tags) - #总页数 - @classmethod - def setPageCount(cls,value): cls.dict_page_count = cls.setField(cls.dict_page_count,value) - @classmethod - def getPageCount(cls): return cls.getFieldValue(cls.dict_page_count) - - #------------------------------------------------------------------------ - @classmethod - def parseExec(cls,data,exec,item=True): - if data !=None and exec != None: - dots = str(exec).split(".") - if not isinstance(data,dict): data = json.loads(data) - for dot in dots: - data = data.get(dot) - return data - @classmethod - def setHomePage(cls,value): cls.dict_homepage = cls.setField(cls.dict_homepage,value) - @classmethod - def getHomePage(cls): return cls.getFieldValue(cls.dict_homepage) - @classmethod - def setIcon(cls,value): cls.dict_icon = cls.setField(cls.dict_icon,value,convert=False) - @classmethod - def getIcon(cls): return cls.getFieldValue(cls.dict_icon) - @classmethod - def setListChapter(cls,value): cls.dict_list_chapter = cls.setField(cls.dict_list_chapter,value,convert=False) - @classmethod - def getListChapter(cls): return cls.getFieldValue(cls.dict_list_chapter) - @classmethod - def getLenChapters(cls): return len(cls.getListChapter()) - @classmethod - def setChapterImgs(cls,value,exec=None,item=None): - cls.dict_chapter_imgs = cls.setField(cls.dict_chapter_imgs,cls.parseExec(value,exec=exec,item=item),convert=False) - @classmethod - def getChapterImgs(cls): return cls.getFieldValue(cls.dict_chapter_imgs) - @classmethod - def setUpdateAt(cls,value): cls.update_at = value - @classmethod - def getUpdateAt(cls): return cls.update_at - @classmethod - def setCurrentChapterImg(cls,value): cls.current_chapter_img = value - @classmethod - def getCurrentChapterImg(cls): return cls.current_chapter_img - @classmethod - def setChapterFilesName(cls,value): cls.file_chapter_imgs= value - @classmethod - def getChapterFilesName(cls): return cls.file_chapter_imgs - @classmethod - def setCurrentDownLink(cls,value): cls.CURRENT_DOWN_LINK = value - @classmethod - def getCurrentDownLink(cls): return cls.CURRENT_DOWN_LINK - -class ListComic: - LIST_COMIC_QUEUE = Queue() - (LIST_COMIC_NAME,LIST_COMIC_LINK,LIST_COMIC_UPDATEAT) = [None,None,None] - - @classmethod - def setListComicsLinksUpdateAt(cls,names,links,update_at): - if isinstance(names,list) and isinstance(links,list) and isinstance(update_at,list): - for x in range(0,len(names)): - cls.LIST_COMIC_QUEUE.put([names[x],links[x],update_at[x]]) - @classmethod - def getListComicsLinksUpdateAt(cls): - result = None - if cls.LIST_COMIC_NAME != None and cls.LIST_COMIC_LINK != None: - cls.setListComicsLinksUpdateAt(cls.LIST_COMIC_NAME,cls.LIST_COMIC_LINK,cls.LIST_COMIC_UPDATEAT) - (cls.LIST_COMIC_NAME,cls.LIST_COMIC_LINK,cls.LIST_COMIC_UPDATEAT) = [None,None,None] - if not cls.LIST_COMIC_QUEUE.empty(): result = cls.LIST_COMIC_QUEUE.get(False) - return result - - @classmethod - def addListComicChapterLink(cls,name,link,update_at): - if name != None and link != None: - cls.LIST_COMIC_QUEUE.put(name,link,update_at) - - @classmethod - def 
getListValue(cls,result,type,start_add=None,result_type="list"): - if result == None: return None - if type == None: return result - if result_type == "list" and type != None: - data = [] - for x in range(0, len(result)): - if start_add != None: - data.append(start_add+result[x].get(type)) - else: - data.append(result[x].get(type)) - return data - return result - - @classmethod - def setListComicName(cls,value,type=None): cls.LIST_COMIC_NAME = cls.getListValue(value,type) - @classmethod - def getListComicName(cls): return cls.LIST_COMIC_NAME - @classmethod - def setListComicChapterLink(cls,value,type=None,start_add=None): cls.LIST_COMIC_LINK = cls.getListValue(value,type,start_add) - @classmethod - def getListComicChapterLink(cls): return cls.LIST_COMIC_LINK - @classmethod - def setListComicUpdateAt(cls,value,type=None): cls.LIST_COMIC_UPDATEAT = cls.getListValue(value,type) - @classmethod - def getListComicUpdateAt(cls): return cls.LIST_COMIC_UPDATEAT - @classmethod - def getListComicChapterLink(cls): return cls.LIST_COMIC_QUEUE.get(False) - - #domain end.... \ No newline at end of file diff --git a/Comics/utils/ComicInfo.py b/Comics/utils/ComicInfo.py index 03c6755..aa5e22f 100644 --- a/Comics/utils/ComicInfo.py +++ b/Comics/utils/ComicInfo.py @@ -1,41 +1,14 @@ import json,os import logging from xml.dom.minidom import Document -from Comics.utils.Comic import Comic from Comics.utils.Constant import ComicPath +from itemadapter import is_item, ItemAdapter -class ComicInfoEntity: - @classmethod - def getNodes(cls): - return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_number,Comic.dict_comic_names, - Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author, - Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count, - Comic.dict_language,Comic.dict_agerating,Comic.dict_pages] - @classmethod - def getJsonNodes(cls): - return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_icon,Comic.dict_number, - Comic.dict_comic_names, - Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author, - Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count, - Comic.dict_language,Comic.dict_agerating,Comic.dict_pages, - Comic.dict_list_chapter,Comic.dict_chapter_imgs] - class ComicInfo: IS_NEW_ICON = False document = Document() path_comic_info = None - @classmethod - def parseExec(cls,data,exec,start_add=None,item=True): - if data !=None and exec != None: - dots = str(exec).split(".") - if not isinstance(data,dict): data = json.loads(data) - for dot in dots: - data = data.get(dot) - if start_add != None and data != None: - data = start_add+data - return data - @classmethod def setNodeAndValue(cls,node,value): if value != None: @@ -50,12 +23,12 @@ class ComicInfo: #页数 @classmethod def setPages(cls,values=None): - if values == None: values = Comic.getChapterFilesName() + #if values == None: values = Comic.getChapterFilesName() if values != None and isinstance(values,list): suffix = "."+str(values[0]).split(".")[-1] join_list=",".join(values).replace(suffix,"") values = join_list.split(",") - Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values)) + #Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values)) root_node = cls.document.createElement("Pages") if cls.IS_NEW_ICON: #添加封面 @@ -68,12 +41,12 @@ class ComicInfo: page = page.split("_")[-1] c_node.setAttribute("Image",page) root_node.appendChild(c_node) - Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False) + #Comic.dict_pages = 
diff --git a/Comics/utils/ComicInfo.py b/Comics/utils/ComicInfo.py
index 03c6755..aa5e22f 100644
--- a/Comics/utils/ComicInfo.py
+++ b/Comics/utils/ComicInfo.py
@@ -1,41 +1,14 @@
 import json,os
 import logging
 from xml.dom.minidom import Document
-from Comics.utils.Comic import Comic
 from Comics.utils.Constant import ComicPath
+from itemadapter import is_item, ItemAdapter
 
-class ComicInfoEntity:
-    @classmethod
-    def getNodes(cls):
-        return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_number,Comic.dict_comic_names,
-            Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author,
-            Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count,
-            Comic.dict_language,Comic.dict_agerating,Comic.dict_pages]
-    @classmethod
-    def getJsonNodes(cls):
-        return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_icon,Comic.dict_number,
-            Comic.dict_comic_names,
-            Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author,
-            Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count,
-            Comic.dict_language,Comic.dict_agerating,Comic.dict_pages,
-            Comic.dict_list_chapter,Comic.dict_chapter_imgs]
-
 class ComicInfo:
     IS_NEW_ICON = False
     document = Document()
     path_comic_info = None
 
-    @classmethod
-    def parseExec(cls,data,exec,start_add=None,item=True):
-        if data !=None and exec != None:
-            dots = str(exec).split(".")
-            if not isinstance(data,dict): data = json.loads(data)
-            for dot in dots:
-                data = data.get(dot)
-            if start_add != None and data != None:
-                data = start_add+data
-        return data
-
     @classmethod
     def setNodeAndValue(cls,node,value):
         if value != None:
@@ -50,12 +23,12 @@ class ComicInfo:
     # page count
     @classmethod
     def setPages(cls,values=None):
-        if values == None: values = Comic.getChapterFilesName()
+        #if values == None: values = Comic.getChapterFilesName()
         if values != None and isinstance(values,list):
             suffix = "."+str(values[0]).split(".")[-1]
             join_list=",".join(values).replace(suffix,"")
             values = join_list.split(",")
-        Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values))
+        #Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values))
         root_node = cls.document.createElement("Pages")
         if cls.IS_NEW_ICON:
             # add the cover page
@@ -68,12 +41,12 @@ class ComicInfo:
             page = page.split("_")[-1]
             c_node.setAttribute("Image",page)
             root_node.appendChild(c_node)
-        Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False)
+        #Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False)
 
     @classmethod
     def getBaseUrl(cls,url=None):
-        if url == None:
-            url = Comic.getHomePage()
+        #if url == None:
+        #    url = Comic.getHomePage()
         (num,index) = [3,0]
         for x in range(0, num):
             index = str(url).find("/",index)+1
@@ -84,24 +57,30 @@ class ComicInfo:
     def root_node(cls,root_value): return cls.document.createElement(root_value)
 
     @classmethod
-    def add_nodes(cls,root,list_value):
-        if len(list_value) == 0: return list_value
-        for value in list_value:
-            #Comic.chapter
-            if value[0] == None and value[4]:
-                # value[0] is empty but value[4] marks the field as required
-                msg = f"empty value for key={value[3]}, value[0]={value[0]}, required value[4]={value[4]}"
-                logger.error(msg)
-                exit()
-            if value[0] != None: root.appendChild(cls.setNodeAndValue(value[2],value[0]))
+    def add_nodes(cls,root,item):
+        item = ItemAdapter(item)
+        keys = item.keys()
+        files = item.field_names()
+        values = item.values()
+        # TODO: port the required-field validation below to ItemAdapter
+        #if len(list_value) == 0: return list_value
+        #for value in list_value:
+        #    if value[0] == None and value[4]:
+        #        # value[0] is empty but value[4] marks the field as required
+        #        msg = f"empty value for key={value[3]}, value[0]={value[0]}, required value[4]={value[4]}"
+        #        logging.error(msg)
+        #        exit()
+        #    if value[0] != None: root.appendChild(cls.setNodeAndValue(value[2],value[0]))
 
     @classmethod
     def initComicInfoXML(cls):
         cls.setPages()
 
     @classmethod
-    def writeComicInfoXML(cls,overlay=False):
-        save_path = ComicPath.getPathComicInfoXML()
+    def writeComicInfoXML(cls,item,overlay=False):
+        #save_path = ComicPath.getPathComicInfoXML()
+        save_path = "ComicInfo.xml"
         if os.path.exists(save_path):
             if overlay: os.remove(save_path)
@@ -113,44 +92,8 @@ class ComicInfo:
         root = cls.root_node("ComicInfo")
         new_document = Document()
         new_document.appendChild(root)
-        cls.add_nodes(root,ComicInfoEntity.getNodes())
+        cls.add_nodes(root, item)
         with open(save_path, "w", encoding="utf-8") as fo:
             new_document.writexml(fo, indent='', addindent='\t', newl='\n', encoding="utf-8")
             fo.close()
-        logging.info(f"generated file... {save_path}")
-
-    @classmethod
-    def setComicInfo(cls,comicname=None,homepage=None,alias=None,author=None,icon=None,tags=None,
-        dep=None,genre=None,lang=None,age_rating=None,chapters=None,current_chapter_img=None):
-        author = ",".join(set(str(str(author).replace("&",",").replace(" ",",")).split(",")))
-        Comic.setHomePage(homepage)
-        Comic.setIcon(icon)
-        Comic.setListChapter(chapters)
-        #Comic.setUpdateAt(update_at)
-        Comic.setComicName(str(comicname))
-        #if alias != None: comicInfo.setComicNames(alias)
-        Comic.setAuthor(author)
-        Comic.setTags(tags)
-        Comic.setDep(dep)
-        #comicInfo.setCBS("韩漫")
-        if genre != None: Comic.setGenre(genre)
-        Comic.setLanguage(lang)
-        Comic.setAgeRating(age_rating)
-        Comic.setCurrentChapterImg(current_chapter_img)
-
-    @classmethod
-    def writeJson(cls):
-        dict_data = {}
-        nodes = ComicInfoEntity.getJsonNodes()
-        for node in nodes:
-            key = Comic.getFieldNode(node)
-            value = Comic.getFieldOrigin(node)
-            if isinstance(value,list):
-                value = ",".join(value)
-            if key != None and isinstance(value,str):
-                child_dict = { key : value}
-                dict_data.update(child_dict)
-        s = json.dumps(dict_data,ensure_ascii=True)
-        logging.debug(f"json={s}")
-        with open(ComicPath.getPathConfComicChapterJson(mkdir=True),"w") as fs:
-            fs.write(s)
\ No newline at end of file
+        logging.info(f"generated file... {save_path}")
\ No newline at end of file
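add_nodes is only half-migrated to itemadapter; for reference, the mapping view it now receives behaves like this (throwaway dict item, values invented):

```python
# ItemAdapter wraps dicts, scrapy.Item, dataclasses, etc. behind one mapping API.
from itemadapter import ItemAdapter

adapter = ItemAdapter({"Title": "ch-001", "Series": "SomeComic"})
print(list(adapter.field_names()))  # ['Title', 'Series']
print(dict(adapter))                # {'Title': 'ch-001', 'Series': 'SomeComic'}
```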
{save_path}") \ No newline at end of file diff --git a/Comics/utils/Constant.py b/Comics/utils/Constant.py index 4530c41..f178afb 100644 --- a/Comics/utils/Constant.py +++ b/Comics/utils/Constant.py @@ -1,5 +1,7 @@ +import os.path +import re from opencc import OpenCC - +from Comics.settings import IMAGES_STORE class ComicPath: @classmethod def getDirComicChapter(cls): @@ -13,4 +15,24 @@ class ComicPath: #繁体中文转简体中文 @classmethod - def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text)) \ No newline at end of file + def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text)) + + #处理成符合规定的文件名 + @classmethod + def fix_file_name(cls, filename, replace=None): + if not isinstance(filename, str): + return filename + in_tab = r'[?*/\|.:><]' + str_replace = "" + if replace is not None: + str_replace = replace + filename = re.sub(in_tab, str_replace, filename) + count = 1 + while True: + str_file = filename[0-count] + if str_file == " ": + count += 1 + else: + filename = filename[0:len(filename)+1-count] + break + return filename \ No newline at end of file diff --git a/Comics/utils/FileUtils.py b/Comics/utils/FileUtils.py index dfc1235..4824c6b 100644 --- a/Comics/utils/FileUtils.py +++ b/Comics/utils/FileUtils.py @@ -2,7 +2,17 @@ import base64,hashlib,os,shutil import math,time,json,datetime,logging from PIL import Image from tinydb import TinyDB, Query -from Comics.spiders.utils.Constant import ComicPath +from Comics.utils.Constant import ComicPath + +class fileUtils: + @classmethod + def save_file(cls,path,data): + dir = os.path.dirname(path) + if not os.path.exists(dir): + os.makedirs(dir) + with open(path,'w',encoding='utf-8') as fs: + fs.write(str(data)) + fs.close() class CommonUtils: @classmethod @@ -31,11 +41,9 @@ class imageUtils: @classmethod def deScrambleImagesByPath(cls,img_path,img_save=None): if os.path.basename(img_path).startswith("scramble="): - imageUtils.encode_scramble_image(img_path,img_save) - return True - else: - return False - + img_path = imageUtils.encode_scramble_image(img_path,img_save) + return img_path + @classmethod def encodeImage(cls,str_en): #print("en",str_en) @@ -223,4 +231,5 @@ class imageUtils: print("解密成功=",save_path) if os.path.exists(imgpath): os.remove(imgpath) - print("remove=",imgpath) \ No newline at end of file + print("remove=",imgpath) + return save_path \ No newline at end of file