diff --git a/.gitignore b/.gitignore index 4bead05..0eade7f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -.scrapy/* -images/* -json/* -CBZ/* +.scrapy/* +.vscode/* +CBZ/* +output/* /**/__pycache__ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/ComicScrapy.iml b/.idea/ComicScrapy.iml deleted file mode 100644 index 8ac55ea..0000000 --- a/.idea/ComicScrapy.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 9ef495e..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 23f68eb..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1dd..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/Comics/exporters.py b/Comics/exporters.py index d9f69f2..8ce1550 100644 --- a/Comics/exporters.py +++ b/Comics/exporters.py @@ -1,105 +1,130 @@ -import os.path,json,ast - -from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT -from scrapy.exporters import XmlItemExporter -from scrapy.exporters import PythonItemExporter -from Comics.items import ComicInfoItem -from Comics.items import ComicItem -from Comics.settings import COMIC_INFO_XML_STORE -from Comics.utils.Constant import ComicPath -from scrapy.utils.python import is_listlike, to_bytes, to_unicode -from itemadapter import ItemAdapter - - -class ItemExporter(PythonItemExporter): - def convert(self, data): - if isinstance(data, bytes): return data.decode("utf-8") - if isinstance(data, dict): return dict(map(self.convert, data.items())) - if isinstance(data, tuple): return map(self.convert, data) - if isinstance(data, list): return [self.convert(i) for i in data] - return data - - def export_obj(self, obj_item): - self.start_exporting() - obj_item = self.convert(self.export_item(obj_item)) - self.finish_exporting() - return obj_item - -class ComicInfoXmlItemExporter(XmlItemExporter): - custom_root_element = "ComicInfo" - def __init__(self, comic, chapter): - file_path = os.path.join(COMIC_INFO_XML_STORE, comic, - chapter, f"{self.custom_root_element}.xml") - dir_path = os.path.dirname(file_path) - if not os.path.exists(dir_path): os.makedirs(dir_path) - self.xml_file = open(file_path, "wb") - super(ComicInfoXmlItemExporter, self).__init__(self.xml_file, - root_element=self.custom_root_element, - indent=1,fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT) - - def serialize_field(self, field, name, value): - #通过序列化 - value = ComicPath.chinese_convert(value) - return super().serialize_field(field, name, value) - - def start_exporting(self): - self.xg.startDocument() - self.xg.startElement(self.custom_root_element, {}) - - def comic_to_info_item(self, comic_item): - comic_info = {} - info_item = ItemAdapter(ComicInfoItem()) - comic_info_dict = {} - for field in 
info_item.field_names(): - meta_info = info_item.get_field_meta(field).get('info') - if meta_info is not None: - comic_info_dict[meta_info] = field - for key, value in ComicItem(comic_item).items(): - new_key = comic_info_dict.get(key) - if new_key is not None: - comic_info[new_key] = value - return ItemExporter().export_obj(ComicInfoItem(comic_info)) - - def export_item(self, item): - comic_info = self.comic_to_info_item(item) - child_element = "Page" - self._beautify_indent(depth=1) - self._beautify_newline() - for name, value in self._get_serialized_fields(comic_info, default_value=""): - if name is "Pages": - value = str(value).split(',') - if value is not None or value != "": - self._export_xml_field(name, value, depth=2, child_element=child_element) - #self._beautify_indent(depth=1) - return comic_info - - def _export_xml_field(self, name, serialized_value, depth, child_element="value"): - self._beautify_indent(depth=depth) - self.xg.startElement(name, {}) - if hasattr(serialized_value, "items"): - self._beautify_newline() - for sub_name, value in serialized_value.items(): - self._export_xml_field(sub_name, value, depth=depth + 1) - self._beautify_indent(depth=depth) - elif is_listlike(serialized_value): - self._beautify_newline() - for value in serialized_value: - self._export_xml_field(child_element, value, depth=depth + 1) - self._beautify_indent(depth=depth) - elif isinstance(serialized_value, str): - self.xg.characters(serialized_value) - else: - self.xg.characters(str(serialized_value)) - self.xg.endElement(name) - self._beautify_newline() - - def finish_exporting(self): - self.xg.endElement(self.custom_root_element) - self.xg.endDocument() - self.xml_file.close() - - def export_xml(self, item): - self.start_exporting() - comic_info = self.export_item(item) - self.finish_exporting() - return comic_info +import os.path,json,ast + +from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT +from scrapy.exporters import XmlItemExporter +from scrapy.exporters import PythonItemExporter +from scrapy.exporters import JsonItemExporter +from Comics.items import ComicInfoItem +from Comics.items import ComicItem +from Comics.settings import COMIC_INFO_XML_STORE +from Comics.utils.Constant import ComicPath +from scrapy.utils.python import is_listlike, to_bytes, to_unicode +from itemadapter import ItemAdapter + +class CommonExporter(): + def getPath(self, file , sufix=None): + sufix = "."+sufix + dirname = os.path.dirname(file) + if not os.path.exists(dirname): + os.makedirs(dirname) + if sufix != None and sufix not in file: + file = file + sufix + return file + +class ItemExporter(PythonItemExporter): + def convert(self, data): + if isinstance(data, bytes): return data.decode("utf-8") + if isinstance(data, dict): return dict(map(self.convert, data.items())) + if isinstance(data, tuple): return map(self.convert, data) + if isinstance(data, list): return [self.convert(i) for i in data] + return data + + def export_obj(self, obj_item): + self.start_exporting() + obj_item = self.convert(self.export_item(obj_item)) + self.finish_exporting() + return obj_item + +class JsonExport(JsonItemExporter): + def __init__(self, file, **kwargs): + file = CommonExporter().getPath(file=file, sufix= "json") + self.file = open(file, "wb") + super(JsonExport, self).__init__(self.file, **kwargs) + + def export_json(self, json_object, if_return=False): + self.start_exporting() + self.export_item(json_object) + self.finish_exporting() + self.file.close() + if if_return: + return ItemExporter().export_obj(json_object) 
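(Aside, not part of the patch: a minimal usage sketch of the new JsonExport helper above, mirroring how ComicsPipeline drives it later in this diff. The comic/chapter values are hypothetical; JsonExport appends a ".json" suffix when missing and creates parent directories via CommonExporter.getPath().)

    # Hedged sketch — item values are made up for illustration only.
    import os
    from Comics import settings
    from Comics.exporters import JsonExport
    from Comics.items import ComicItem

    item = ComicItem(name="SomeComic", chapter="001")   # hypothetical item
    file = os.path.join(settings.OUTPUT_DIR, "json", item["name"], item["chapter"])
    # Writes output/json/SomeComic/001.json and returns the item converted to
    # plain Python types (bytes decoded, nested structures expanded).
    data = JsonExport(file=file).export_json(item, if_return=True)
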
+ + +class ComicInfoXmlItemExporter(XmlItemExporter): + custom_root_element = "ComicInfo" + def __init__(self, comic, chapter): + file_path = os.path.join(COMIC_INFO_XML_STORE, comic, + chapter, f"{self.custom_root_element}.xml") + dir_path = os.path.dirname(file_path) + if not os.path.exists(dir_path): os.makedirs(dir_path) + self.xml_file = open(file_path, "wb") + super(ComicInfoXmlItemExporter, self).__init__(self.xml_file, + root_element=self.custom_root_element, + indent=1,fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT) + + def serialize_field(self, field, name, value): + #通过序列化 + value = ComicPath.chinese_convert(value) + return super().serialize_field(field, name, value) + + def start_exporting(self): + self.xg.startDocument() + self.xg.startElement(self.custom_root_element, {}) + + def comic_to_info_item(self, comic_item): + comic_info = {} + info_item = ItemAdapter(ComicInfoItem()) + comic_info_dict = {} + for field in info_item.field_names(): + meta_info = info_item.get_field_meta(field).get('info') + if meta_info is not None: + comic_info_dict[meta_info] = field + for key, value in ComicItem(comic_item).items(): + new_key = comic_info_dict.get(key) + if new_key is not None: + comic_info[new_key] = value + return ItemExporter().export_obj(ComicInfoItem(comic_info)) + + def export_item(self, item): + comic_info = self.comic_to_info_item(item) + child_element = "Page" + self._beautify_indent(depth=1) + self._beautify_newline() + for name, value in self._get_serialized_fields(comic_info, default_value=""): + if name == "Pages": + value = ast.literal_eval(value) + if value is not None or value != "": + self._export_xml_field(name, value, depth=2, child_element=child_element) + #self._beautify_indent(depth=1) + return comic_info + + def _export_xml_field(self, name, serialized_value, depth, child_element="value"): + self._beautify_indent(depth=depth) + self.xg.startElement(name, {}) + if hasattr(serialized_value, "items"): + self._beautify_newline() + for sub_name, value in serialized_value.items(): + self._export_xml_field(sub_name, value, depth=depth + 1) + self._beautify_indent(depth=depth) + elif is_listlike(serialized_value): + self._beautify_newline() + for value in serialized_value: + self._export_xml_field(child_element, value, depth=depth + 1) + self._beautify_indent(depth=depth) + elif isinstance(serialized_value, str): + self.xg.characters(serialized_value) + else: + self.xg.characters(str(serialized_value)) + self.xg.endElement(name) + self._beautify_newline() + + def finish_exporting(self): + self.xg.endElement(self.custom_root_element) + self.xg.endDocument() + self.xml_file.close() + + def export_xml(self, item): + self.start_exporting() + comic_info = self.export_item(item) + self.finish_exporting() + return comic_info diff --git a/Comics/items.py b/Comics/items.py index 2fb6541..92fc8d4 100644 --- a/Comics/items.py +++ b/Comics/items.py @@ -1,78 +1,151 @@ -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.org/en/latest/topics/items.html -from scrapy.item import Item, Field -from Comics.utils.Constant import ComicPath -from scrapy.loader.processors import TakeFirst, MapCompose, Join - -def serialize_to_chinese(value): - return ComicPath.chinese_convert(value) - -def serialize_to_fix_file(value): - file = ComicPath.chinese_convert(value) - return ComicPath.fix_file_name(file) - -class ComicOItem(Item): - name = Field() - chapterItem = Field() - -class ComicItem(Item): - # 编号 - index = Field(output_processor=TakeFirst()) - # 漫画名 - 
name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst()) - # 章节名 - chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst()) - # 图片链接 - list_img = Field() - # 作者 - author = Field(serialize_to_chinese=serialize_to_chinese, output_processor=TakeFirst()) - # 封面链接 - icon = Field(output_processor=TakeFirst()) - # 标签 - tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst()) - # 概述 - dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst()) - # 时间 - date = Field(output_processor=TakeFirst()) - # 流派 - genre = Field(output_processor=TakeFirst()) - # 年龄分级 - age_rating = Field(output_processor=TakeFirst()) - - images = Field() - images_name = Field() - -class ImageItem(Item): - image_name = Field() - image_url = Field() - image_path = Field() - -def serializer_info_writer(value): - list_value = [] - str(value).replace("&", " ") - for v in str(value).split(" "): - list_value.append(v) - return ",".join(list_value) - -class ComicInfoItem(Item): - Title = Field(info='chapter')#"章节名",True] - Series = Field(info='name')# ","漫画名",True] - Number = Field(info='index')# ","编号",True] - SeriesGroup = Field()# ","别名",False] - Summary = Field(info='dep')# ","概述",True] - Year = Field()# ","年",False] - Month = Field()# ","月",False] - Day = Field()# ","日",False] - Writer = Field(info='author',serializer=serializer_info_writer)# "作者",True] - Publisher = Field()# ","出版社",False] - Genre = Field(info='genre')# ","流派",True] - Tags = Field(info='tags')# ","标签",True] - Web = Field()# ","主页",False] - PageCount = Field()# ","总页数",True] - LanguageISO = Field()#","语言",True] - AgeRating = Field(info='age_rating')#","年龄分级",False] - Pages = Field(info='images_name')#","页码",True] - # ComicInfo.xml and ComicChapter.json end - +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.org/en/latest/topics/items.html +import os,Comics.settings as settings,logging +from scrapy.item import Item, Field +from Comics.utils.Constant import ComicPath +from Comics.utils.FileUtils import imageUtils +from scrapy.loader.processors import TakeFirst, MapCompose, Join + +def serialize_to_chinese(value): + return ComicPath.chinese_convert(value) + +def serialize_to_fix_file(value): + file = ComicPath.chinese_convert(value) + return ComicPath.fix_file_name(file) + +def _serialize_to_images(value, result_type=None): + count = 1 + images_item = [] + image_urls = [] + for image in value: + (image_src, scramble) = [image.get("src"), image.get("scramble")] + count_image = settings.IMAGES_NAME_FORMAT.format(count) + suffix = "."+str(image_src).split(".")[-1] + image_name = count_image + suffix + if scramble: + de_str = str(image_src).split("/")[-1].replace(suffix, "==") + blocks_num = imageUtils.encodeImage(de_str) + image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix) + #images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name)) + images_item.append(image_name) + image_urls.append(image_src) + count += 1 + logging.info(f"images_len: {len(images_item)}") + if result_type == "image_urls": return image_urls + else: return images_item + +def serialize_to_images(value): return _serialize_to_images(value) + + +def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls") + + +class ListComicItem(Item): + name = Field() + link = Field() + + +class ComicItem(Item): + # 编号 + index = Field(output_processor=TakeFirst()) + # 
漫画名 + name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst()) + # 章节名 + chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst()) + # 图片链接 + list_img = Field(serializer=serialize_to_images) + # 作者 + author = Field(serialize_to_chinese=serialize_to_chinese, output_processor=TakeFirst()) + # 封面链接 + icon = Field(output_processor=TakeFirst()) + # 标签 + tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst()) + # 概述 + dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst()) + # 时间 + date = Field(output_processor=TakeFirst()) + # 流派 + genre = Field(output_processor=TakeFirst()) + # 年龄分级 + age_rating = Field(output_processor=TakeFirst()) + + images_old = Field(serializer=serialize_to_images) + images = Field(serializer=serialize_to_images) + image_urls = Field(serializer=serialize_to_image_urls) + images_name = Field() + +class ImagesItem(Item): + image_name = Field() + image_url = Field() + image_path = Field() + images = Field() + image_urls = Field() + comic = Field() + +def serializer_info_writer(value): + list_value = [] + str(value).replace("&", " ") + for v in str(value).split(" "): + list_value.append(v) + return ",".join(list_value) + +# Result_type name +def _serializer_info_imagesa(value, result_type=None): + info = [] + for success, img in value: + img_path = os.path.join(settings.IMAGES_STORE, img['path']) + if result_type == 'name': + info.append(ComicPath().getFileScrambleImageSave(img_path,True,False)) + else: + info.append(img_path) + if result_type == "len": + value = len(info) + else: + value = info + return value + +def _serialize_info_images(value, result_type=None): + images = [] + for image in value: + images.append(ComicPath().getFileScrambleImageSave(image,True,False)) + if result_type == "count": + return len(images) + else: + return images + + +def serializer_info_images(value): return _serialize_info_images(value) + +def serializer_info_images_count(value): return _serialize_info_images(value, "count") + +def serializer_info_images_completed(value): + return _serialize_info_images(value, result_type='name') + +def serializer_info_images_count(value): + return _serialize_info_images(value, result_type='len') + + +class ComicInfoItem(Item): + Title = Field(info='chapter')#"章节名",True] + Series = Field(info='name')# ","漫画名",True] + Number = Field(info='index')# ","编号",True] + SeriesGroup = Field()# ","别名",False] + Summary = Field(info='dep')# ","概述",True] + Year = Field()# ","年",False] + Month = Field()# ","月",False] + Day = Field()# ","日",False] + Writer = Field(info='author',serializer=serializer_info_writer)# "作者",True] + Publisher = Field()# ","出版社",False] + Genre = Field(info='genre')# ","流派",True] + Tags = Field(info='tags')# ","标签",True] + Web = Field()# ","主页",False] + #PageCount = Field()# ","总页数",True] + PageCount = Field(info='images',serializer=serializer_info_images_count)# ","总页数",True] + LanguageISO = Field()#","语言",True] + AgeRating = Field(info='age_rating')#","年龄分级",False] + #Pages = Field(info='images_name', serializer=serializer_info_images_completed)#","页码",True] + Pages = Field(info='images', serializer=serializer_info_images)#","页码",True] + # ComicInfo.xml and ComicChapter.json end + diff --git a/Comics/loader.py b/Comics/loader.py index ad4b8e4..28d3bb6 100644 --- a/Comics/loader.py +++ b/Comics/loader.py @@ -1,44 +1,56 @@ -import json -from scrapy.loader import ItemLoader -class ComicLoader(ItemLoader): - def parseExec(cls,data,exec): - if data !=None and exec != None: - 
dots = str(exec).split(".") - if not isinstance(data,dict): data = json.loads(data) - for dot in dots: - data = data.get(dot) - return data - - def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, **kw): - """ - Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a - value, which is used to extract a list of strings from the - selector associated with this :class:`ItemLoader`. - - See :meth:`get_xpath` for ``kwargs``. - - :param xpath: the XPath to extract data from - :type xpath: str - - Examples:: - - # HTML snippet:

<p class="product-name">Color TV</p>
- loader.add_xpath('name', '//p[@class="product-name"]') - # HTML snippet:

<p id="price">the price is $1200</p>
- loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') - - """ - values = self._get_xpathvalues(xpath, **kw) - if exec is not None: - values = self.parseExec(values, exec) - if index is not None: - values = values[index] - self.add_value(field_name, values, *processors, re=re, **kw) - - def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw): - if str_exec is not None: - value = self.parseExec(value, str_exec) - self.add_value(field_name, value, *processors, re=re, **kw) - - def get_exec(self, value, str_exec): - return self.parseExec(value, str_exec) \ No newline at end of file +import json +from scrapy.loader import ItemLoader + +class ComicLoader(ItemLoader): + def parseExec(cls,data,exec): + if data !=None and exec != None: + dots = str(exec).split(".") + if not isinstance(data,dict): data = json.loads(data) + for dot in dots: + data = data.get(dot) + return data + + def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, **kw): + """ + Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a + value, which is used to extract a list of strings from the + selector associated with this :class:`ItemLoader`. + + See :meth:`get_xpath` for ``kwargs``. + + :param xpath: the XPath to extract data from + :type xpath: str + + Examples:: + + # HTML snippet:

<p class="product-name">Color TV</p>
+ loader.add_xpath('name', '//p[@class="product-name"]') + # HTML snippet:

<p id="price">the price is $1200</p>
+ loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') + + """ + values = self._get_xpathvalues(xpath, **kw) + if exec is not None: + values = self.parseExec(values, exec) + if index is not None: + values = values[index] + self.add_value(field_name, values, *processors, re=re, **kw) + + def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw): + if str_exec is not None: + value = self.parseExec(value, str_exec) + self.add_value(field_name, value, *processors, re=re, **kw) + + def get_exec(self, value, str_exec): + return self.parseExec(value, str_exec) + + def add_value(self, field_name, value, *processors, re=None, **kw): + if self.auto_replace_value(field_name, value): + return super().add_value(field_name, value, *processors, re=re, **kw) + + + def auto_replace_value(self, field_name, value): + if self.get_output_value(field_name) != None: + self._replace_value(field_name, value) + return False + else: return True \ No newline at end of file diff --git a/Comics/middlewares.py b/Comics/middlewares.py index c5c85d2..3a65236 100644 --- a/Comics/middlewares.py +++ b/Comics/middlewares.py @@ -1,110 +1,110 @@ -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals -import random,logging -from pathlib import Path -from Comics.settings import PROXY_LIST -# useful for handling different item types with a single interface - -logger = logging.getLogger(__name__) - -class ProxyMiddleware(object): - def process_request(self, request, spider): - if len(PROXY_LIST) != 0: - request.meta["proxy"] = random.choice(PROXY_LIST) - -class ComicsSpiderMiddleware: - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, or item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request or item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ComicsDownloaderMiddleware: - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals +import random,logging +from pathlib import Path +from Comics.settings import PROXY_LIST +# useful for handling different item types with a single interface + +logger = logging.getLogger(__name__) + +class ProxyMiddleware(object): + def process_request(self, request, spider): + if len(PROXY_LIST) != 0: + request.meta["proxy"] = random.choice(PROXY_LIST) + +class ComicsSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ComicsDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. 
+ + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/Comics/pipelines.py b/Comics/pipelines.py index 712c68f..948c674 100644 --- a/Comics/pipelines.py +++ b/Comics/pipelines.py @@ -1,81 +1,63 @@ -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - - -# useful for handling different item types with a single interface -import os, scrapy -from Comics import settings -from Comics.utils.FileUtils import imageUtils -from Comics.utils.FileUtils import fileUtils -from Comics.utils.Constant import ComicPath -from Comics.items import ComicItem -from Comics.items import ImageItem -from scrapy.pipelines.images import ImagesPipeline -from Comics.exporters import ComicInfoXmlItemExporter -from Comics.exporters import ItemExporter -from Comics.utils.FileUtils import CBZUtils - -class ComicsPipeline: - def open_spider(self, spider): - pass - # item就是yield后面的对象 - def process_item(self, item, spider): - if isinstance(item, ComicItem): - item = ComicItem(ItemExporter().export_obj(item)) - file = os.path.join("json", item['name'], item['chapter']) - fileUtils.save_file(f"{file}.json", item) - return item - # image解析 - - def close_spider(self,spider): - pass - -class ImageParsePipeline: - def process_item(self, item, spider): - if isinstance(item, ComicItem): - count = 1 - images_item = [] - for image in item['list_img']: - (image_src, scramble) = [image.get("src"), image.get("scramble")] - count_image = "{:0>3d}".format(count) - suffix = "."+str(image_src).split(".")[-1] - image_name = count_image + suffix - if scramble: - de_str = str(image_src).split("/")[-1].replace(suffix, "==") - blocks_num = imageUtils.encodeImage(de_str) - image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix) - image_path = os.path.join(item['name'], item['chapter'], image_name) - images_item.append(ImageItem(image_name=count_image + suffix, image_url=image_src, image_path=image_path)) - count += 1 - item['images'] = images_item - return item - -class ImgDownloadPipeline(ImagesPipeline): - def file_path(self, request, response=None, info=None, *, item=None): - image = request.meta['item'] - image_path = 
image['image_path'] - en_image_path = os.path.join(os.path.dirname(image_path), image['image_name']) - if os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)): - return en_image_path - else: - return image_path - - def get_media_requests(self, item, info): - for image in item['images']: - yield scrapy.Request(url=image['image_url'], meta={'item': image}) - - def item_completed(self, results, item, info): - info_img = [] - for success, img in results: - img_path = os.path.join(settings.IMAGES_STORE, img['path']) - # 解密图片 - img_path = imageUtils.deScrambleImagesByPath(img_path) - info_img.append(os.path.basename(img_path).split('.')[0]) - item['images_name'] = ",".join(info_img) - # return item - # ComicInfoXml 生成 - ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item) - # 打包 - CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], remove=False) +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +import os, scrapy,logging,time,random +from Comics import settings +from Comics.utils.FileUtils import imageUtils +from Comics.utils.FileUtils import fileUtils +from Comics.utils.Constant import ComicPath +from Comics.items import ComicItem +from Comics.items import ImagesItem +from scrapy.pipelines.images import ImagesPipeline +from Comics.exporters import ComicInfoXmlItemExporter +from Comics.exporters import ItemExporter +from Comics.exporters import JsonExport +from Comics.utils.FileUtils import CBZUtils + +class ComicsPipeline: + def open_spider(self, spider): + pass + # item就是yield后面的对象 + def process_item(self, item, spider): + if isinstance(item, ComicItem): + file = os.path.join(settings.OUTPUT_DIR,"json", item['name'], item['chapter']) + data = JsonExport(file=file).export_json(item, if_return=True) + #item['images'] = data['images'] + return data + # image解析 + + def close_spider(self,spider): + pass + + +class ImgDownloadPipeline(ImagesPipeline): + def file_exits(self, image_path): + en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath") + return os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)) + + def file_full_path(self, item, image): return os.path.join(item['name'], item['chapter'], image) + + def file_path(self, request, response=None, info=None, *, item=None): return request.meta['path'] + + def get_media_requests(self, item, info): + for image_url,image_path in zip(item['image_urls'],item['images']): + image_path = self.file_full_path(item, image_path) + if self.file_exits(image_path): + logging.info(f"file exists: {image_path}") + else: + logging.info(f"downloading {image_url} --> {image_path}") + yield scrapy.Request(url=image_url, meta={'path': image_path}) + + def item_completed(self, results, item, info): + item['images_name'] = results + # return item + # ComicInfoXml 生成 + comic_info = ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item) + # 打包 + CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], + comic_info_images= comic_info["Pages"], remove=False) + time.sleep(random.randint(5,10)) \ No newline at end of file diff --git a/Comics/settings.py b/Comics/settings.py index bc5db37..29b4946 100644 --- a/Comics/settings.py +++ b/Comics/settings.py @@ -1,130 +1,137 @@ -# Scrapy settings for Comics project -# -# For 
simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html -from fake_useragent import UserAgent - -BOT_NAME = 'Comics' - -SPIDER_MODULES = ['Comics.spiders'] -NEWSPIDER_MODULE = 'Comics.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'Comics (+http://www.yourdomain.com)' -USER_AGENT = UserAgent().random -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -HTTPERROR_ALLOWED_CODES = [ 200 , 403] -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -IMAGES_STORE = 'images' -COMIC_INFO_XML_STORE = 'images' -DOWNLOAD_DELAY = 20 -#重试 -RETRY_ENABLED = True -RETRY_TIMES = 10 # 想重试几次就写几 -# 下面这行可要可不要 -RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401] -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 -PROXY_LIST = [ - "http://127.0.0.1:7890", -] -# Disable cookies (enabled by default) -COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'Comics.middlewares.ComicsSpiderMiddleware': 543, -# 'Comics.middlewares.ProxyMiddleware' : 100, -# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -DOWNLOADER_MIDDLEWARES = { -# 'Comics.middlewares.ComicsDownloaderMiddleware': 543, -# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500, - 'Comics.middlewares.ProxyMiddleware': 100, - 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, -} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'Comics.pipelines.ComicsPipeline': 300, - 'Comics.pipelines.ImageParsePipeline': 400, - 'Comics.pipelines.ImgDownloadPipeline': 500, -} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -AUTOTHROTTLE_ENABLED = True -# The initial download delay -AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See 
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -HTTPCACHE_ENABLED = True -HTTPCACHE_EXPIRATION_SECS = 0 -HTTPCACHE_DIR = 'httpcache' -HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404, 403, 401] -#HTTPCACHE_STORAGE = 'Comics.middlewares.MyFilesystemCacheStorage' -HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -CBZ_EXPORT_PATH = "CBZ" -#数据导出类 排序 -COMIC_INFO_XML_FILE = "ComicInfo.xml" -COMIC_INFO_FIELDS_TO_EXPORT = [ - "Title", - "Series", - "Number", - "SeriesGroup", - "Summary", - "Year", - "Month", - "Day", - "Writer", - "Publisher", - "Genre", - "Tags", - "Web", - "PageCount", - "LanguageISO", - "AgeRating", - "Pages" -] +# Scrapy settings for Comics project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +from fake_useragent import UserAgent +import os + +BOT_NAME = 'Comics' + +SPIDER_MODULES = ['Comics.spiders'] +NEWSPIDER_MODULE = 'Comics.spiders' + +OUTPUT_DIR = "output" +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'Comics (+http://www.yourdomain.com)' +USER_AGENT = UserAgent().random +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +HTTPERROR_ALLOWED_CODES = [ 200 , 403] +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 16 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +IMAGES_STORE = os.path.join(OUTPUT_DIR, 'images') +IMAGES_NAME_FORMAT = "{:0>3d}" +COMIC_INFO_XML_STORE = IMAGES_STORE +DOWNLOAD_DELAY = 0 +#重试 +RETRY_ENABLED = True +RETRY_TIMES = 10 # 想重试几次就写几 +# 下面这行可要可不要 +RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401] +# The download delay setting will honor only one of: +CONCURRENT_REQUESTS_PER_DOMAIN = 16 +CONCURRENT_REQUESTS_PER_IP = 16 +PROXY_LIST = [ + "http://127.0.0.1:7890", +] +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'Comics.middlewares.ComicsSpiderMiddleware': 543, +# 'Comics.middlewares.ProxyMiddleware' : 100, +# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { +# 'Comics.middlewares.ComicsDownloaderMiddleware': 543, +# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500, + 'Comics.middlewares.ProxyMiddleware': 100, + 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, +} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 
+ITEM_PIPELINES = { +# 'scrapy.pipelines.images.ImagesPipeline' : 1, + 'Comics.pipelines.ComicsPipeline': 300, +# 'Comics.pipelines.ImageParsePipeline': 400, + 'Comics.pipelines.ImgDownloadPipeline': 500, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +AUTOTHROTTLE_ENABLED = True +# The initial download delay +AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +HTTPCACHE_ENABLED = True +HTTPCACHE_EXPIRATION_SECS = 0 +HTTPCACHE_DIR = 'httpcache' +HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404] +#HTTPCACHE_STORAGE = 'Comics.middlewares.MyFilesystemCacheStorage' +HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +# Logging configuration +LOG_LEVEL = "INFO" # 日志等级 +LOG_STDOUT = True # 标准化输出 + +CBZ_EXPORT_PATH = "CBZ" +#数据导出类 排序 +COMIC_INFO_XML_FILE = "ComicInfo.xml" +COMIC_INFO_FIELDS_TO_EXPORT = [ + "Title", + "Series", + "Number", + "SeriesGroup", + "Summary", + "Year", + "Month", + "Day", + "Writer", + "Publisher", + "Genre", + "Tags", + "Web", + "PageCount", + "LanguageISO", + "AgeRating", + "Pages" +] diff --git a/Comics/spiders/__init__.py b/Comics/spiders/__init__.py index ebd689a..5ca581d 100644 --- a/Comics/spiders/__init__.py +++ b/Comics/spiders/__init__.py @@ -1,4 +1,4 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
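(Aside, not part of the patch: the rm_comic spider in the next file relies on ComicLoader.parseExec / CommonUtils.parseExec to pull values out of the page's __NEXT_DATA__ JSON by a dotted key path. A standalone sketch of that lookup, with a made-up sample payload:)

    import json

    def parse_exec(data, path):
        """Walk nested JSON/dicts by a dotted key path, e.g. 'props.pageProps.images'."""
        if data is None or path is None:
            return data
        if not isinstance(data, dict):
            data = json.loads(data)
        for key in path.split("."):
            if not isinstance(data, dict):
                return None   # missing key mid-path; guard not present in the patch's parseExec
            data = data.get(key)
        return data

    sample = '{"props": {"pageProps": {"chapterName": "第1话", "images": []}}}'
    print(parse_exec(sample, "props.pageProps.chapterName"))   # -> 第1话
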
diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py index 07d8c03..71d062b 100644 --- a/Comics/spiders/rm_comic.py +++ b/Comics/spiders/rm_comic.py @@ -1,62 +1,75 @@ -import scrapy -from Comics.items import ComicItem -from Comics.loader import ComicLoader -from itemadapter import ItemAdapter -from Comics.items import ComicInfoItem - -class RmComicSpider(scrapy.Spider): - name = 'rm_comic' - allowed_domains = ['rm01.xyz'] - main_url = 'https://rm01.xyz' - #start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0'] - - def start_requests(self): - yield scrapy.Request('https://rm01.xyz' - '/books/306ec1e2-f701-4fda-bb78-041ad6ec4020', callback=self.parse_comic) - - # 获取某个漫画的相关数据 - # 获取到多个章节链接后进入下个流程 - def parse_comic(self, response): - comic_item = ComicLoader(item=ComicItem(), response=response) - comic_item.add_xpath('name', '//div[@class="col"]/h5/text()') - comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src') - comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1) - comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()') - comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1) - comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1) - comic_item.add_value('genre', "韩漫") - comic_item.add_value('age_rating', "R18+") - chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]' - '//div[contains(@class,"bookid_chapter")]/a/@href') - #chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]' - # '//div[contains(@class,"bookid_chapter")]/a/text()').extract() - #for chapter, link in zip(chapters, chapter_href): - for i, link in enumerate(chapter_href, start=1): - yield scrapy.Request(self.main_url+link, meta={'item': comic_item.load_item(), 'num': i}, callback=self.parse_chapter) - - # 读取某章节下的所有图片 - def parse_chapter(self, response): - comic_item = ComicLoader(item=response.meta['item'], response=response) - data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0] - str_exec = "props.pageProps." 
- #comic_item.add_exec('name', data, str_exec=str_exec+"bookName") - #comic_item.add_exec('dep', data, str_exec=str_exec+"description") - comic_item.add_value('index', response.meta['num']) - comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName") - comic_item.add_exec('list_img', data, str_exec+"images") - comic = comic_item.load_item() - chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath") - if chapter_api_url is not None: - yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api) - else: - yield comic - - # 加密数据API处理 - def parse_chapter_api(self, response): - comic_item = ComicLoader(item=response.meta['item'], response=response) - comic_item.add_exec('chapter', response.text, str_exec='chapter.name') - comic_item.add_exec('list_img', response.text, str_exec='chapter.images') - yield comic_item.load_item() - - def parse(self, response): +import scrapy,logging,time +from Comics.items import ComicItem +from Comics.loader import ComicLoader +from Comics.items import ListComicItem + +class RmComicSpider(scrapy.Spider): + name = 'rm_comic' + allowed_domains = ['rm01.xyz'] + main_url = 'https://rm01.xyz' + start_urls = 'https://rm01.xyz/books' + + def start_requests(self): + yield scrapy.Request(self.start_urls, callback=self.books_comic) + + def books_comic(self, response): + books_comic = ComicLoader(item=ListComicItem(), response=response) + data = books_comic.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0] + str_exec = "props.pageProps.books" + books = books_comic.get_exec(data, str_exec=str_exec) + for book in books: + books_comic.add_value('link', book['id']) + logging.info(f"downloading books %s" % book['name']) + time.sleep(3) + yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic) + + + # 获取某个漫画的相关数据 + # 获取到多个章节链接后进入下个流程 + def parse_comic(self, response): + comic_item = ComicLoader(item=ComicItem(), response=response) + comic_item.add_xpath('name', '//div[@class="col"]/h5/text()') + comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src') + comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1) + comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()') + comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1) + comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1) + comic_item.add_value('genre', "韩漫") + comic_item.add_value('age_rating', "R18+") + chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]' + '//div[contains(@class,"bookid_chapter")]/a/@href') + #chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]' + # '//div[contains(@class,"bookid_chapter")]/a/text()').extract() + #for chapter, link in zip(chapters, chapter_href): + for i, link in enumerate(chapter_href, start=1): + yield scrapy.Request(self.main_url+link, meta={'item': comic_item.load_item(), 'num': i}, callback=self.parse_chapter) + + # 读取某章节下的所有图片 + def parse_chapter(self, response): + comic_item = ComicLoader(item=response.meta['item'], response=response) + data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0] + str_exec = "props.pageProps." 
+ #comic_item.add_exec('name', data, str_exec=str_exec+"bookName") + #comic_item.add_exec('dep', data, str_exec=str_exec+"description") + comic_item.add_value('index', response.meta['num']) + comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName") + comic_item.add_exec('image_urls', data, str_exec+"images") + comic_item.add_exec('images', data, str_exec+"images") + comic = comic_item.load_item() + chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath") + if chapter_api_url is not None: + yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api) + else: + yield comic + + # 加密数据API处理 + def parse_chapter_api(self, response): + comic_item = ComicLoader(item=response.meta['item'], response=response) + comic_item.add_exec('chapter', response.text, str_exec='chapter.name') + comic_item.add_exec('image_urls', response.text, str_exec='chapter.images') + comic_item.add_exec('images', response.text, str_exec='chapter.images') + yield comic_item.load_item() + + + def parse(self, response): raise NotImplementedError \ No newline at end of file diff --git a/Comics/utils/Constant.py b/Comics/utils/Constant.py index ebe3091..18d0489 100644 --- a/Comics/utils/Constant.py +++ b/Comics/utils/Constant.py @@ -1,39 +1,48 @@ -import os.path -import re -from opencc import OpenCC -class ComicPath: - PREFIX_SCRAMBLE = "scramble=" - - @classmethod - def getDirComicChapter(cls): - return None - - @classmethod - def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix - - @classmethod - def getFileScrambleImageSave(cls,file): return str(file).split("_")[-1] - - #繁体中文转简体中文 - @classmethod - def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text)) - - #处理成符合规定的文件名 - @classmethod - def fix_file_name(cls, filename, replace=None): - if not isinstance(filename, str): - return filename - in_tab = r'[?*/\|.:><]' - str_replace = "" - if replace is not None: - str_replace = replace - filename = re.sub(in_tab, str_replace, filename) - count = 1 - while True: - str_file = filename[0-count] - if str_file == " ": - count += 1 - else: - filename = filename[0:len(filename)+1-count] - break +import os.path +import re +from opencc import OpenCC +class ComicPath: + PREFIX_SCRAMBLE = "scramble=" + + @classmethod + def getDirComicChapter(cls): + return None + + @classmethod + def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix + + @classmethod + def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True): + file_name = str(file).split("_")[-1] + if relative: + file_name = os.path.basename(file_name) + if relative == "fullpath": + file_name = os.path.join(os.path.dirname(file), file_name) + if not is_prefix: + return file_name.split(".")[0] + else: + return file_name + + #繁体中文转简体中文 + @classmethod + def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text)) + + #处理成符合规定的文件名 + @classmethod + def fix_file_name(cls, filename, replace=None): + if not isinstance(filename, str): + return filename + in_tab = r'[?*/\|.:><]' + str_replace = "" + if replace is not None: + str_replace = replace + filename = re.sub(in_tab, str_replace, filename) + count = 1 + while True: + str_file = filename[0-count] + if str_file == " ": + count += 1 + else: + filename = filename[0:len(filename)+1-count] + break return filename \ No newline at end of file diff --git 
a/Comics/utils/FileUtils.py b/Comics/utils/FileUtils.py index 2bde3c0..69072a0 100644 --- a/Comics/utils/FileUtils.py +++ b/Comics/utils/FileUtils.py @@ -1,340 +1,361 @@ -import base64,hashlib,os,shutil -import math,time,json,datetime,logging -from PIL import Image -from Comics.utils.Constant import ComicPath -from pathlib import Path -from zipfile import ZipFile -from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE - -class fileUtils: - @classmethod - def save_file(cls,path,data): - dir = os.path.dirname(path) - if not os.path.exists(dir): - os.makedirs(dir) - with open(path,'w',encoding='utf-8') as fs: - fs.write(str(data)) - fs.close() - -class CommonUtils: - @classmethod - def parseExec(cls,data,exec): - if data !=None and exec != None: - dots = str(exec).split(".") - if not isinstance(data,dict): data = json.loads(data) - for dot in dots: - data = data.get(dot) - return data - -class imageUtils: - - @classmethod - def deScrambleImagesByDir(cls,chapter_dir): - scramble_count = 0 - if os.path.exists(chapter_dir): #获取章节图片路径 - dirs = os.listdir(chapter_dir) - for img in dirs: - if img.startswith(ComicPath.PREFIX_SCRAMBLE): - imageUtils.encode_scramble_image(os.path.join(chapter_dir,img)) - scramble_count += 1 - logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}") - return scramble_count - - @classmethod - def deScrambleImagesByPath(cls, img_path, img_save=None): - if os.path.basename(img_path).startswith(ComicPath.PREFIX_SCRAMBLE): - img_path = imageUtils.encode_scramble_image(img_path, img_save) - return img_path - - @classmethod - def encodeImage(cls,str_en): - #print("en",str_en) - enc = base64.b64decode(str_en) - #print("解密:",enc) - m = hashlib.md5() - m.update(enc) - md5 = m.digest() - d = md5[-1] - #print(md5) - try: - blocks = d % 10 + 5 - except: - blocks = 0 %10 + 5 - #print("blocks=",blocks) - return blocks - - @classmethod - def scrambleImage(cls,file_path): - #检测到未下载完的图像 直接返回None - if str(file_path).endswith(".downloads"): - os.remove(file_path) - return None - file_str = str(file_path).split("=") - #10_29.jpg - base_dir = file_str[0].replace("scramble","") - base_name = file_str[-1] - base_fn = base_name.split("_") - save_name = base_fn[1] - save_name_delesu = save_name.split(".")[0] - blocks = int(base_fn[0]) - save_file_path = os.path.join(base_dir,save_name) - print("sva",save_file_path) - if os.path.exists(save_file_path): - print("图片已解密,已跳过:", save_file_path) - return None - image_su = str(file_path).split(".")[-1] - try: - img = Image.open(file_path) - except: - print(f"error Image: {file_path}") - width = img.width - height = img.height - #blocks = cls.encodeImage(enStr) - print("blocks=",blocks) - block_height = int(height / blocks) - block_width = int(width / blocks) - print("blockHeight=",block_height) - suffix = str(file_path).split(".")[-1] - split_path = os.path.join(base_dir,save_name_delesu+"split") - if image_su == "downloads": - return None - is_split = cls.splitimage(file_path,blocks,1,split_path) - if is_split != None: - cls.image_compose(split_path,blocks,1,save_file_path,block_height,width) - else: - if os.path.exists(split_path): - shutil.rmtree(split_path) - if os.path.exists(file_path): - shutil.move(file_path, save_file_path) - #完成后清空 - return file_path - - @classmethod - def splitimage(cls,src,rownum,colnum,dstpath): - img=Image.open(src) - w,h=img.size - if rownum<= h and colnum<=w: - s=os.path.split(src) - if dstpath=='': - dstpath = s[0] - if not os.path.exists(dstpath): - os.makedirs(dstpath) - fn=s[1].split('.') 
- basename=fn[0] - ext=fn[-1] - num=0 - rowheight=h//rownum - colwidth=w//colnum - for r in range(rownum): - for c in range(colnum): - box=(c*colwidth,r*rowheight,(c+1)*colwidth,(r+1)*rowheight) - count_image = "{:0>3d}".format(num) - file_path = os.path.join(dstpath,str(count_image)+'.'+ext) - print("file_path=",file_path) - img.crop(box).save(file_path) - num=num+1 - return "成功" - else: - print('不数!') - return None - - @classmethod - def image_compose(cls,src,row,column,save_path,image_height,image_width): - image_size = image_height - #image_height = 376 - #image_width = 720 - images_format = ['.png','.jpg'] - - #image_names = [name for name in os.listdir(src) for item in images_format if - # os.path.splitext(name)[1] == item][::-1] - img_list=os.listdir(src) - img_list.sort() - img_list.sort(key=lambda x: int(x[:-4])) - ##文件名按数字排序 - img_nums=len(img_list) - image_names = [] - for i in range(img_nums): - img_name=os.path.join(src,img_list[i]) - image_names.append(img_name) - #使用倒序 - image_names = image_names[::-1] - # 简单的对于参数的设定和实际图片集的大小进行数量判断 - if len(image_names) < row * column: - raise ValueError("合成图片的参数和要求的数量不能匹配!") - - to_image = Image.new('RGB', (column * image_width, row * image_height)) #创建一个新图 - # 循环遍历,把每张图片按顺序粘贴到对应位置上 - for y in range(1, row + 1): - for x in range(1, column + 1): - #1 * (row=1 -1) col=1 -1 - image_path = image_names[column * (y - 1) + x - 1] - print("split_image=",image_path) - from_image = Image.open(image_path) - #保持原图片大小 - #.resize( - # (image_size, image_size),Image.ANTIALIAS) - to_image.paste(from_image, ((x - 1) * image_size, (y - 1) * image_size)) - from_image.close() - to_image.save(save_path) - print("图片合并完成:", save_path) - shutil.rmtree(src) - # 保存新图 - - @classmethod - def getScrambleImage(cls,path): - scramble_file_cache = cls.scrambleImage(path) - if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache) - - @classmethod - def encode_scramble_image(cls,imgpath,img_save=None): - image = Image.open(imgpath) - w, h = image.size - #image.show() - file_str = str(imgpath).split("=") - #10_29.jpg - base_fn = file_str[-1].split("_") - blocks = int(base_fn[0]) - if img_save == None: - save_path = os.path.join(os.path.dirname(imgpath),ComicPath.getFileScrambleImageSave(imgpath)) - else: save_path = img_save - # print(type(aid),type(img_name)) - if blocks: - s = blocks # 随机值 - # print(s) - l = h % s # 切割最后多余的值 - box_list = [] - hz = 0 - for i in range(s): - c = math.floor(h / s) - g = i * c - hz += c - h2 = h - c * (i + 1) - l - if i == 0: - c += l;hz += l - else: - g += l - box_list.append((0, h2, w, h - g)) - - # print(box_list,len(box_list)) - item_width = w - # box_list.reverse() #还原切图可以倒序列表 - # print(box_list, len(box_list)) - newh = 0 - image_list = [image.crop(box) for box in box_list] - # print(box_list) - newimage = Image.new("RGB", (w, h)) - for image in image_list: - # image.show() - b_w, b_h = image.size - newimage.paste(image, (0, newh)) - - newh += b_h - newimage.save(save_path) - print("解密成功=",save_path) - if os.path.exists(imgpath): - os.remove(imgpath) - print("remove=",imgpath) - return save_path - - -class CBZUtils: - - @classmethod - def readDirsOrFiles(cls, dir, type): - data = [] - files = os.listdir(dir) - for file in files: - path = os.path.join(dir, file) - if type == "files" and os.path.isfile(path): - data.append(path) - if type == "dirs" and os.path.isdir(path): - data.append(path) - return data - - @classmethod - def zip_compression(cls, source_dir=None, target_file=None, remove=True): 
- target_dir = os.path.dirname(target_file) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - if not os.path.exists(target_file) and source_dir is not None: - with ZipFile(target_file, mode='w') as zf: - for path, dir_names, filenames in os.walk(source_dir): - path = Path(path) - arc_dir = path.relative_to(source_dir) - y = 0 - for filename in filenames: - y = y + 1 - print("打包中:" + str(y) + "/" + str(len(filenames)), os.path.join(source_dir, filename)) - zf.write(path.joinpath(filename), arc_dir.joinpath(filename)) - zf.close() - logging.info(f"打包完成:{target_file}") - - @classmethod - def packComicChapterCBZ(cls, comic, chapter, remove=True): - images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter) - cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ" - if os.path.exists(images_chapter_path): - dirs = os.listdir(images_chapter_path) - for file in dirs: - if file.startswith(ComicPath.PREFIX_SCRAMBLE): - try: - os.remove(file) - except Exception as e: - print(f"删除 {file} 发生错误 {e},已跳过") - return False - cls.zip_compression(images_chapter_path, cbz_chapter_path) - time.sleep(0.1) - if remove: shutil.rmtree(images_chapter_path) - return True - - @classmethod - def replaceZip(cls, filepath, unpack_dir=None): - if not cls.compareFileDate(filepath): return None - if unpack_dir == None: - unpack_dir = str(filepath).split(".")[0] - fz = ZipFile(filepath, 'r') - for file in fz.namelist(): - if file.endswith(".jpg"): - data = fz.read(file) - if len(data) < 500 and os.path.exists(filepath): - os.remove(filepath) - print(f"数据不完整,已删除:{filepath}") - if cls.compareFileDate(filepath): - os.utime(filepath) - print(f"已更新文件时间 {filepath}") - if os.path.exists(unpack_dir): - shutil.rmtree(unpack_dir) - # 删除删除main.ftl文件 - # delete_filename = '' - # if os.path.exists(delete_filename): - # os.remove(delete_filename) - # time.sleep(60) - # shutil.copy(文件的路径,另一个目录);拷贝main.ftl到准备压缩的目录下 - # cls.zip_compression() - # 小于则运行 - - @classmethod - def compareFileDate(cls, filepath): - if os.path.exists(filepath): - ctime = os.path.getmtime(filepath) - str_ctime = datetime.fromtimestamp(int(ctime)) - file_ctime = str(str_ctime.year) + "{:0>2d}".format(str_ctime.month) + "{:0>2d}".format( - str_ctime.day) + "{:0>2d}".format(str_ctime.hour) - c_ctime = 2023011603 - else: - return False - if int(file_ctime) < c_ctime: - return True - return False - - @classmethod - def zip_info(cls, path, filter=True): - result = None - try: - with ZipFile(path, "r") as zip_file: - result = zip_file.namelist() - if filter: - result.remove(COMIC_INFO_XML_FILE) - except Exception as e: - print(e) - return result \ No newline at end of file +import base64,hashlib,os,shutil +import math,time,json,datetime,logging +from PIL import Image +from Comics.utils.Constant import ComicPath +from pathlib import Path +from zipfile import ZipFile +from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE + +class fileUtils: + @classmethod + def save_file(cls,path,data): + dir = os.path.dirname(path) + if not os.path.exists(dir): + os.makedirs(dir) + with open(path,'w',encoding='utf-8') as fs: + fs.write(str(data)) + fs.close() + + @classmethod + def path(cls, file): + base_dir = os.path.dirname(file) + if not os.path.exists(base_dir): os.makedirs(base_dir) + return file + +class CommonUtils: + @classmethod + def parseExec(cls,data,exec): + if data !=None and exec != None: + dots = str(exec).split(".") + if not isinstance(data,dict): data = json.loads(data) + for dot in dots: + data = data.get(dot) 
+        return data
+
+class imageUtils:
+
+    @classmethod
+    def descramble_images_by_dir(cls, chapter_dir):
+        if os.path.isfile(chapter_dir):
+            chapter_dir = os.path.dirname(chapter_dir)
+        scramble_count = 0
+        if os.path.exists(chapter_dir):  # chapter image directory
+            # keep going until no scrambled file names are left in the directory
+            while any(f.startswith(ComicPath.PREFIX_SCRAMBLE) for f in os.listdir(chapter_dir)):
+                for img in os.listdir(chapter_dir):
+                    if img.startswith(ComicPath.PREFIX_SCRAMBLE):
+                        imageUtils.encode_scramble_image(os.path.join(chapter_dir, img))
+                        scramble_count += 1
+            logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}")
+        return scramble_count
+
+    @classmethod
+    def deScrambleImagesByPath(cls, img_path, img_save=None):
+        if os.path.basename(img_path).\
+                startswith(ComicPath.PREFIX_SCRAMBLE) and os.path.exists(img_path):
+            img_path = imageUtils.encode_scramble_image(img_path, img_save)
+        return img_path
+
+    @classmethod
+    def encodeImage(cls, str_en):
+        #print("en",str_en)
+        enc = base64.b64decode(str_en)
+        #print("decoded:", enc)
+        m = hashlib.md5()
+        m.update(enc)
+        md5 = m.digest()
+        d = md5[-1]
+        #print(md5)
+        try:
+            blocks = d % 10 + 5
+        except Exception:
+            blocks = 5  # fall back to the minimum block count
+        #print("blocks=",blocks)
+        return blocks
+
+    @classmethod
+    def scrambleImage(cls, file_path):
+        # an unfinished download was detected, return None directly
+        if str(file_path).endswith(".downloads"):
+            os.remove(file_path)
+            return None
+        file_str = str(file_path).split("=")
+        # e.g. 10_29.jpg
+        base_dir = file_str[0].replace("scramble", "")
+        base_name = file_str[-1]
+        base_fn = base_name.split("_")
+        save_name = base_fn[1]
+        save_name_delesu = save_name.split(".")[0]
+        blocks = int(base_fn[0])
+        save_file_path = os.path.join(base_dir, save_name)
+        print("save_file_path=", save_file_path)
+        if os.path.exists(save_file_path):
+            print("image already descrambled, skipped:", save_file_path)
+            return None
+        image_su = str(file_path).split(".")[-1]
+        try:
+            img = Image.open(file_path)
+        except Exception:
+            print(f"error Image: {file_path}")
+            return None
+        width = img.width
+        height = img.height
+        #blocks = cls.encodeImage(enStr)
+        print("blocks=", blocks)
+        block_height = int(height / blocks)
+        block_width = int(width / blocks)
+        print("blockHeight=", block_height)
+        suffix = str(file_path).split(".")[-1]
+        split_path = os.path.join(base_dir, save_name_delesu + "split")
+        if image_su == "downloads":
+            return None
+        is_split = cls.splitimage(file_path, blocks, 1, split_path)
+        if is_split != None:
+            cls.image_compose(split_path, blocks, 1, save_file_path, block_height, width)
+        else:
+            if os.path.exists(split_path):
+                shutil.rmtree(split_path)
+            if os.path.exists(file_path):
+                shutil.move(file_path, save_file_path)
+        # the caller removes the cached scrambled file afterwards
+        return file_path
+
+    @classmethod
+    def splitimage(cls, src, rownum, colnum, dstpath):
+        img = Image.open(src)
+        w, h = img.size
+        if rownum <= h and colnum <= w:
+            s = os.path.split(src)
+            if dstpath == '':
+                dstpath = s[0]
+            if not os.path.exists(dstpath):
+                os.makedirs(dstpath)
+            fn = s[1].split('.')
+            basename = fn[0]
+            ext = fn[-1]
+            num = 0
+            rowheight = h // rownum
+            colwidth = w // colnum
+            for r in range(rownum):
+                for c in range(colnum):
+                    box = (c * colwidth, r * rowheight, (c + 1) * colwidth, (r + 1) * rowheight)
+                    count_image = "{:0>3d}".format(num)
+                    file_path = os.path.join(dstpath, str(count_image) + '.' + ext)
+                    print("file_path=", file_path)
+                    img.crop(box).save(file_path)
+                    num = num + 1
+            return "success"
+        else:
+            print("split parameters exceed the image size, skipped!")
+            return None
+
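+    # Illustrative sketch (an assumption drawn from the parsing above, not original code):
+    # scrambled pages seem to be cached as "<PREFIX_SCRAMBLE>=<blocks>_<page>.jpg",
+    # e.g. "scramble=10_29.jpg". scrambleImage() reads blocks=10 from that name,
+    # splitimage() cuts the page into 10 horizontal strips, and image_compose()
+    # below stacks the strips back in reverse order to produce the descrambled
+    # "29.jpg", roughly:
+    #   imageUtils.scrambleImage("images/SomeComic/ch-1/scramble=10_29.jpg")  # hypothetical path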
+    @classmethod
+    def image_compose(cls, src, row, column, save_path, image_height, image_width):
+        image_size = image_height
+        #image_height = 376
+        #image_width = 720
+        images_format = ['.png', '.jpg']
+
+        #image_names = [name for name in os.listdir(src) for item in images_format if
+        #               os.path.splitext(name)[1] == item][::-1]
+        img_list = os.listdir(src)
+        # sort the file names numerically
+        img_list.sort()
+        img_list.sort(key=lambda x: int(x[:-4]))
+        img_nums = len(img_list)
+        image_names = []
+        for i in range(img_nums):
+            img_name = os.path.join(src, img_list[i])
+            image_names.append(img_name)
+        # paste in reverse order
+        image_names = image_names[::-1]
+        # sanity check: the grid parameters must not exceed the number of images
+        if len(image_names) < row * column:
+            raise ValueError("the compose grid (row * column) exceeds the number of source images!")
+
+        to_image = Image.new('RGB', (column * image_width, row * image_height))  # create the blank output image
+        # paste every strip into its slot, in order
+        for y in range(1, row + 1):
+            for x in range(1, column + 1):
+                #1 * (row=1 -1) col=1 -1
+                image_path = image_names[column * (y - 1) + x - 1]
+                print("split_image=", image_path)
+                from_image = Image.open(image_path)
+                # keep the original strip size (no resize)
+                #.resize(
+                #    (image_size, image_size),Image.ANTIALIAS)
+                to_image.paste(from_image, ((x - 1) * image_width, (y - 1) * image_height))
+                from_image.close()
+        to_image.save(save_path)
+        print("image compose finished:", save_path)
+        shutil.rmtree(src)  # remove the temporary split directory
+
+    @classmethod
+    def getScrambleImage(cls, path):
+        scramble_file_cache = cls.scrambleImage(path)
+        if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
+
+    @classmethod
+    def encode_scramble_image(cls, img_path, img_save=None):
+        if not os.path.exists(img_path):
+            return
+        image = Image.open(img_path)
+        w, h = image.size
+        #image.show()
+        file_str = str(img_path).split("=")
+        # e.g. 10_29.jpg
+        base_fn = file_str[-1].split("_")
+        blocks = int(base_fn[0])
+        if img_save == None:
+            save_path = os.path.join(os.path.dirname(img_path), ComicPath.getFileScrambleImageSave(img_path))
+        else: save_path = img_save
+        # print(type(aid),type(img_name))
+        if blocks:
+            s = blocks  # block count (parsed from the file name)
+            # print(s)
+            l = h % s  # leftover pixels after splitting
+            box_list = []
+            hz = 0
+            for i in range(s):
+                c = math.floor(h / s)
+                g = i * c
+                hz += c
+                h2 = h - c * (i + 1) - l
+                if i == 0:
+                    c += l; hz += l
+                else:
+                    g += l
+                box_list.append((0, h2, w, h - g))
+
+            # print(box_list,len(box_list))
+            item_width = w
+            # box_list.reverse()  # reversing the list would restore the original cut order
+            # print(box_list, len(box_list))
+            newh = 0
+            image_list = [image.crop(box) for box in box_list]
+            # print(box_list)
+            newimage = Image.new("RGB", (w, h))
+            for image in image_list:
+                # image.show()
+                b_w, b_h = image.size
+                newimage.paste(image, (0, newh))
+
+                newh += b_h
+            newimage.save(save_path)
+            logging.info(f"descramble finished {save_path}")
+            if os.path.exists(img_path):
+                os.remove(img_path)
+            logging.debug(f"remove {img_path}")
+            return save_path
+
+
+class CBZUtils:
+
+    @classmethod
+    def readDirsOrFiles(cls, dir, type):
+        data = []
+        files = os.listdir(dir)
+        for file in files:
+            path = os.path.join(dir, file)
+            if type == "files" and os.path.isfile(path):
+                data.append(path)
+            if type == "dirs" and os.path.isdir(path):
+                data.append(path)
+        return data
+
+    @classmethod
+    def zip_compression(cls, source_dir=None, target_file=None, remove=True):
+        target_dir = os.path.dirname(target_file)
+        if not os.path.exists(target_dir):
+            os.makedirs(target_dir)
+        if not os.path.exists(target_file) and source_dir is not None:
+            with ZipFile(target_file, mode='w') as zf:
+                for path, dir_names, filenames in os.walk(source_dir):
+                    path = Path(path)
+                    arc_dir = path.relative_to(source_dir)
+                    y = 0
+                    for filename in filenames:
+                        y = y + 1
+                        print("packing: " + str(y) + "/" + str(len(filenames)), os.path.join(source_dir, filename))
+                        zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
+            logging.info(f"packing finished: {target_file}")
+
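+    # Illustrative usage (a sketch; the comic and chapter names are hypothetical):
+    #   CBZUtils.zip_compression(
+    #       source_dir=os.path.join(IMAGES_STORE, "SomeComic", "chapter-001"),
+    #       target_file=os.path.join(CBZ_EXPORT_PATH, "SomeComic", "chapter-001.CBZ"))
+    # packComicChapterCBZ() below builds exactly these paths, descrambles any cached
+    # pages first, and then validates the archive against the expected page list.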
+    @classmethod
+    def packComicChapterCBZ(cls, comic, chapter, comic_info_images, remove=True):
+        images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
+        cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
+        if os.path.exists(images_chapter_path):
+            dirs = os.listdir(images_chapter_path)
+            for file in dirs:
+                if file.startswith(ComicPath.PREFIX_SCRAMBLE):
+                    try:
+                        imageUtils.deScrambleImagesByPath(os.path.join(images_chapter_path, file))
+                    except Exception as e:
+                        print(f"error while descrambling {file}: {e}, chapter skipped")
+                        return False
+            cls.zip_compression(images_chapter_path, cbz_chapter_path)
+            time.sleep(0.1)
+            if remove: shutil.rmtree(images_chapter_path)
+            # validation
+            cls.cbz_validate(cbz_chapter_path, comic_info_images)
+            return True
+
+    @classmethod
+    def replaceZip(cls, filepath, unpack_dir=None):
+        if not cls.compareFileDate(filepath): return None
+        if unpack_dir is None:
+            unpack_dir = str(filepath).split(".")[0]
+        with ZipFile(filepath, 'r') as fz:
+            for file in fz.namelist():
+                if file.endswith(".jpg"):
+                    data = fz.read(file)
+                    if len(data) < 500 and os.path.exists(filepath):
+                        os.remove(filepath)
+                        print(f"incomplete data, deleted: {filepath}")
+                        break
+        if cls.compareFileDate(filepath):
+            os.utime(filepath)
+            print(f"updated file time {filepath}")
+        if os.path.exists(unpack_dir):
+            shutil.rmtree(unpack_dir)
+        # delete the main.ftl file
+        # delete_filename = ''
+        # if os.path.exists(delete_filename):
+        #     os.remove(delete_filename)
+        # time.sleep(60)
+        # shutil.copy(file_path, other_dir)  # copy main.ftl into the directory to be compressed
+        # cls.zip_compression()
+        # run only when the file time is older than the threshold
+
+    @classmethod
+    def compareFileDate(cls, filepath):
+        if os.path.exists(filepath):
+            ctime = os.path.getmtime(filepath)
+            str_ctime = datetime.datetime.fromtimestamp(int(ctime))
+            file_ctime = str(str_ctime.year) + "{:0>2d}".format(str_ctime.month) + "{:0>2d}".format(
+                str_ctime.day) + "{:0>2d}".format(str_ctime.hour)
+            c_ctime = 2023011603
+        else:
+            return False
+        if int(file_ctime) < c_ctime:
+            return True
+        return False
+
+    @classmethod
+    def zip_info(cls, path, filter=True):
+        result = None
+        try:
+            with ZipFile(path, "r") as zip_file:
+                result = zip_file.namelist()
+                if filter:
+                    result.remove(COMIC_INFO_XML_FILE)
+        except Exception as e:
+            print(e)
+        return result
+
+    @classmethod
+    def cbz_validate(cls, zip_path, comic_info_images):
+        names = cls.zip_info(zip_path)
+        if names is not None and len(names) == len(comic_info_images):
+            logging.info(f"validation succeeded === {zip_path}")
+        else:
+            os.remove(zip_path)
+            logging.error(f"validation failed === {zip_path}")
\ No newline at end of file
diff --git a/Comics/utils/OldUtils.py b/Comics/utils/OldUtils.py
index 5bc9938..abbd243 100644
--- a/Comics/utils/OldUtils.py
+++ b/Comics/utils/OldUtils.py
@@ -1,16 +1,16 @@
-
-class OldUtils:
-    old_comic_name=None
-    old_chapter = None
-
-    @classmethod
-    def setOldComicName(cls,value): cls.old_comic_name = value
-
-    @classmethod
-    def setOldChapter(cls,value): cls.old_chapter=value
-
-    @classmethod
-    def getOldComicName(cls): return cls.old_comic_name
-
-    @classmethod
+
+class OldUtils:
+    old_comic_name=None
+    old_chapter = None
+
+    @classmethod
+    def setOldComicName(cls,value): cls.old_comic_name = value
+
+    @classmethod
+    def setOldChapter(cls,value): cls.old_chapter=value
+
+    @classmethod
+    def getOldComicName(cls): return cls.old_comic_name
+
+    @classmethod
     def getOldChapter(cls): return cls.old_chapter
\ No newline at end of file
diff --git a/run.py b/run.py
index c0ae72f..b6cd60f 100644
--- a/run.py
+++ b/run.py
@@ -1,5 +1,5 @@
-# -*- coding: utf-8 -*-
-
-from scrapy import cmdline
-
+# -*- coding: utf-8 -*-
+
+from scrapy import cmdline
+
 cmdline.execute("scrapy crawl rm_comic".split()) \
No newline at end of file diff --git a/scrapy.cfg b/scrapy.cfg index 93fe4e1..a18695c 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -1,11 +1,11 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = Comics.settings - -[deploy] -#url = http://localhost:6800/ -project = Comics +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = Comics.settings + +[deploy] +#url = http://localhost:6800/ +project = Comics