caiwx86 2023-06-20 02:52:51 +08:00
parent af7812794f
commit ac30f59a33
20 changed files with 1167 additions and 1060 deletions

.gitignore vendored
@@ -1,5 +1,5 @@
.scrapy/*
.vscode/*
CBZ/*
output/*
/**/__pycache__

.idea/.gitignore vendored
@@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="stable_vscode" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="stable_vscode" project-jdk-type="Python SDK" />
</project>

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ComicScrapy.iml" filepath="$PROJECT_DIR$/.idea/ComicScrapy.iml" />
</modules>
</component>
</project>

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

Comics/exporters.py
@@ -1,105 +1,130 @@
import os.path,json,ast
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
from scrapy.exporters import JsonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
from Comics.utils.Constant import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter

class CommonExporter():
    def getPath(self, file, sufix=None):
        sufix = "." + sufix
        dirname = os.path.dirname(file)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        if sufix != None and sufix not in file:
            file = file + sufix
        return file

class ItemExporter(PythonItemExporter):
    def convert(self, data):
        if isinstance(data, bytes): return data.decode("utf-8")
        if isinstance(data, dict): return dict(map(self.convert, data.items()))
        if isinstance(data, tuple): return map(self.convert, data)
        if isinstance(data, list): return [self.convert(i) for i in data]
        return data

    def export_obj(self, obj_item):
        self.start_exporting()
        obj_item = self.convert(self.export_item(obj_item))
        self.finish_exporting()
        return obj_item

class JsonExport(JsonItemExporter):
    def __init__(self, file, **kwargs):
        file = CommonExporter().getPath(file=file, sufix="json")
        self.file = open(file, "wb")
        super(JsonExport, self).__init__(self.file, **kwargs)

    def export_json(self, json_object, if_return=False):
        self.start_exporting()
        self.export_item(json_object)
        self.finish_exporting()
        self.file.close()
        if if_return:
            return ItemExporter().export_obj(json_object)

class ComicInfoXmlItemExporter(XmlItemExporter):
    custom_root_element = "ComicInfo"

    def __init__(self, comic, chapter):
        file_path = os.path.join(COMIC_INFO_XML_STORE, comic,
                                 chapter, f"{self.custom_root_element}.xml")
        dir_path = os.path.dirname(file_path)
        if not os.path.exists(dir_path): os.makedirs(dir_path)
        self.xml_file = open(file_path, "wb")
        super(ComicInfoXmlItemExporter, self).__init__(self.xml_file,
                                                       root_element=self.custom_root_element,
                                                       indent=1, fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)

    def serialize_field(self, field, name, value):
        # converted during serialization (Traditional to Simplified Chinese)
        value = ComicPath.chinese_convert(value)
        return super().serialize_field(field, name, value)

    def start_exporting(self):
        self.xg.startDocument()
        self.xg.startElement(self.custom_root_element, {})

    def comic_to_info_item(self, comic_item):
        comic_info = {}
        info_item = ItemAdapter(ComicInfoItem())
        comic_info_dict = {}
        for field in info_item.field_names():
            meta_info = info_item.get_field_meta(field).get('info')
            if meta_info is not None:
                comic_info_dict[meta_info] = field
        for key, value in ComicItem(comic_item).items():
            new_key = comic_info_dict.get(key)
            if new_key is not None:
                comic_info[new_key] = value
        return ItemExporter().export_obj(ComicInfoItem(comic_info))

    def export_item(self, item):
        comic_info = self.comic_to_info_item(item)
        child_element = "Page"
        self._beautify_indent(depth=1)
        self._beautify_newline()
        for name, value in self._get_serialized_fields(comic_info, default_value=""):
            if name == "Pages":
                value = ast.literal_eval(value)
            if value is not None or value != "":
                self._export_xml_field(name, value, depth=2, child_element=child_element)
        #self._beautify_indent(depth=1)
        return comic_info

    def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
        self._beautify_indent(depth=depth)
        self.xg.startElement(name, {})
        if hasattr(serialized_value, "items"):
            self._beautify_newline()
            for sub_name, value in serialized_value.items():
                self._export_xml_field(sub_name, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            for value in serialized_value:
                self._export_xml_field(child_element, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif isinstance(serialized_value, str):
            self.xg.characters(serialized_value)
        else:
            self.xg.characters(str(serialized_value))
        self.xg.endElement(name)
        self._beautify_newline()

    def finish_exporting(self):
        self.xg.endElement(self.custom_root_element)
        self.xg.endDocument()
        self.xml_file.close()

    def export_xml(self, item):
        self.start_exporting()
        comic_info = self.export_item(item)
        self.finish_exporting()
        return comic_info
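
The sketch below is illustrative only and not part of the commit; it shows how the exporters above are meant to be driven, mirroring the calls made in Comics/pipelines.py. The item values are invented sample data, and the opencc package must be installed because the field serializers call ComicPath.chinese_convert.

# Illustrative sketch, not part of the commit.
from Comics.exporters import ItemExporter
from Comics.items import ComicItem

item = ComicItem({"name": "SomeComic", "chapter": "第1话", "index": 1})
# export_obj() runs the field serializers and returns a plain dict with bytes decoded,
# which is what ComicsPipeline and comic_to_info_item() build on.
plain = ItemExporter().export_obj(item)
print(plain)   # roughly {'name': 'SomeComic', 'chapter': '第1话', 'index': 1}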

Comics/items.py
@@ -1,78 +1,151 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import os,Comics.settings as settings,logging
from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from Comics.utils.FileUtils import imageUtils
from scrapy.loader.processors import TakeFirst, MapCompose, Join

def serialize_to_chinese(value):
    return ComicPath.chinese_convert(value)

def serialize_to_fix_file(value):
    file = ComicPath.chinese_convert(value)
    return ComicPath.fix_file_name(file)

def _serialize_to_images(value, result_type=None):
    count = 1
    images_item = []
    image_urls = []
    for image in value:
        (image_src, scramble) = [image.get("src"), image.get("scramble")]
        count_image = settings.IMAGES_NAME_FORMAT.format(count)
        suffix = "."+str(image_src).split(".")[-1]
        image_name = count_image + suffix
        if scramble:
            de_str = str(image_src).split("/")[-1].replace(suffix, "==")
            blocks_num = imageUtils.encodeImage(de_str)
            image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
        #images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name))
        images_item.append(image_name)
        image_urls.append(image_src)
        count += 1
    logging.info(f"images_len: {len(images_item)}")
    if result_type == "image_urls": return image_urls
    else: return images_item

def serialize_to_images(value): return _serialize_to_images(value)

def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls")

class ListComicItem(Item):
    name = Field()
    link = Field()

class ComicItem(Item):
    # index number
    index = Field(output_processor=TakeFirst())
    # comic name
    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # chapter name
    chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # image links
    list_img = Field(serializer=serialize_to_images)
    # author
    author = Field(serialize_to_chinese=serialize_to_chinese, output_processor=TakeFirst())
    # cover link
    icon = Field(output_processor=TakeFirst())
    # tags
    tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # summary
    dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # date
    date = Field(output_processor=TakeFirst())
    # genre
    genre = Field(output_processor=TakeFirst())
    # age rating
    age_rating = Field(output_processor=TakeFirst())

    images_old = Field(serializer=serialize_to_images)
    images = Field(serializer=serialize_to_images)
    image_urls = Field(serializer=serialize_to_image_urls)
    images_name = Field()

class ImagesItem(Item):
    image_name = Field()
    image_url = Field()
    image_path = Field()
    images = Field()
    image_urls = Field()
    comic = Field()

def serializer_info_writer(value):
    list_value = []
    str(value).replace("&", " ")
    for v in str(value).split(" "):
        list_value.append(v)
    return ",".join(list_value)

# result_type: name
def _serializer_info_imagesa(value, result_type=None):
    info = []
    for success, img in value:
        img_path = os.path.join(settings.IMAGES_STORE, img['path'])
        if result_type == 'name':
            info.append(ComicPath().getFileScrambleImageSave(img_path,True,False))
        else:
            info.append(img_path)
    if result_type == "len":
        value = len(info)
    else:
        value = info
    return value

def _serialize_info_images(value, result_type=None):
    images = []
    for image in value:
        images.append(ComicPath().getFileScrambleImageSave(image,True,False))
    if result_type == "count":
        return len(images)
    else:
        return images

def serializer_info_images(value): return _serialize_info_images(value)
def serializer_info_images_count(value): return _serialize_info_images(value, "count")
def serializer_info_images_completed(value):
    return _serialize_info_images(value, result_type='name')
def serializer_info_images_count(value):
    return _serialize_info_images(value, result_type='len')

class ComicInfoItem(Item):
    Title = Field(info='chapter')  # chapter name, True
    Series = Field(info='name')  # comic name, True
    Number = Field(info='index')  # index number, True
    SeriesGroup = Field()  # alias, False
    Summary = Field(info='dep')  # summary, True
    Year = Field()  # year, False
    Month = Field()  # month, False
    Day = Field()  # day, False
    Writer = Field(info='author',serializer=serializer_info_writer)  # author, True
    Publisher = Field()  # publisher, False
    Genre = Field(info='genre')  # genre, True
    Tags = Field(info='tags')  # tags, True
    Web = Field()  # homepage, False
    #PageCount = Field()  # total pages, True
    PageCount = Field(info='images',serializer=serializer_info_images_count)  # total pages, True
    LanguageISO = Field()  # language, True
    AgeRating = Field(info='age_rating')  # age rating, False
    #Pages = Field(info='images_name', serializer=serializer_info_images_completed)  # pages, True
    Pages = Field(info='images', serializer=serializer_info_images)  # pages, True
# ComicInfo.xml and ComicChapter.json end
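
The sketch below is illustrative only and not part of the commit; it shows how the Field(info=...) metadata on ComicInfoItem is consumed to remap ComicItem keys, the same lookup that comic_to_info_item() in Comics/exporters.py performs. The item values are invented.

# Illustrative sketch, not part of the commit.
from itemadapter import ItemAdapter
from Comics.items import ComicItem, ComicInfoItem

adapter = ItemAdapter(ComicInfoItem())
mapping = {}
for field in adapter.field_names():
    info = adapter.get_field_meta(field).get("info")
    if info is not None:
        mapping[info] = field        # e.g. 'chapter' -> 'Title', 'name' -> 'Series'

comic = ComicItem({"name": "SomeComic", "chapter": "ch-001", "index": 2})
comic_info = {mapping[k]: v for k, v in comic.items() if k in mapping}
print(comic_info)                    # {'Series': 'SomeComic', 'Title': 'ch-001', 'Number': 2}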

Comics/loader.py
@@ -1,44 +1,56 @@
import json
from scrapy.loader import ItemLoader

class ComicLoader(ItemLoader):
    def parseExec(cls, data, exec):
        if data != None and exec != None:
            dots = str(exec).split(".")
            if not isinstance(data, dict): data = json.loads(data)
            for dot in dots:
                data = data.get(dot)
        return data

    def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        if exec is not None:
            values = self.parseExec(values, exec)
        if index is not None:
            values = values[index]
        self.add_value(field_name, values, *processors, re=re, **kw)

    def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
        if str_exec is not None:
            value = self.parseExec(value, str_exec)
        self.add_value(field_name, value, *processors, re=re, **kw)

    def get_exec(self, value, str_exec):
        return self.parseExec(value, str_exec)

    def add_value(self, field_name, value, *processors, re=None, **kw):
        if self.auto_replace_value(field_name, value):
            return super().add_value(field_name, value, *processors, re=re, **kw)

    def auto_replace_value(self, field_name, value):
        if self.get_output_value(field_name) != None:
            self._replace_value(field_name, value)
            return False
        else: return True
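
The sketch below is illustrative only and not part of the commit; it exercises ComicLoader.parseExec, which walks a dotted path through JSON text (or an already-parsed dict), and the overridden add_value, which replaces an existing output value instead of appending to it. The JSON payload is invented.

# Illustrative sketch, not part of the commit.
from Comics.loader import ComicLoader
from Comics.items import ComicItem

loader = ComicLoader(item=ComicItem())
data = '{"props": {"pageProps": {"chapterName": "第3话"}}}'
print(loader.parseExec(data, "props.pageProps.chapterName"))   # 第3话

loader.add_value("chapter", "old value")
loader.add_value("chapter", "new value")      # auto_replace_value() swaps the value
print(loader.get_output_value("chapter"))     # new value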

Comics/middlewares.py
@@ -1,110 +1,110 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random,logging
from pathlib import Path
from Comics.settings import PROXY_LIST

# useful for handling different item types with a single interface
logger = logging.getLogger(__name__)

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        if len(PROXY_LIST) != 0:
            request.meta["proxy"] = random.choice(PROXY_LIST)

class ComicsSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class ComicsDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

Comics/pipelines.py
@@ -1,81 +1,63 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import os, scrapy,logging,time,random
from Comics import settings
from Comics.utils.FileUtils import imageUtils
from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImagesItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.exporters import ItemExporter
from Comics.exporters import JsonExport
from Comics.utils.FileUtils import CBZUtils

class ComicsPipeline:
    def open_spider(self, spider):
        pass

    # "item" is the object yielded by the spider
    def process_item(self, item, spider):
        if isinstance(item, ComicItem):
            file = os.path.join(settings.OUTPUT_DIR, "json", item['name'], item['chapter'])
            data = JsonExport(file=file).export_json(item, if_return=True)
            #item['images'] = data['images']
            return data
    # image parsing

    def close_spider(self, spider):
        pass

class ImgDownloadPipeline(ImagesPipeline):
    def file_exits(self, image_path):
        en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
        return os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path))

    def file_full_path(self, item, image): return os.path.join(item['name'], item['chapter'], image)

    def file_path(self, request, response=None, info=None, *, item=None): return request.meta['path']

    def get_media_requests(self, item, info):
        for image_url, image_path in zip(item['image_urls'], item['images']):
            image_path = self.file_full_path(item, image_path)
            if self.file_exits(image_path):
                logging.info(f"file exists: {image_path}")
            else:
                logging.info(f"downloading {image_url} --> {image_path}")
                yield scrapy.Request(url=image_url, meta={'path': image_path})

    def item_completed(self, results, item, info):
        item['images_name'] = results
        # return item
        # generate the ComicInfo.xml
        comic_info = ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
        # pack into a CBZ archive
        CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'],
                                     comic_info_images=comic_info["Pages"], remove=False)
        time.sleep(random.randint(5, 10))
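
The sketch below is illustrative only and not part of the commit; it reproduces, in plain Python, how get_media_requests() above pairs item['image_urls'] with item['images'] and derives the on-disk path <name>/<chapter>/<image name> for each download. The values are invented sample data.

# Illustrative sketch, not part of the commit.
import os

item = {
    "name": "SomeComic",
    "chapter": "ch-001",
    "image_urls": ["https://example.com/a.jpg", "https://example.com/b.jpg"],
    "images": ["001.jpg", "scramble=7_002.jpg"],
}
for image_url, image_name in zip(item["image_urls"], item["images"]):
    image_path = os.path.join(item["name"], item["chapter"], image_name)
    print(image_url, "->", image_path)   # e.g. https://example.com/a.jpg -> SomeComic/ch-001/001.jpg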

Comics/settings.py
@@ -1,130 +1,137 @@
# Scrapy settings for Comics project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from fake_useragent import UserAgent
import os

BOT_NAME = 'Comics'

SPIDER_MODULES = ['Comics.spiders']
NEWSPIDER_MODULE = 'Comics.spiders'

OUTPUT_DIR = "output"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
USER_AGENT = UserAgent().random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

HTTPERROR_ALLOWED_CODES = [200, 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
IMAGES_STORE = os.path.join(OUTPUT_DIR, 'images')
IMAGES_NAME_FORMAT = "{:0>3d}"
COMIC_INFO_XML_STORE = IMAGES_STORE
DOWNLOAD_DELAY = 0
# Retries
RETRY_ENABLED = True
RETRY_TIMES = 10  # set to however many retries you want
# The following line is optional
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
PROXY_LIST = [
    "http://127.0.0.1:7890",
]
# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Comics.middlewares.ComicsSpiderMiddleware': 543,
#    'Comics.middlewares.ProxyMiddleware' : 100,
#    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'Comics.middlewares.ProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline' : 1,
    'Comics.pipelines.ComicsPipeline': 300,
    # 'Comics.pipelines.ImageParsePipeline': 400,
    'Comics.pipelines.ImgDownloadPipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404]
#HTTPCACHE_STORAGE = 'Comics.middlewares.MyFilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Logging configuration
LOG_LEVEL = "INFO"   # log level
LOG_STDOUT = True    # capture standard output in the log

CBZ_EXPORT_PATH = "CBZ"
# Export field order for the ComicInfo data
COMIC_INFO_XML_FILE = "ComicInfo.xml"
COMIC_INFO_FIELDS_TO_EXPORT = [
    "Title",
    "Series",
    "Number",
    "SeriesGroup",
    "Summary",
    "Year",
    "Month",
    "Day",
    "Writer",
    "Publisher",
    "Genre",
    "Tags",
    "Web",
    "PageCount",
    "LanguageISO",
    "AgeRating",
    "Pages"
]

Comics/spiders/__init__.py
@@ -1,4 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Comics/spiders/rm_comic.py
@@ -1,62 +1,75 @@
import scrapy,logging,time
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.items import ListComicItem

class RmComicSpider(scrapy.Spider):
    name = 'rm_comic'
    allowed_domains = ['rm01.xyz']
    main_url = 'https://rm01.xyz'
    start_urls = 'https://rm01.xyz/books'

    def start_requests(self):
        yield scrapy.Request(self.start_urls, callback=self.books_comic)

    def books_comic(self, response):
        books_comic = ComicLoader(item=ListComicItem(), response=response)
        data = books_comic.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        str_exec = "props.pageProps.books"
        books = books_comic.get_exec(data, str_exec=str_exec)
        for book in books:
            books_comic.add_value('link', book['id'])
            logging.info(f"downloading books %s" % book['name'])
            time.sleep(3)
            yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)

    # Fetch the data for a single comic; after collecting
    # its chapter links, move on to the next stage
    def parse_comic(self, response):
        comic_item = ComicLoader(item=ComicItem(), response=response)
        comic_item.add_xpath('name', '//div[@class="col"]/h5/text()')
        comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src')
        comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
        comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
        comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
        comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
        comic_item.add_value('genre', "韩漫")
        comic_item.add_value('age_rating', "R18+")
        chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
                                            '//div[contains(@class,"bookid_chapter")]/a/@href')
        #chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
        #                          '//div[contains(@class,"bookid_chapter")]/a/text()').extract()
        #for chapter, link in zip(chapters, chapter_href):
        for i, link in enumerate(chapter_href, start=1):
            yield scrapy.Request(self.main_url+link, meta={'item': comic_item.load_item(), 'num': i}, callback=self.parse_chapter)

    # Read all images of a chapter
    def parse_chapter(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        str_exec = "props.pageProps."
        #comic_item.add_exec('name', data, str_exec=str_exec+"bookName")
        #comic_item.add_exec('dep', data, str_exec=str_exec+"description")
        comic_item.add_value('index', response.meta['num'])
        comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName")
        comic_item.add_exec('image_urls', data, str_exec+"images")
        comic_item.add_exec('images', data, str_exec+"images")
        comic = comic_item.load_item()
        chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath")
        if chapter_api_url is not None:
            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
        else:
            yield comic

    # Handle the API that returns the encrypted chapter data
    def parse_chapter_api(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        comic_item.add_exec('chapter', response.text, str_exec='chapter.name')
        comic_item.add_exec('image_urls', response.text, str_exec='chapter.images')
        comic_item.add_exec('images', response.text, str_exec='chapter.images')
        yield comic_item.load_item()

    def parse(self, response):
        raise NotImplementedError
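
The sketch below is illustrative only and not part of the commit; it starts the spider programmatically with the stock Scrapy API, equivalent to running "scrapy crawl rm_comic" from the project root. The module path Comics/spiders/rm_comic.py is an assumption inferred from SPIDER_MODULES and the spider name; it is not shown in the diff.

# Illustrative sketch, not part of the commit.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from Comics.spiders.rm_comic import RmComicSpider   # assumed module path

process = CrawlerProcess(get_project_settings())
process.crawl(RmComicSpider)
process.start()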

Comics/utils/Constant.py
@@ -1,39 +1,48 @@
import os.path
import re
from opencc import OpenCC

class ComicPath:
    PREFIX_SCRAMBLE = "scramble="

    @classmethod
    def getDirComicChapter(cls):
        return None

    @classmethod
    def getFileScrambleImageName(cls, count, block, suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix

    @classmethod
    def getFileScrambleImageSave(cls, file, relative=False, is_prefix=True):
        file_name = str(file).split("_")[-1]
        if relative:
            file_name = os.path.basename(file_name)
            if relative == "fullpath":
                file_name = os.path.join(os.path.dirname(file), file_name)
        if not is_prefix:
            return file_name.split(".")[0]
        else:
            return file_name

    # convert Traditional Chinese to Simplified Chinese
    @classmethod
    def chinese_convert(cls, text, convert='t2s'): return OpenCC(convert).convert(str(text))

    # normalize into a valid file name
    @classmethod
    def fix_file_name(cls, filename, replace=None):
        if not isinstance(filename, str):
            return filename
        in_tab = r'[?*/\|.:><]'
        str_replace = ""
        if replace is not None:
            str_replace = replace
        filename = re.sub(in_tab, str_replace, filename)
        count = 1
        while True:
            str_file = filename[0-count]
            if str_file == " ":
                count += 1
            else:
                filename = filename[0:len(filename)+1-count]
                break
        return filename
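
The sketch below is illustrative only and not part of the commit; it shows the round trip between getFileScrambleImageName and getFileScrambleImageSave, and what fix_file_name does to characters that are not allowed in file names. The inputs are invented examples.

# Illustrative sketch, not part of the commit.
from Comics.utils.Constant import ComicPath

stored = ComicPath.getFileScrambleImageName(count="002", block=7, suffix=".jpg")
print(stored)                                                   # scramble=7_002.jpg
print(ComicPath.getFileScrambleImageSave(stored))               # 002.jpg
print(ComicPath.getFileScrambleImageSave(stored, True, False))  # 002
print(ComicPath.fix_file_name("第1话: 序章?"))                   # 第1话 序章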

View File

@ -1,340 +1,361 @@
import base64,hashlib,os,shutil import base64,hashlib,os,shutil
import math,time,json,datetime,logging import math,time,json,datetime,logging
from PIL import Image from PIL import Image
from Comics.utils.Constant import ComicPath from Comics.utils.Constant import ComicPath
from pathlib import Path from pathlib import Path
from zipfile import ZipFile from zipfile import ZipFile
from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
class fileUtils: class fileUtils:
@classmethod @classmethod
def save_file(cls,path,data): def save_file(cls,path,data):
dir = os.path.dirname(path) dir = os.path.dirname(path)
if not os.path.exists(dir): if not os.path.exists(dir):
os.makedirs(dir) os.makedirs(dir)
with open(path,'w',encoding='utf-8') as fs: with open(path,'w',encoding='utf-8') as fs:
fs.write(str(data)) fs.write(str(data))
fs.close() fs.close()
class CommonUtils: @classmethod
@classmethod def path(cls, file):
def parseExec(cls,data,exec): base_dir = os.path.dirname(file)
if data !=None and exec != None: if not os.path.exists(base_dir): os.makedirs(base_dir)
dots = str(exec).split(".") return file
if not isinstance(data,dict): data = json.loads(data)
for dot in dots: class CommonUtils:
data = data.get(dot) @classmethod
return data def parseExec(cls,data,exec):
if data !=None and exec != None:
class imageUtils: dots = str(exec).split(".")
if not isinstance(data,dict): data = json.loads(data)
@classmethod for dot in dots:
def deScrambleImagesByDir(cls,chapter_dir): data = data.get(dot)
scramble_count = 0 return data
if os.path.exists(chapter_dir): #获取章节图片路径
dirs = os.listdir(chapter_dir) class imageUtils:
for img in dirs:
if img.startswith(ComicPath.PREFIX_SCRAMBLE): @classmethod
imageUtils.encode_scramble_image(os.path.join(chapter_dir,img)) def descramble_images_by_dir(cls, chapter_dir):
scramble_count += 1 if os.path.isfile(chapter_dir):
logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}") chapter_dir = os.path.dirname(chapter_dir)
return scramble_count scramble_count = 0
if os.path.exists(chapter_dir): #获取章节图片路径
@classmethod while ComicPath.PREFIX_SCRAMBLE in os.listdir(chapter_dir):
def deScrambleImagesByPath(cls, img_path, img_save=None): for img in os.listdir(chapter_dir):
if os.path.basename(img_path).startswith(ComicPath.PREFIX_SCRAMBLE): if img.startswith(ComicPath.PREFIX_SCRAMBLE):
img_path = imageUtils.encode_scramble_image(img_path, img_save) imageUtils.encode_scramble_image(os.path.join(chapter_dir, img))
return img_path scramble_count += 1
logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}")
@classmethod return scramble_count
def encodeImage(cls,str_en):
#print("en",str_en) @classmethod
enc = base64.b64decode(str_en) def deScrambleImagesByPath(cls, img_path, img_save=None):
#print("解密:",enc) if os.path.basename(img_path).\
m = hashlib.md5() startswith(ComicPath.PREFIX_SCRAMBLE) and os.path.exists(img_path):
m.update(enc) img_path = imageUtils.encode_scramble_image(img_path, img_save)
md5 = m.digest() return img_path
d = md5[-1]
#print(md5) @classmethod
try: def encodeImage(cls,str_en):
blocks = d % 10 + 5 #print("en",str_en)
except: enc = base64.b64decode(str_en)
blocks = 0 %10 + 5 #print("解密:",enc)
#print("blocks=",blocks) m = hashlib.md5()
return blocks m.update(enc)
md5 = m.digest()
@classmethod d = md5[-1]
def scrambleImage(cls,file_path): #print(md5)
#检测到未下载完的图像 直接返回None try:
if str(file_path).endswith(".downloads"): blocks = d % 10 + 5
os.remove(file_path) except:
return None blocks = 0 %10 + 5
file_str = str(file_path).split("=") #print("blocks=",blocks)
#10_29.jpg return blocks
base_dir = file_str[0].replace("scramble","")
base_name = file_str[-1] @classmethod
base_fn = base_name.split("_") def scrambleImage(cls,file_path):
save_name = base_fn[1] #检测到未下载完的图像 直接返回None
save_name_delesu = save_name.split(".")[0] if str(file_path).endswith(".downloads"):
blocks = int(base_fn[0]) os.remove(file_path)
save_file_path = os.path.join(base_dir,save_name) return None
print("sva",save_file_path) file_str = str(file_path).split("=")
if os.path.exists(save_file_path): #10_29.jpg
print("图片已解密,已跳过:", save_file_path) base_dir = file_str[0].replace("scramble","")
return None base_name = file_str[-1]
image_su = str(file_path).split(".")[-1] base_fn = base_name.split("_")
try: save_name = base_fn[1]
img = Image.open(file_path) save_name_delesu = save_name.split(".")[0]
except: blocks = int(base_fn[0])
print(f"error Image: {file_path}") save_file_path = os.path.join(base_dir,save_name)
width = img.width print("sva",save_file_path)
height = img.height if os.path.exists(save_file_path):
#blocks = cls.encodeImage(enStr) print("图片已解密,已跳过:", save_file_path)
print("blocks=",blocks) return None
block_height = int(height / blocks) image_su = str(file_path).split(".")[-1]
block_width = int(width / blocks) try:
print("blockHeight=",block_height) img = Image.open(file_path)
suffix = str(file_path).split(".")[-1] except:
split_path = os.path.join(base_dir,save_name_delesu+"split") print(f"error Image: {file_path}")
if image_su == "downloads": width = img.width
return None height = img.height
is_split = cls.splitimage(file_path,blocks,1,split_path) #blocks = cls.encodeImage(enStr)
if is_split != None: print("blocks=",blocks)
cls.image_compose(split_path,blocks,1,save_file_path,block_height,width) block_height = int(height / blocks)
else: block_width = int(width / blocks)
if os.path.exists(split_path): print("blockHeight=",block_height)
shutil.rmtree(split_path) suffix = str(file_path).split(".")[-1]
if os.path.exists(file_path): split_path = os.path.join(base_dir,save_name_delesu+"split")
shutil.move(file_path, save_file_path) if image_su == "downloads":
#完成后清空 return None
return file_path is_split = cls.splitimage(file_path,blocks,1,split_path)
if is_split != None:
@classmethod cls.image_compose(split_path,blocks,1,save_file_path,block_height,width)
def splitimage(cls,src,rownum,colnum,dstpath): else:
img=Image.open(src) if os.path.exists(split_path):
w,h=img.size shutil.rmtree(split_path)
if rownum<= h and colnum<=w: if os.path.exists(file_path):
s=os.path.split(src) shutil.move(file_path, save_file_path)
if dstpath=='': #完成后清空
dstpath = s[0] return file_path
if not os.path.exists(dstpath):
os.makedirs(dstpath) @classmethod
fn=s[1].split('.') def splitimage(cls,src,rownum,colnum,dstpath):
basename=fn[0] img=Image.open(src)
ext=fn[-1] w,h=img.size
num=0 if rownum<= h and colnum<=w:
rowheight=h//rownum s=os.path.split(src)
colwidth=w//colnum if dstpath=='':
for r in range(rownum): dstpath = s[0]
for c in range(colnum): if not os.path.exists(dstpath):
box=(c*colwidth,r*rowheight,(c+1)*colwidth,(r+1)*rowheight) os.makedirs(dstpath)
count_image = "{:0>3d}".format(num) fn=s[1].split('.')
file_path = os.path.join(dstpath,str(count_image)+'.'+ext) basename=fn[0]
print("file_path=",file_path) ext=fn[-1]
img.crop(box).save(file_path) num=0
num=num+1 rowheight=h//rownum
return "成功" colwidth=w//colnum
else: for r in range(rownum):
print('不数!') for c in range(colnum):
return None box=(c*colwidth,r*rowheight,(c+1)*colwidth,(r+1)*rowheight)
count_image = "{:0>3d}".format(num)
@classmethod file_path = os.path.join(dstpath,str(count_image)+'.'+ext)
def image_compose(cls,src,row,column,save_path,image_height,image_width): print("file_path=",file_path)
image_size = image_height img.crop(box).save(file_path)
#image_height = 376 num=num+1
#image_width = 720 return "成功"
images_format = ['.png','.jpg'] else:
print('不数!')
#image_names = [name for name in os.listdir(src) for item in images_format if return None
# os.path.splitext(name)[1] == item][::-1]
img_list=os.listdir(src) @classmethod
img_list.sort() def image_compose(cls,src,row,column,save_path,image_height,image_width):
img_list.sort(key=lambda x: int(x[:-4])) image_size = image_height
##文件名按数字排序 #image_height = 376
img_nums=len(img_list) #image_width = 720
image_names = [] images_format = ['.png','.jpg']
for i in range(img_nums):
img_name=os.path.join(src,img_list[i]) #image_names = [name for name in os.listdir(src) for item in images_format if
image_names.append(img_name) # os.path.splitext(name)[1] == item][::-1]
#使用倒序 img_list=os.listdir(src)
image_names = image_names[::-1] img_list.sort()
# simple sanity check: the row/column parameters must match the number of images img_list.sort(key=lambda x: int(x[:-4]))
if len(image_names) < row * column: ## sort file names numerically
raise ValueError("compose parameters do not match the number of images!") img_nums=len(img_list)
image_names = []
to_image = Image.new('RGB', (column * image_width, row * image_height)) # create a new blank canvas for i in range(img_nums):
# iterate and paste each image into its slot, in order img_name=os.path.join(src,img_list[i])
for y in range(1, row + 1): image_names.append(img_name)
for x in range(1, column + 1): # use reverse order
#1 * (row=1 -1) col=1 -1 image_names = image_names[::-1]
image_path = image_names[column * (y - 1) + x - 1] # simple sanity check: the row/column parameters must match the number of images
print("split_image=",image_path) if len(image_names) < row * column:
from_image = Image.open(image_path) raise ValueError("compose parameters do not match the number of images!")
# keep the original image size
#.resize( to_image = Image.new('RGB', (column * image_width, row * image_height)) # create a new blank canvas
# (image_size, image_size),Image.ANTIALIAS) # iterate and paste each image into its slot, in order
to_image.paste(from_image, ((x - 1) * image_size, (y - 1) * image_size)) for y in range(1, row + 1):
from_image.close() for x in range(1, column + 1):
to_image.save(save_path) #1 * (row=1 -1) col=1 -1
print("图片合并完成:", save_path) image_path = image_names[column * (y - 1) + x - 1]
shutil.rmtree(src) print("split_image=",image_path)
# save the composed image from_image = Image.open(image_path)
# keep the original image size
@classmethod #.resize(
def getScrambleImage(cls,path): # (image_size, image_size),Image.ANTIALIAS)
scramble_file_cache = cls.scrambleImage(path) to_image.paste(from_image, ((x - 1) * image_size, (y - 1) * image_size))
if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache) from_image.close()
to_image.save(save_path)
@classmethod print("image compose finished:", save_path)
def encode_scramble_image(cls,imgpath,img_save=None): shutil.rmtree(src)
image = Image.open(imgpath) # save the composed image
w, h = image.size
#image.show() @classmethod
file_str = str(imgpath).split("=") def getScrambleImage(cls,path):
#10_29.jpg scramble_file_cache = cls.scrambleImage(path)
base_fn = file_str[-1].split("_") if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
blocks = int(base_fn[0])
if img_save == None: @classmethod
save_path = os.path.join(os.path.dirname(imgpath),ComicPath.getFileScrambleImageSave(imgpath)) def encode_scramble_image(cls, img_path, img_save=None):
else: save_path = img_save if not os.path.exists(img_path):
# print(type(aid),type(img_name)) return
if blocks: image = Image.open(img_path)
s = blocks # random value w, h = image.size
# print(s) #image.show()
l = h % s # leftover remainder after splitting file_str = str(img_path).split("=")
box_list = [] #10_29.jpg
hz = 0 base_fn = file_str[-1].split("_")
for i in range(s): blocks = int(base_fn[0])
c = math.floor(h / s) if img_save == None:
g = i * c save_path = os.path.join(os.path.dirname(img_path),ComicPath.getFileScrambleImageSave(img_path))
hz += c else: save_path = img_save
h2 = h - c * (i + 1) - l # print(type(aid),type(img_name))
if i == 0: if blocks:
c += l;hz += l s = blocks # random value
else: # print(s)
g += l l = h % s # leftover remainder after splitting
box_list.append((0, h2, w, h - g)) box_list = []
hz = 0
# print(box_list,len(box_list)) for i in range(s):
item_width = w c = math.floor(h / s)
# box_list.reverse() # reverse the list to restore the original slice order g = i * c
# print(box_list, len(box_list)) hz += c
newh = 0 h2 = h - c * (i + 1) - l
image_list = [image.crop(box) for box in box_list] if i == 0:
# print(box_list) c += l;hz += l
newimage = Image.new("RGB", (w, h)) else:
for image in image_list: g += l
# image.show() box_list.append((0, h2, w, h - g))
b_w, b_h = image.size
newimage.paste(image, (0, newh)) # print(box_list,len(box_list))
item_width = w
newh += b_h # box_list.reverse() # reverse the list to restore the original slice order
newimage.save(save_path) # print(box_list, len(box_list))
print("解密成功=",save_path) newh = 0
if os.path.exists(imgpath): image_list = [image.crop(box) for box in box_list]
os.remove(imgpath) # print(box_list)
print("remove=",imgpath) newimage = Image.new("RGB", (w, h))
return save_path for image in image_list:
# image.show()
b_w, b_h = image.size
class CBZUtils: newimage.paste(image, (0, newh))
@classmethod newh += b_h
def readDirsOrFiles(cls, dir, type): newimage.save(save_path)
data = [] logging.info(f"descramble succeeded {save_path}")
files = os.listdir(dir) if os.path.exists(img_path):
for file in files: os.remove(img_path)
path = os.path.join(dir, file) logging.debug(f"remove {img_path}")
if type == "files" and os.path.isfile(path): return save_path
data.append(path)
if type == "dirs" and os.path.isdir(path):
data.append(path) class CBZUtils:
return data
@classmethod
@classmethod def readDirsOrFiles(cls, dir, type):
def zip_compression(cls, source_dir=None, target_file=None, remove=True): data = []
target_dir = os.path.dirname(target_file) files = os.listdir(dir)
if not os.path.exists(target_dir): for file in files:
os.makedirs(target_dir) path = os.path.join(dir, file)
if not os.path.exists(target_file) and source_dir is not None: if type == "files" and os.path.isfile(path):
with ZipFile(target_file, mode='w') as zf: data.append(path)
for path, dir_names, filenames in os.walk(source_dir): if type == "dirs" and os.path.isdir(path):
path = Path(path) data.append(path)
arc_dir = path.relative_to(source_dir) return data
y = 0
for filename in filenames: @classmethod
y = y + 1 def zip_compression(cls, source_dir=None, target_file=None, remove=True):
print("打包中:" + str(y) + "/" + str(len(filenames)), os.path.join(source_dir, filename)) target_dir = os.path.dirname(target_file)
zf.write(path.joinpath(filename), arc_dir.joinpath(filename)) if not os.path.exists(target_dir):
zf.close() os.makedirs(target_dir)
logging.info(f"打包完成:{target_file}") if not os.path.exists(target_file) and source_dir is not None:
with ZipFile(target_file, mode='w') as zf:
@classmethod for path, dir_names, filenames in os.walk(source_dir):
def packComicChapterCBZ(cls, comic, chapter, remove=True): path = Path(path)
images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter) arc_dir = path.relative_to(source_dir)
cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ" y = 0
if os.path.exists(images_chapter_path): for filename in filenames:
dirs = os.listdir(images_chapter_path) y = y + 1
for file in dirs: print("packing: " + str(y) + "/" + str(len(filenames)), os.path.join(source_dir, filename))
if file.startswith(ComicPath.PREFIX_SCRAMBLE): zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
try: zf.close()
os.remove(file) logging.info(f"packing finished: {target_file}")
except Exception as e:
print(f"删除 {file} 发生错误 {e},已跳过") @classmethod
return False def packComicChapterCBZ(cls, comic, chapter, comic_info_images, remove=True):
cls.zip_compression(images_chapter_path, cbz_chapter_path) images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
time.sleep(0.1) cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
if remove: shutil.rmtree(images_chapter_path) if os.path.exists(images_chapter_path):
return True dirs = os.listdir(images_chapter_path)
for file in dirs:
@classmethod if file.startswith(ComicPath.PREFIX_SCRAMBLE):
def replaceZip(cls, filepath, unpack_dir=None): try:
if not cls.compareFileDate(filepath): return None imageUtils.deScrambleImagesByPath(os.path.join(images_chapter_path,file))
if unpack_dir == None: except Exception as e:
unpack_dir = str(filepath).split(".")[0] print(f"error while deleting {file}: {e}, skipped")
fz = ZipFile(filepath, 'r') return False
for file in fz.namelist(): cls.zip_compression(images_chapter_path, cbz_chapter_path)
if file.endswith(".jpg"): time.sleep(0.1)
data = fz.read(file) if remove: shutil.rmtree(images_chapter_path)
if len(data) < 500 and os.path.exists(filepath): # validation
os.remove(filepath) cls.cbz_validate(cbz_chapter_path, comic_info_images)
print(f"数据不完整,已删除:{filepath}") return True
if cls.compareFileDate(filepath):
os.utime(filepath) @classmethod
print(f"已更新文件时间 {filepath}") def replaceZip(cls, filepath, unpack_dir=None):
if os.path.exists(unpack_dir): if not cls.compareFileDate(filepath): return None
shutil.rmtree(unpack_dir) if unpack_dir == None:
# delete the main.ftl file unpack_dir = str(filepath).split(".")[0]
# delete_filename = '' fz = ZipFile(filepath, 'r')
# if os.path.exists(delete_filename): for file in fz.namelist():
# os.remove(delete_filename) if file.endswith(".jpg"):
# time.sleep(60) data = fz.read(file)
# shutil.copy(src_path, dst_dir); copy main.ftl into the directory to be compressed if len(data) < 500 and os.path.exists(filepath):
# cls.zip_compression() os.remove(filepath)
# run only if the timestamp is older print(f"incomplete data, deleted: {filepath}")
if cls.compareFileDate(filepath):
@classmethod os.utime(filepath)
def compareFileDate(cls, filepath): print(f"updated file timestamp {filepath}")
if os.path.exists(filepath): if os.path.exists(unpack_dir):
ctime = os.path.getmtime(filepath) shutil.rmtree(unpack_dir)
str_ctime = datetime.fromtimestamp(int(ctime)) # delete the main.ftl file
file_ctime = str(str_ctime.year) + "{:0>2d}".format(str_ctime.month) + "{:0>2d}".format( # delete_filename = ''
str_ctime.day) + "{:0>2d}".format(str_ctime.hour) # if os.path.exists(delete_filename):
c_ctime = 2023011603 # os.remove(delete_filename)
else: # time.sleep(60)
return False # shutil.copy(src_path, dst_dir); copy main.ftl into the directory to be compressed
if int(file_ctime) < c_ctime: # cls.zip_compression()
return True # run only if the timestamp is older
return False
@classmethod
@classmethod def compareFileDate(cls, filepath):
def zip_info(cls, path, filter=True): if os.path.exists(filepath):
result = None ctime = os.path.getmtime(filepath)
try: str_ctime = datetime.fromtimestamp(int(ctime))
with ZipFile(path, "r") as zip_file: file_ctime = str(str_ctime.year) + "{:0>2d}".format(str_ctime.month) + "{:0>2d}".format(
result = zip_file.namelist() str_ctime.day) + "{:0>2d}".format(str_ctime.hour)
if filter: c_ctime = 2023011603
result.remove(COMIC_INFO_XML_FILE) else:
except Exception as e: return False
print(e) if int(file_ctime) < c_ctime:
return result return True
return False
@classmethod
def zip_info(cls, path, filter=True):
result = None
try:
with ZipFile(path, "r") as zip_file:
result = zip_file.namelist()
if filter:
result.remove(COMIC_INFO_XML_FILE)
except Exception as e:
print(e)
return result
@classmethod
def cbz_validate(cls, zip_path, comic_info_images):
if len(cls.zip_info(zip_path)) == len(comic_info_images):
logging.info(f"validating successfully === {zip_path}")
else:
os.remove(zip_path)
logging.error(f"validating fail === {zip_path}")

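And a minimal standalone sketch of the strip-reordering math that encode_scramble_image above implements (the function name, paths and the explicit blocks argument are illustrative; the real code parses blocks from the "10_"-style filename prefix):

    import math
    from PIL import Image

    def descramble(src_path, dst_path, blocks):
        # Cut the page into `blocks` horizontal strips, counted from the bottom,
        # with the h % blocks remainder folded into the first strip, then paste
        # them back top-to-bottom -- the same box list built in encode_scramble_image.
        image = Image.open(src_path)
        w, h = image.size
        strip = math.floor(h / blocks)
        rest = h % blocks
        boxes = []
        for i in range(blocks):
            top = h - strip * (i + 1) - rest
            bottom = h if i == 0 else h - strip * i - rest
            boxes.append((0, top, w, bottom))
        out = Image.new("RGB", (w, h))
        y = 0
        for box in boxes:
            piece = image.crop(box)
            out.paste(piece, (0, y))
            y += piece.size[1]
        out.save(dst_path)

    # e.g. descramble("scramble=10_29.jpg", "29.jpg", blocks=10)  # hypothetical paths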

@@ -1,16 +1,16 @@
class OldUtils: class OldUtils:
old_comic_name=None old_comic_name=None
old_chapter = None old_chapter = None
@classmethod @classmethod
def setOldComicName(cls,value): cls.old_comic_name = value def setOldComicName(cls,value): cls.old_comic_name = value
@classmethod @classmethod
def setOldChapter(cls,value): cls.old_chapter=value def setOldChapter(cls,value): cls.old_chapter=value
@classmethod @classmethod
def getOldComicName(cls): return cls.old_comic_name def getOldComicName(cls): return cls.old_comic_name
@classmethod @classmethod
def getOldChapter(cls): return cls.old_chapter def getOldChapter(cls): return cls.old_chapter

8
run.py

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from scrapy import cmdline from scrapy import cmdline
cmdline.execute("scrapy crawl rm_comic".split()) cmdline.execute("scrapy crawl rm_comic".split())
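run.py launches the rm_comic spider through scrapy's cmdline helper; an equivalent programmatic entry point would look roughly like this (a sketch, assuming only the spider name shown above):

    # -*- coding: utf-8 -*-
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    if __name__ == "__main__":
        process = CrawlerProcess(get_project_settings())  # loads Comics.settings via scrapy.cfg
        process.crawl("rm_comic")                         # same spider name used by cmdline.execute
        process.start()                                   # blocks until the crawl finishes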


@@ -1,11 +1,11 @@
# Automatically created by: scrapy startproject # Automatically created by: scrapy startproject
# #
# For more information about the [deploy] section see: # For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html # https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings] [settings]
default = Comics.settings default = Comics.settings
[deploy] [deploy]
#url = http://localhost:6800/ #url = http://localhost:6800/
project = Comics project = Comics
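The [settings] entry is what lets helper code resolve Comics.settings at runtime. A minimal sketch of reading project settings that way (IMAGES_STORE and CBZ_EXPORT_PATH are the names used in the utils code above and are assumed to be defined in Comics/settings.py; this must run from inside the project so scrapy.cfg is discoverable):

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()          # resolved through scrapy.cfg -> Comics.settings
    images_store = settings.get("IMAGES_STORE")
    cbz_export = settings.get("CBZ_EXPORT_PATH")
    print(images_store, cbz_export)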