caiwx86 2023-06-20 02:52:51 +08:00
parent af7812794f
commit ac30f59a33
20 changed files with 1167 additions and 1060 deletions

.gitignore
View File

@ -1,5 +1,5 @@
.scrapy/*
images/*
json/*
CBZ/*
.scrapy/*
.vscode/*
CBZ/*
output/*
/**/__pycache__

.idea/.gitignore
View File

@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="stable_vscode" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="stable_vscode" project-jdk-type="Python SDK" />
</project>

View File

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ComicScrapy.iml" filepath="$PROJECT_DIR$/.idea/ComicScrapy.iml" />
</modules>
</component>
</project>

View File

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

View File

@ -1,105 +1,130 @@
import os.path,json,ast
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
from Comics.utils.Constant import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter
class ItemExporter(PythonItemExporter):
def convert(self, data):
if isinstance(data, bytes): return data.decode("utf-8")
if isinstance(data, dict): return dict(map(self.convert, data.items()))
if isinstance(data, tuple): return map(self.convert, data)
if isinstance(data, list): return [self.convert(i) for i in data]
return data
def export_obj(self, obj_item):
self.start_exporting()
obj_item = self.convert(self.export_item(obj_item))
self.finish_exporting()
return obj_item
class ComicInfoXmlItemExporter(XmlItemExporter):
custom_root_element = "ComicInfo"
def __init__(self, comic, chapter):
file_path = os.path.join(COMIC_INFO_XML_STORE, comic,
chapter, f"{self.custom_root_element}.xml")
dir_path = os.path.dirname(file_path)
if not os.path.exists(dir_path): os.makedirs(dir_path)
self.xml_file = open(file_path, "wb")
super(ComicInfoXmlItemExporter, self).__init__(self.xml_file,
root_element=self.custom_root_element,
indent=1,fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)
def serialize_field(self, field, name, value):
# convert Traditional to Simplified Chinese during serialization
value = ComicPath.chinese_convert(value)
return super().serialize_field(field, name, value)
def start_exporting(self):
self.xg.startDocument()
self.xg.startElement(self.custom_root_element, {})
def comic_to_info_item(self, comic_item):
comic_info = {}
info_item = ItemAdapter(ComicInfoItem())
comic_info_dict = {}
for field in info_item.field_names():
meta_info = info_item.get_field_meta(field).get('info')
if meta_info is not None:
comic_info_dict[meta_info] = field
for key, value in ComicItem(comic_item).items():
new_key = comic_info_dict.get(key)
if new_key is not None:
comic_info[new_key] = value
return ItemExporter().export_obj(ComicInfoItem(comic_info))
def export_item(self, item):
comic_info = self.comic_to_info_item(item)
child_element = "Page"
self._beautify_indent(depth=1)
self._beautify_newline()
for name, value in self._get_serialized_fields(comic_info, default_value=""):
if name is "Pages":
value = str(value).split(',')
if value is not None and value != "":
self._export_xml_field(name, value, depth=2, child_element=child_element)
#self._beautify_indent(depth=1)
return comic_info
def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
self._beautify_indent(depth=depth)
self.xg.startElement(name, {})
if hasattr(serialized_value, "items"):
self._beautify_newline()
for sub_name, value in serialized_value.items():
self._export_xml_field(sub_name, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif is_listlike(serialized_value):
self._beautify_newline()
for value in serialized_value:
self._export_xml_field(child_element, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif isinstance(serialized_value, str):
self.xg.characters(serialized_value)
else:
self.xg.characters(str(serialized_value))
self.xg.endElement(name)
self._beautify_newline()
def finish_exporting(self):
self.xg.endElement(self.custom_root_element)
self.xg.endDocument()
self.xml_file.close()
def export_xml(self, item):
self.start_exporting()
comic_info = self.export_item(item)
self.finish_exporting()
return comic_info
import os.path,json,ast
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
from scrapy.exporters import JsonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
from Comics.utils.Constant import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter
class CommonExporter():
def getPath(self, file, sufix=None):
dirname = os.path.dirname(file)
if not os.path.exists(dirname):
os.makedirs(dirname)
# append the suffix only when one was given and it is not already part of the path
if sufix is not None:
sufix = "." + sufix
if sufix not in file:
file = file + sufix
return file
class ItemExporter(PythonItemExporter):
def convert(self, data):
if isinstance(data, bytes): return data.decode("utf-8")
if isinstance(data, dict): return dict(map(self.convert, data.items()))
if isinstance(data, tuple): return map(self.convert, data)
if isinstance(data, list): return [self.convert(i) for i in data]
return data
def export_obj(self, obj_item):
self.start_exporting()
obj_item = self.convert(self.export_item(obj_item))
self.finish_exporting()
return obj_item
class JsonExport(JsonItemExporter):
def __init__(self, file, **kwargs):
file = CommonExporter().getPath(file=file, sufix= "json")
self.file = open(file, "wb")
super(JsonExport, self).__init__(self.file, **kwargs)
def export_json(self, json_object, if_return=False):
self.start_exporting()
self.export_item(json_object)
self.finish_exporting()
self.file.close()
if if_return:
return ItemExporter().export_obj(json_object)
class ComicInfoXmlItemExporter(XmlItemExporter):
custom_root_element = "ComicInfo"
def __init__(self, comic, chapter):
file_path = os.path.join(COMIC_INFO_XML_STORE, comic,
chapter, f"{self.custom_root_element}.xml")
dir_path = os.path.dirname(file_path)
if not os.path.exists(dir_path): os.makedirs(dir_path)
self.xml_file = open(file_path, "wb")
super(ComicInfoXmlItemExporter, self).__init__(self.xml_file,
root_element=self.custom_root_element,
indent=1,fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)
def serialize_field(self, field, name, value):
# convert Traditional to Simplified Chinese during serialization
value = ComicPath.chinese_convert(value)
return super().serialize_field(field, name, value)
def start_exporting(self):
self.xg.startDocument()
self.xg.startElement(self.custom_root_element, {})
def comic_to_info_item(self, comic_item):
comic_info = {}
info_item = ItemAdapter(ComicInfoItem())
comic_info_dict = {}
for field in info_item.field_names():
meta_info = info_item.get_field_meta(field).get('info')
if meta_info is not None:
comic_info_dict[meta_info] = field
for key, value in ComicItem(comic_item).items():
new_key = comic_info_dict.get(key)
if new_key is not None:
comic_info[new_key] = value
return ItemExporter().export_obj(ComicInfoItem(comic_info))
def export_item(self, item):
comic_info = self.comic_to_info_item(item)
child_element = "Page"
self._beautify_indent(depth=1)
self._beautify_newline()
for name, value in self._get_serialized_fields(comic_info, default_value=""):
if name == "Pages":
value = ast.literal_eval(value)
if value is not None and value != "":
self._export_xml_field(name, value, depth=2, child_element=child_element)
#self._beautify_indent(depth=1)
return comic_info
def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
self._beautify_indent(depth=depth)
self.xg.startElement(name, {})
if hasattr(serialized_value, "items"):
self._beautify_newline()
for sub_name, value in serialized_value.items():
self._export_xml_field(sub_name, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif is_listlike(serialized_value):
self._beautify_newline()
for value in serialized_value:
self._export_xml_field(child_element, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif isinstance(serialized_value, str):
self.xg.characters(serialized_value)
else:
self.xg.characters(str(serialized_value))
self.xg.endElement(name)
self._beautify_newline()
def finish_exporting(self):
self.xg.endElement(self.custom_root_element)
self.xg.endDocument()
self.xml_file.close()
def export_xml(self, item):
self.start_exporting()
comic_info = self.export_item(item)
self.finish_exporting()
return comic_info
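A minimal standalone sketch of the bytes-to-str normalization that ItemExporter.convert applies to exported items. The sample payload below is invented, and the dict and tuple branches are written out eagerly rather than with map():

def convert(data):
    # decode raw bytes, then recurse into containers
    if isinstance(data, bytes):
        return data.decode("utf-8")
    if isinstance(data, dict):
        return {convert(k): convert(v) for k, v in data.items()}
    if isinstance(data, (list, tuple)):
        return [convert(i) for i in data]
    return data

sample = {b"name": b"comic", "images": [b"001.jpg", b"002.jpg"], "index": 1}
print(convert(sample))  # {'name': 'comic', 'images': ['001.jpg', '002.jpg'], 'index': 1}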

View File

@ -1,78 +1,151 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from scrapy.loader.processors import TakeFirst, MapCompose, Join
def serialize_to_chinese(value):
return ComicPath.chinese_convert(value)
def serialize_to_fix_file(value):
file = ComicPath.chinese_convert(value)
return ComicPath.fix_file_name(file)
class ComicOItem(Item):
name = Field()
chapterItem = Field()
class ComicItem(Item):
# index number
index = Field(output_processor=TakeFirst())
# comic name
name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
# chapter name
chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
# image links
list_img = Field()
# author
author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
# cover image link
icon = Field(output_processor=TakeFirst())
# tags
tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
# summary
dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
# date
date = Field(output_processor=TakeFirst())
# genre
genre = Field(output_processor=TakeFirst())
# age rating
age_rating = Field(output_processor=TakeFirst())
images = Field()
images_name = Field()
class ImageItem(Item):
image_name = Field()
image_url = Field()
image_path = Field()
def serializer_info_writer(value):
list_value = []
value = str(value).replace("&", " ")
for v in str(value).split(" "):
list_value.append(v)
return ",".join(list_value)
class ComicInfoItem(Item):
Title = Field(info='chapter')# "chapter title", required
Series = Field(info='name')# "comic name", required
Number = Field(info='index')# "number", required
SeriesGroup = Field()# "alternate name", optional
Summary = Field(info='dep')# "summary", required
Year = Field()# "year", optional
Month = Field()# "month", optional
Day = Field()# "day", optional
Writer = Field(info='author',serializer=serializer_info_writer)# "author", required
Publisher = Field()# "publisher", optional
Genre = Field(info='genre')# "genre", required
Tags = Field(info='tags')# "tags", required
Web = Field()# "homepage", optional
PageCount = Field()# "total page count", required
LanguageISO = Field()# "language", required
AgeRating = Field(info='age_rating')# "age rating", optional
Pages = Field(info='images_name')# "page list", required
# ComicInfo.xml and ComicChapter.json end
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import os,Comics.settings as settings,logging
from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from Comics.utils.FileUtils import imageUtils
from scrapy.loader.processors import TakeFirst, MapCompose, Join
def serialize_to_chinese(value):
return ComicPath.chinese_convert(value)
def serialize_to_fix_file(value):
file = ComicPath.chinese_convert(value)
return ComicPath.fix_file_name(file)
def _serialize_to_images(value, result_type=None):
count = 1
images_item = []
image_urls = []
for image in value:
(image_src, scramble) = [image.get("src"), image.get("scramble")]
count_image = settings.IMAGES_NAME_FORMAT.format(count)
suffix = "."+str(image_src).split(".")[-1]
image_name = count_image + suffix
if scramble:
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
blocks_num = imageUtils.encodeImage(de_str)
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
#images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name))
images_item.append(image_name)
image_urls.append(image_src)
count += 1
logging.info(f"images_len: {len(images_item)}")
if result_type == "image_urls": return image_urls
else: return images_item
def serialize_to_images(value): return _serialize_to_images(value)
def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls")
class ListComicItem(Item):
name = Field()
link = Field()
class ComicItem(Item):
# index number
index = Field(output_processor=TakeFirst())
# comic name
name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
# chapter name
chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
# image links
list_img = Field(serializer=serialize_to_images)
# author
author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
# cover image link
icon = Field(output_processor=TakeFirst())
# tags
tags = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
# summary
dep = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
# date
date = Field(output_processor=TakeFirst())
# genre
genre = Field(output_processor=TakeFirst())
# age rating
age_rating = Field(output_processor=TakeFirst())
images_old = Field(serializer=serialize_to_images)
images = Field(serializer=serialize_to_images)
image_urls = Field(serializer=serialize_to_image_urls)
images_name = Field()
class ImagesItem(Item):
image_name = Field()
image_url = Field()
image_path = Field()
images = Field()
image_urls = Field()
comic = Field()
def serializer_info_writer(value):
list_value = []
value = str(value).replace("&", " ")
for v in str(value).split(" "):
list_value.append(v)
return ",".join(list_value)
# Result_type name
def _serializer_info_imagesa(value, result_type=None):
info = []
for success, img in value:
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
if result_type == 'name':
info.append(ComicPath().getFileScrambleImageSave(img_path,True,False))
else:
info.append(img_path)
if result_type == "len":
value = len(info)
else:
value = info
return value
def _serialize_info_images(value, result_type=None):
images = []
for image in value:
images.append(ComicPath().getFileScrambleImageSave(image,True,False))
if result_type == "count":
return len(images)
else:
return images
def serializer_info_images(value): return _serialize_info_images(value)
def serializer_info_images_completed(value):
return _serialize_info_images(value, result_type='name')
def serializer_info_images_count(value):
return _serialize_info_images(value, result_type="count")
class ComicInfoItem(Item):
Title = Field(info='chapter')# "chapter title", required
Series = Field(info='name')# "comic name", required
Number = Field(info='index')# "number", required
SeriesGroup = Field()# "alternate name", optional
Summary = Field(info='dep')# "summary", required
Year = Field()# "year", optional
Month = Field()# "month", optional
Day = Field()# "day", optional
Writer = Field(info='author',serializer=serializer_info_writer)# "author", required
Publisher = Field()# "publisher", optional
Genre = Field(info='genre')# "genre", required
Tags = Field(info='tags')# "tags", required
Web = Field()# "homepage", optional
#PageCount = Field()# "total page count", required
PageCount = Field(info='images',serializer=serializer_info_images_count)# "total page count", required
LanguageISO = Field()# "language", required
AgeRating = Field(info='age_rating')# "age rating", optional
#Pages = Field(info='images_name', serializer=serializer_info_images_completed)# "page list", required
Pages = Field(info='images', serializer=serializer_info_images)# "page list", required
# ComicInfo.xml and ComicChapter.json end
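An illustrative sketch of the Field(info=...) convention that comic_to_info_item in exporters.py relies on: each ComicInfoItem field names the ComicItem key that feeds it, and fields without an info meta are never auto-filled. MiniInfoItem and the scraped dict are invented stand-ins; scrapy and itemadapter are assumed to be installed:

from itemadapter import ItemAdapter
from scrapy.item import Item, Field

class MiniInfoItem(Item):              # trimmed stand-in for ComicInfoItem
    Title = Field(info='chapter')
    Series = Field(info='name')
    Publisher = Field()                # no 'info' meta, so never auto-filled

scraped = {'name': 'Some Comic', 'chapter': 'Ch.01', 'tags': 'action'}
adapter = ItemAdapter(MiniInfoItem())
mapping = {adapter.get_field_meta(f).get('info'): f
           for f in adapter.field_names()
           if adapter.get_field_meta(f).get('info') is not None}
info = {mapping[k]: v for k, v in scraped.items() if k in mapping}
print(info)  # {'Series': 'Some Comic', 'Title': 'Ch.01'}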

View File

@ -1,44 +1,56 @@
import json
from scrapy.loader import ItemLoader
class ComicLoader(ItemLoader):
def parseExec(cls,data,exec):
if data !=None and exec != None:
dots = str(exec).split(".")
if not isinstance(data,dict): data = json.loads(data)
for dot in dots:
data = data.get(dot)
return data
def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, **kw):
"""
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
value, which is used to extract a list of strings from the
selector associated with this :class:`ItemLoader`.
See :meth:`get_xpath` for ``kwargs``.
:param xpath: the XPath to extract data from
:type xpath: str
Examples::
# HTML snippet: <p class="product-name">Color TV</p>
loader.add_xpath('name', '//p[@class="product-name"]')
# HTML snippet: <p id="price">the price is $1200</p>
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
"""
values = self._get_xpathvalues(xpath, **kw)
if exec is not None:
values = self.parseExec(values, exec)
if index is not None:
values = values[index]
self.add_value(field_name, values, *processors, re=re, **kw)
def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
if str_exec is not None:
value = self.parseExec(value, str_exec)
self.add_value(field_name, value, *processors, re=re, **kw)
def get_exec(self, value, str_exec):
return self.parseExec(value, str_exec)
import json
from scrapy.loader import ItemLoader
class ComicLoader(ItemLoader):
def parseExec(cls,data,exec):
if data !=None and exec != None:
dots = str(exec).split(".")
if not isinstance(data,dict): data = json.loads(data)
for dot in dots:
data = data.get(dot)
return data
def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, **kw):
"""
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
value, which is used to extract a list of strings from the
selector associated with this :class:`ItemLoader`.
See :meth:`get_xpath` for ``kwargs``.
:param xpath: the XPath to extract data from
:type xpath: str
Examples::
# HTML snippet: <p class="product-name">Color TV</p>
loader.add_xpath('name', '//p[@class="product-name"]')
# HTML snippet: <p id="price">the price is $1200</p>
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
"""
values = self._get_xpathvalues(xpath, **kw)
if exec is not None:
values = self.parseExec(values, exec)
if index is not None:
values = values[index]
self.add_value(field_name, values, *processors, re=re, **kw)
def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
if str_exec is not None:
value = self.parseExec(value, str_exec)
self.add_value(field_name, value, *processors, re=re, **kw)
def get_exec(self, value, str_exec):
return self.parseExec(value, str_exec)
def add_value(self, field_name, value, *processors, re=None, **kw):
if self.auto_replace_value(field_name, value):
return super().add_value(field_name, value, *processors, re=re, **kw)
def auto_replace_value(self, field_name, value):
if self.get_output_value(field_name) != None:
self._replace_value(field_name, value)
return False
else: return True
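A standalone sketch of the dotted-path lookup that ComicLoader.parseExec performs on JSON such as the page's __NEXT_DATA__ blob; the payload here is invented for illustration:

import json

def parse_exec(data, path):
    # walk a dict (or JSON string) one dotted segment at a time
    if data is None or path is None:
        return data
    if not isinstance(data, dict):
        data = json.loads(data)
    for key in path.split("."):
        data = data.get(key)
    return data

raw = '{"props": {"pageProps": {"chapterName": "Ch.01", "images": [{"src": "a.jpg"}]}}}'
print(parse_exec(raw, "props.pageProps.chapterName"))  # Ch.01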

View File

@ -1,110 +1,110 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random,logging
from pathlib import Path
from Comics.settings import PROXY_LIST
# useful for handling different item types with a single interface
logger = logging.getLogger(__name__)
class ProxyMiddleware(object):
def process_request(self, request, spider):
if len(PROXY_LIST) != 0:
request.meta["proxy"] = random.choice(PROXY_LIST)
class ComicsSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ComicsDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random,logging
from pathlib import Path
from Comics.settings import PROXY_LIST
# useful for handling different item types with a single interface
logger = logging.getLogger(__name__)
class ProxyMiddleware(object):
def process_request(self, request, spider):
if len(PROXY_LIST) != 0:
request.meta["proxy"] = random.choice(PROXY_LIST)
class ComicsSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ComicsDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
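A quick sketch of the random-proxy pattern that ProxyMiddleware implements: pick one entry from PROXY_LIST and attach it to request.meta["proxy"]. FakeRequest is an invented stand-in so the snippet runs without Scrapy:

import random

PROXY_LIST = ["http://127.0.0.1:7890"]       # mirrors PROXY_LIST in settings.py

class FakeRequest:
    def __init__(self):
        self.meta = {}

class ProxyMiddlewareSketch:
    def process_request(self, request, spider):
        # same pattern as ProxyMiddleware above
        if len(PROXY_LIST) != 0:
            request.meta["proxy"] = random.choice(PROXY_LIST)

request = FakeRequest()
ProxyMiddlewareSketch().process_request(request, spider=None)
print(request.meta)  # {'proxy': 'http://127.0.0.1:7890'}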

View File

@ -1,81 +1,63 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import os, scrapy
from Comics import settings
from Comics.utils.FileUtils import imageUtils
from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImageItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.exporters import ItemExporter
from Comics.utils.FileUtils import CBZUtils
class ComicsPipeline:
def open_spider(self, spider):
pass
# item is the object yielded by the spider
def process_item(self, item, spider):
if isinstance(item, ComicItem):
item = ComicItem(ItemExporter().export_obj(item))
file = os.path.join("json", item['name'], item['chapter'])
fileUtils.save_file(f"{file}.json", item)
return item
# image parsing
def close_spider(self,spider):
pass
class ImageParsePipeline:
def process_item(self, item, spider):
if isinstance(item, ComicItem):
count = 1
images_item = []
for image in item['list_img']:
(image_src, scramble) = [image.get("src"), image.get("scramble")]
count_image = "{:0>3d}".format(count)
suffix = "."+str(image_src).split(".")[-1]
image_name = count_image + suffix
if scramble:
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
blocks_num = imageUtils.encodeImage(de_str)
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
image_path = os.path.join(item['name'], item['chapter'], image_name)
images_item.append(ImageItem(image_name=count_image + suffix, image_url=image_src, image_path=image_path))
count += 1
item['images'] = images_item
return item
class ImgDownloadPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
image = request.meta['item']
image_path = image['image_path']
en_image_path = os.path.join(os.path.dirname(image_path), image['image_name'])
if os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)):
return en_image_path
else:
return image_path
def get_media_requests(self, item, info):
for image in item['images']:
yield scrapy.Request(url=image['image_url'], meta={'item': image})
def item_completed(self, results, item, info):
info_img = []
for success, img in results:
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
# descramble the image
img_path = imageUtils.deScrambleImagesByPath(img_path)
info_img.append(os.path.basename(img_path).split('.')[0])
item['images_name'] = ",".join(info_img)
# return item
# generate ComicInfo.xml
ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
# pack into a CBZ archive
CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], remove=False)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import os, scrapy,logging,time,random
from Comics import settings
from Comics.utils.FileUtils import imageUtils
from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImagesItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.exporters import ItemExporter
from Comics.exporters import JsonExport
from Comics.utils.FileUtils import CBZUtils
class ComicsPipeline:
def open_spider(self, spider):
pass
# item is the object yielded by the spider
def process_item(self, item, spider):
if isinstance(item, ComicItem):
file = os.path.join(settings.OUTPUT_DIR,"json", item['name'], item['chapter'])
data = JsonExport(file=file).export_json(item, if_return=True)
#item['images'] = data['images']
return data
# image parsing
def close_spider(self,spider):
pass
class ImgDownloadPipeline(ImagesPipeline):
def file_exits(self, image_path):
en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
return os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path))
def file_full_path(self, item, image): return os.path.join(item['name'], item['chapter'], image)
def file_path(self, request, response=None, info=None, *, item=None): return request.meta['path']
def get_media_requests(self, item, info):
for image_url,image_path in zip(item['image_urls'],item['images']):
image_path = self.file_full_path(item, image_path)
if self.file_exits(image_path):
logging.info(f"file exists: {image_path}")
else:
logging.info(f"downloading {image_url} --> {image_path}")
yield scrapy.Request(url=image_url, meta={'path': image_path})
def item_completed(self, results, item, info):
item['images_name'] = results
# return item
# generate ComicInfo.xml
comic_info = ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
# pack into a CBZ archive
CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'],
comic_info_images= comic_info["Pages"], remove=False)
time.sleep(random.randint(5,10))
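A sketch of the pairing logic in ImgDownloadPipeline.get_media_requests: image URLs and target file names are zipped together and anything already on disk is skipped. The URLs and names below are invented, and the scramble-prefix handling done by ComicPath.getFileScrambleImageSave is omitted for brevity:

import os

IMAGES_STORE = "output/images"               # matches settings.py
item = {"name": "Some Comic", "chapter": "Ch.01",
        "image_urls": ["https://example.com/a.jpg", "https://example.com/b.jpg"],
        "images": ["001.jpg", "scramble=7_002.jpg"]}

for image_url, image_name in zip(item["image_urls"], item["images"]):
    image_path = os.path.join(item["name"], item["chapter"], image_name)
    if os.path.exists(os.path.join(IMAGES_STORE, image_path)):
        print("file exists:", image_path)
    else:
        print("downloading", image_url, "-->", image_path)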

View File

@ -1,130 +1,137 @@
# Scrapy settings for Comics project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from fake_useragent import UserAgent
BOT_NAME = 'Comics'
SPIDER_MODULES = ['Comics.spiders']
NEWSPIDER_MODULE = 'Comics.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
USER_AGENT = UserAgent().random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOWED_CODES = [ 200 , 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
IMAGES_STORE = 'images'
COMIC_INFO_XML_STORE = 'images'
DOWNLOAD_DELAY = 20
# retry settings
RETRY_ENABLED = True
RETRY_TIMES = 10 # set to however many retries you want
# the following line is optional
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
PROXY_LIST = [
"http://127.0.0.1:7890",
]
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Comics.middlewares.ComicsSpiderMiddleware': 543,
# 'Comics.middlewares.ProxyMiddleware' : 100,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
'Comics.middlewares.ProxyMiddleware': 100,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'Comics.pipelines.ComicsPipeline': 300,
'Comics.pipelines.ImageParsePipeline': 400,
'Comics.pipelines.ImgDownloadPipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404, 403, 401]
#HTTPCACHE_STORAGE = 'Comics.middlewares.MyFilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
CBZ_EXPORT_PATH = "CBZ"
# field order for the data exporter
COMIC_INFO_XML_FILE = "ComicInfo.xml"
COMIC_INFO_FIELDS_TO_EXPORT = [
"Title",
"Series",
"Number",
"SeriesGroup",
"Summary",
"Year",
"Month",
"Day",
"Writer",
"Publisher",
"Genre",
"Tags",
"Web",
"PageCount",
"LanguageISO",
"AgeRating",
"Pages"
]
# Scrapy settings for Comics project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from fake_useragent import UserAgent
import os
BOT_NAME = 'Comics'
SPIDER_MODULES = ['Comics.spiders']
NEWSPIDER_MODULE = 'Comics.spiders'
OUTPUT_DIR = "output"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
USER_AGENT = UserAgent().random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOWED_CODES = [ 200 , 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
IMAGES_STORE = os.path.join(OUTPUT_DIR, 'images')
IMAGES_NAME_FORMAT = "{:0>3d}"
COMIC_INFO_XML_STORE = IMAGES_STORE
DOWNLOAD_DELAY = 0
# retry settings
RETRY_ENABLED = True
RETRY_TIMES = 10 # set to however many retries you want
# the following line is optional
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
PROXY_LIST = [
"http://127.0.0.1:7890",
]
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Comics.middlewares.ComicsSpiderMiddleware': 543,
# 'Comics.middlewares.ProxyMiddleware' : 100,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
'Comics.middlewares.ProxyMiddleware': 100,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'scrapy.pipelines.images.ImagesPipeline' : 1,
'Comics.pipelines.ComicsPipeline': 300,
# 'Comics.pipelines.ImageParsePipeline': 400,
'Comics.pipelines.ImgDownloadPipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404]
#HTTPCACHE_STORAGE = 'Comics.middlewares.MyFilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Logging configuration
LOG_LEVEL = "INFO" # log level
LOG_STDOUT = True # redirect standard output to the log
CBZ_EXPORT_PATH = "CBZ"
# field order for the data exporter
COMIC_INFO_XML_FILE = "ComicInfo.xml"
COMIC_INFO_FIELDS_TO_EXPORT = [
"Title",
"Series",
"Number",
"SeriesGroup",
"Summary",
"Year",
"Month",
"Day",
"Writer",
"Publisher",
"Genre",
"Tags",
"Web",
"PageCount",
"LanguageISO",
"AgeRating",
"Pages"
]
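How IMAGES_NAME_FORMAT turns a running counter into the fixed-width file names used for downloaded pages; the counters are arbitrary examples:

IMAGES_NAME_FORMAT = "{:0>3d}"
for count in (1, 27, 135):
    print(IMAGES_NAME_FORMAT.format(count) + ".jpg")  # 001.jpg, 027.jpg, 135.jpg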

View File

@ -1,4 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -1,62 +1,75 @@
import scrapy
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from itemadapter import ItemAdapter
from Comics.items import ComicInfoItem
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
allowed_domains = ['rm01.xyz']
main_url = 'https://rm01.xyz'
#start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
def start_requests(self):
yield scrapy.Request('https://rm01.xyz'
'/books/306ec1e2-f701-4fda-bb78-041ad6ec4020', callback=self.parse_comic)
# fetch the metadata for a single comic
# after collecting the chapter links, move on to the next stage
def parse_comic(self, response):
comic_item = ComicLoader(item=ComicItem(), response=response)
comic_item.add_xpath('name', '//div[@class="col"]/h5/text()')
comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src')
comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
comic_item.add_value('genre', "韩漫")
comic_item.add_value('age_rating', "R18+")
chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
'//div[contains(@class,"bookid_chapter")]/a/@href')
#chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
# '//div[contains(@class,"bookid_chapter")]/a/text()').extract()
#for chapter, link in zip(chapters, chapter_href):
for i, link in enumerate(chapter_href, start=1):
yield scrapy.Request(self.main_url+link, meta={'item': comic_item.load_item(), 'num': i}, callback=self.parse_chapter)
# read all images in a chapter
def parse_chapter(self, response):
comic_item = ComicLoader(item=response.meta['item'], response=response)
data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
str_exec = "props.pageProps."
#comic_item.add_exec('name', data, str_exec=str_exec+"bookName")
#comic_item.add_exec('dep', data, str_exec=str_exec+"description")
comic_item.add_value('index', response.meta['num'])
comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName")
comic_item.add_exec('list_img', data, str_exec+"images")
comic = comic_item.load_item()
chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath")
if chapter_api_url is not None:
yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
else:
yield comic
# handle the encrypted-data API
def parse_chapter_api(self, response):
comic_item = ComicLoader(item=response.meta['item'], response=response)
comic_item.add_exec('chapter', response.text, str_exec='chapter.name')
comic_item.add_exec('list_img', response.text, str_exec='chapter.images')
yield comic_item.load_item()
def parse(self, response):
import scrapy,logging,time
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.items import ListComicItem
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
allowed_domains = ['rm01.xyz']
main_url = 'https://rm01.xyz'
start_urls = 'https://rm01.xyz/books'
def start_requests(self):
yield scrapy.Request(self.start_urls, callback=self.books_comic)
def books_comic(self, response):
books_comic = ComicLoader(item=ListComicItem(), response=response)
data = books_comic.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
str_exec = "props.pageProps.books"
books = books_comic.get_exec(data, str_exec=str_exec)
for book in books:
books_comic.add_value('link', book['id'])
logging.info(f"downloading books %s" % book['name'])
time.sleep(3)
yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
# fetch the metadata for a single comic
# after collecting the chapter links, move on to the next stage
def parse_comic(self, response):
comic_item = ComicLoader(item=ComicItem(), response=response)
comic_item.add_xpath('name', '//div[@class="col"]/h5/text()')
comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src')
comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
comic_item.add_value('genre', "韩漫")
comic_item.add_value('age_rating', "R18+")
chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
'//div[contains(@class,"bookid_chapter")]/a/@href')
#chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
# '//div[contains(@class,"bookid_chapter")]/a/text()').extract()
#for chapter, link in zip(chapters, chapter_href):
for i, link in enumerate(chapter_href, start=1):
yield scrapy.Request(self.main_url+link, meta={'item': comic_item.load_item(), 'num': i}, callback=self.parse_chapter)
# read all images in a chapter
def parse_chapter(self, response):
comic_item = ComicLoader(item=response.meta['item'], response=response)
data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
str_exec = "props.pageProps."
#comic_item.add_exec('name', data, str_exec=str_exec+"bookName")
#comic_item.add_exec('dep', data, str_exec=str_exec+"description")
comic_item.add_value('index', response.meta['num'])
comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName")
comic_item.add_exec('image_urls', data, str_exec+"images")
comic_item.add_exec('images', data, str_exec+"images")
comic = comic_item.load_item()
chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath")
if chapter_api_url is not None:
yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
else:
yield comic
# handle the encrypted-data API
def parse_chapter_api(self, response):
comic_item = ComicLoader(item=response.meta['item'], response=response)
comic_item.add_exec('chapter', response.text, str_exec='chapter.name')
comic_item.add_exec('image_urls', response.text, str_exec='chapter.images')
comic_item.add_exec('images', response.text, str_exec='chapter.images')
yield comic_item.load_item()
def parse(self, response):
raise NotImplementedError
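A sketch of how the spider pulls the Next.js state blob: the page embeds JSON in <script id="__NEXT_DATA__">, which the loader then walks with a dotted path such as props.pageProps.chapterName. The HTML below is an invented miniature of the real page; parsel ships as a Scrapy dependency:

import json
from parsel import Selector

html = '<script id="__NEXT_DATA__">{"props":{"pageProps":{"chapterName":"Ch.01","images":[{"src":"a.jpg","scramble":true}]}}}</script>'
data = json.loads(Selector(text=html).xpath('//script[@id="__NEXT_DATA__"]/text()').get())
page_props = data["props"]["pageProps"]
print(page_props["chapterName"], len(page_props["images"]))  # Ch.01 1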

View File

@ -1,39 +1,48 @@
import os.path
import re
from opencc import OpenCC
class ComicPath:
PREFIX_SCRAMBLE = "scramble="
@classmethod
def getDirComicChapter(cls):
return None
@classmethod
def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
@classmethod
def getFileScrambleImageSave(cls,file): return str(file).split("_")[-1]
# convert Traditional Chinese to Simplified Chinese
@classmethod
def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
# normalize into a valid file name
@classmethod
def fix_file_name(cls, filename, replace=None):
if not isinstance(filename, str):
return filename
in_tab = r'[?*/\|.:><]'
str_replace = ""
if replace is not None:
str_replace = replace
filename = re.sub(in_tab, str_replace, filename)
count = 1
while True:
str_file = filename[0-count]
if str_file == " ":
count += 1
else:
filename = filename[0:len(filename)+1-count]
break
import os.path
import re
from opencc import OpenCC
class ComicPath:
PREFIX_SCRAMBLE = "scramble="
@classmethod
def getDirComicChapter(cls):
return None
@classmethod
def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
@classmethod
def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True):
file_name = str(file).split("_")[-1]
if relative:
file_name = os.path.basename(file_name)
if relative == "fullpath":
file_name = os.path.join(os.path.dirname(file), file_name)
if not is_prefix:
return file_name.split(".")[0]
else:
return file_name
# convert Traditional Chinese to Simplified Chinese
@classmethod
def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
# normalize into a valid file name
@classmethod
def fix_file_name(cls, filename, replace=None):
if not isinstance(filename, str):
return filename
in_tab = r'[?*/\|.:><]'
str_replace = ""
if replace is not None:
str_replace = replace
filename = re.sub(in_tab, str_replace, filename)
count = 1
while True:
str_file = filename[0-count]
if str_file == " ":
count += 1
else:
filename = filename[0:len(filename)+1-count]
break
return filename
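A short sketch of the scrambled-image naming convention implemented above: files are stored as "scramble=<blocks>_<counter><suffix>" and reduced back to "<counter><suffix>" once descrambled. The helpers below mirror getFileScrambleImageName and the simple path through getFileScrambleImageSave:

PREFIX_SCRAMBLE = "scramble="

def scramble_name(count, block, suffix=".jpg"):
    return PREFIX_SCRAMBLE + str(block) + "_" + str(count) + suffix

def saved_name(file):
    # keep only the part after the last underscore, as getFileScrambleImageSave does
    return str(file).split("_")[-1]

name = scramble_name("001", 7)
print(name)              # scramble=7_001.jpg
print(saved_name(name))  # 001.jpg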

View File

@ -1,340 +1,361 @@
import base64,hashlib,os,shutil
import math,time,json,datetime,logging
from PIL import Image
from Comics.utils.Constant import ComicPath
from pathlib import Path
from zipfile import ZipFile
from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
class fileUtils:
@classmethod
def save_file(cls,path,data):
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)
with open(path,'w',encoding='utf-8') as fs:
fs.write(str(data))
fs.close()
class CommonUtils:
@classmethod
def parseExec(cls,data,exec):
if data !=None and exec != None:
dots = str(exec).split(".")
if not isinstance(data,dict): data = json.loads(data)
for dot in dots:
data = data.get(dot)
return data
class imageUtils:
@classmethod
def deScrambleImagesByDir(cls,chapter_dir):
scramble_count = 0
if os.path.exists(chapter_dir): # the chapter's image directory
dirs = os.listdir(chapter_dir)
for img in dirs:
if img.startswith(ComicPath.PREFIX_SCRAMBLE):
imageUtils.encode_scramble_image(os.path.join(chapter_dir,img))
scramble_count += 1
logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}")
return scramble_count
@classmethod
def deScrambleImagesByPath(cls, img_path, img_save=None):
if os.path.basename(img_path).startswith(ComicPath.PREFIX_SCRAMBLE):
img_path = imageUtils.encode_scramble_image(img_path, img_save)
return img_path
@classmethod
def encodeImage(cls,str_en):
#print("en",str_en)
enc = base64.b64decode(str_en)
#print("解密:",enc)
m = hashlib.md5()
m.update(enc)
md5 = m.digest()
d = md5[-1]
#print(md5)
try:
blocks = d % 10 + 5
except:
blocks = 0 %10 + 5
#print("blocks=",blocks)
return blocks
@classmethod
def scrambleImage(cls,file_path):
# an unfinished download was detected, return None immediately
if str(file_path).endswith(".downloads"):
os.remove(file_path)
return None
file_str = str(file_path).split("=")
#10_29.jpg
base_dir = file_str[0].replace("scramble","")
base_name = file_str[-1]
base_fn = base_name.split("_")
save_name = base_fn[1]
save_name_delesu = save_name.split(".")[0]
blocks = int(base_fn[0])
save_file_path = os.path.join(base_dir,save_name)
print("sva",save_file_path)
if os.path.exists(save_file_path):
print("图片已解密,已跳过:", save_file_path)
return None
image_su = str(file_path).split(".")[-1]
try:
img = Image.open(file_path)
except:
print(f"error Image: {file_path}")
width = img.width
height = img.height
#blocks = cls.encodeImage(enStr)
print("blocks=",blocks)
block_height = int(height / blocks)
block_width = int(width / blocks)
print("blockHeight=",block_height)
suffix = str(file_path).split(".")[-1]
split_path = os.path.join(base_dir,save_name_delesu+"split")
if image_su == "downloads":
return None
is_split = cls.splitimage(file_path,blocks,1,split_path)
if is_split != None:
cls.image_compose(split_path,blocks,1,save_file_path,block_height,width)
else:
if os.path.exists(split_path):
shutil.rmtree(split_path)
if os.path.exists(file_path):
shutil.move(file_path, save_file_path)
# clean up when finished
return file_path
@classmethod
def splitimage(cls,src,rownum,colnum,dstpath):
img=Image.open(src)
w,h=img.size
if rownum<= h and colnum<=w:
s=os.path.split(src)
if dstpath=='':
dstpath = s[0]
if not os.path.exists(dstpath):
os.makedirs(dstpath)
fn=s[1].split('.')
basename=fn[0]
ext=fn[-1]
num=0
rowheight=h//rownum
colwidth=w//colnum
for r in range(rownum):
for c in range(colnum):
box=(c*colwidth,r*rowheight,(c+1)*colwidth,(r+1)*rowheight)
count_image = "{:0>3d}".format(num)
file_path = os.path.join(dstpath,str(count_image)+'.'+ext)
print("file_path=",file_path)
img.crop(box).save(file_path)
num=num+1
return "成功"
else:
print('不数!')
return None
@classmethod
def image_compose(cls,src,row,column,save_path,image_height,image_width):
image_size = image_height
#image_height = 376
#image_width = 720
images_format = ['.png','.jpg']
#image_names = [name for name in os.listdir(src) for item in images_format if
# os.path.splitext(name)[1] == item][::-1]
img_list=os.listdir(src)
img_list.sort()
img_list.sort(key=lambda x: int(x[:-4]))
## sort file names numerically
img_nums=len(img_list)
image_names = []
for i in range(img_nums):
img_name=os.path.join(src,img_list[i])
image_names.append(img_name)
# use reverse order
image_names = image_names[::-1]
# simple sanity check that the row/column parameters match the number of images
if len(image_names) < row * column:
raise ValueError("合成图片的参数和要求的数量不能匹配!")
to_image = Image.new('RGB', (column * image_width, row * image_height)) # create a new blank canvas
# iterate over the tiles and paste each one into its position in order
for y in range(1, row + 1):
for x in range(1, column + 1):
#1 * (row=1 -1) col=1 -1
image_path = image_names[column * (y - 1) + x - 1]
print("split_image=",image_path)
from_image = Image.open(image_path)
# keep the original tile size
#.resize(
# (image_size, image_size),Image.ANTIALIAS)
to_image.paste(from_image, ((x - 1) * image_size, (y - 1) * image_size))
from_image.close()
to_image.save(save_path)
print("图片合并完成:", save_path)
shutil.rmtree(src)
# save the composed image
@classmethod
def getScrambleImage(cls,path):
scramble_file_cache = cls.scrambleImage(path)
if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
@classmethod
def encode_scramble_image(cls,imgpath,img_save=None):
image = Image.open(imgpath)
w, h = image.size
#image.show()
file_str = str(imgpath).split("=")
#10_29.jpg
base_fn = file_str[-1].split("_")
blocks = int(base_fn[0])
if img_save == None:
save_path = os.path.join(os.path.dirname(imgpath),ComicPath.getFileScrambleImageSave(imgpath))
else: save_path = img_save
# print(type(aid),type(img_name))
if blocks:
s = blocks # random value
# print(s)
l = h % s # leftover rows after splitting
box_list = []
hz = 0
for i in range(s):
c = math.floor(h / s)
g = i * c
hz += c
h2 = h - c * (i + 1) - l
if i == 0:
c += l;hz += l
else:
g += l
box_list.append((0, h2, w, h - g))
# print(box_list,len(box_list))
item_width = w
# box_list.reverse() # reversing the list restores the original slice order
# print(box_list, len(box_list))
newh = 0
image_list = [image.crop(box) for box in box_list]
# print(box_list)
newimage = Image.new("RGB", (w, h))
for image in image_list:
# image.show()
b_w, b_h = image.size
newimage.paste(image, (0, newh))
newh += b_h
newimage.save(save_path)
print("解密成功=",save_path)
if os.path.exists(imgpath):
os.remove(imgpath)
print("remove=",imgpath)
return save_path
class CBZUtils:
@classmethod
def readDirsOrFiles(cls, dir, type):
data = []
files = os.listdir(dir)
for file in files:
path = os.path.join(dir, file)
if type == "files" and os.path.isfile(path):
data.append(path)
if type == "dirs" and os.path.isdir(path):
data.append(path)
return data
@classmethod
def zip_compression(cls, source_dir=None, target_file=None, remove=True):
target_dir = os.path.dirname(target_file)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
if not os.path.exists(target_file) and source_dir is not None:
with ZipFile(target_file, mode='w') as zf:
for path, dir_names, filenames in os.walk(source_dir):
path = Path(path)
arc_dir = path.relative_to(source_dir)
y = 0
for filename in filenames:
y = y + 1
print("打包中:" + str(y) + "/" + str(len(filenames)), os.path.join(source_dir, filename))
zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
zf.close()
logging.info(f"打包完成:{target_file}")
@classmethod
def packComicChapterCBZ(cls, comic, chapter, remove=True):
images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
if os.path.exists(images_chapter_path):
dirs = os.listdir(images_chapter_path)
for file in dirs:
if file.startswith(ComicPath.PREFIX_SCRAMBLE):
try:
os.remove(file)
except Exception as e:
print(f"删除 {file} 发生错误 {e},已跳过")
return False
cls.zip_compression(images_chapter_path, cbz_chapter_path)
time.sleep(0.1)
if remove: shutil.rmtree(images_chapter_path)
return True
@classmethod
def replaceZip(cls, filepath, unpack_dir=None):
if not cls.compareFileDate(filepath): return None
if unpack_dir == None:
unpack_dir = str(filepath).split(".")[0]
fz = ZipFile(filepath, 'r')
for file in fz.namelist():
if file.endswith(".jpg"):
data = fz.read(file)
if len(data) < 500 and os.path.exists(filepath):
os.remove(filepath)
print(f"数据不完整,已删除:{filepath}")
if cls.compareFileDate(filepath):
os.utime(filepath)
print(f"已更新文件时间 {filepath}")
if os.path.exists(unpack_dir):
shutil.rmtree(unpack_dir)
# delete the main.ftl file
# delete_filename = ''
# if os.path.exists(delete_filename):
# os.remove(delete_filename)
# time.sleep(60)
# shutil.copy(<file path>, <other directory>); copy main.ftl into the directory to be compressed
# cls.zip_compression()
# run only when the file time is earlier than the cutoff
@classmethod
def compareFileDate(cls, filepath):
if os.path.exists(filepath):
ctime = os.path.getmtime(filepath)
str_ctime = datetime.fromtimestamp(int(ctime))
file_ctime = str(str_ctime.year) + "{:0>2d}".format(str_ctime.month) + "{:0>2d}".format(
str_ctime.day) + "{:0>2d}".format(str_ctime.hour)
c_ctime = 2023011603
else:
return False
if int(file_ctime) < c_ctime:
return True
return False
@classmethod
def zip_info(cls, path, filter=True):
result = None
try:
with ZipFile(path, "r") as zip_file:
result = zip_file.namelist()
if filter:
result.remove(COMIC_INFO_XML_FILE)
except Exception as e:
print(e)
return result
import base64, hashlib, os, shutil
import math, time, json, logging
from datetime import datetime
from PIL import Image
from Comics.utils.Constant import ComicPath
from pathlib import Path
from zipfile import ZipFile
from Comics.settings import COMIC_INFO_XML_FILE, CBZ_EXPORT_PATH, IMAGES_STORE
class fileUtils:
    @classmethod
    def save_file(cls, path, data):
        # Write data as UTF-8 text, creating the parent directory when necessary.
        base_dir = os.path.dirname(path)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        with open(path, 'w', encoding='utf-8') as fs:
            fs.write(str(data))
    @classmethod
    def path(cls, file):
        # Ensure the parent directory of the given file path exists, then return the path.
        base_dir = os.path.dirname(file)
        if not os.path.exists(base_dir): os.makedirs(base_dir)
        return file
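# Illustrative usage sketch of fileUtils; the "output/demo.txt" path and the payload
# below are hypothetical and not part of the original module.
def _demo_file_utils():
    target = fileUtils.path(os.path.join("output", "demo.txt"))  # ensures output/ exists
    fileUtils.save_file(target, {"title": "demo", "pages": 3})   # stored via str(...)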
class CommonUtils:
    @classmethod
    def parseExec(cls, data, expr):
        # Walk a dotted path such as "a.b.c" through a dict (or a JSON string).
        if data is not None and expr is not None:
            if not isinstance(data, dict): data = json.loads(data)
            for dot in str(expr).split("."):
                if data is None: break
                data = data.get(dot)
            return data
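# Illustrative usage sketch of CommonUtils.parseExec; the JSON payload and the dotted
# path "data.comic.title" are hypothetical, not taken from any real site response.
def _demo_parse_exec():
    payload = '{"data": {"comic": {"title": "Example", "chapters": 12}}}'
    assert CommonUtils.parseExec(payload, "data.comic.title") == "Example"
    assert CommonUtils.parseExec(payload, "data.comic.missing") is None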
class imageUtils:
    @classmethod
    def descramble_images_by_dir(cls, chapter_dir):
        # Descramble every scrambled image inside a chapter directory.
        if os.path.isfile(chapter_dir):
            chapter_dir = os.path.dirname(chapter_dir)
        scramble_count = 0
        if os.path.exists(chapter_dir):
            # Loop until no file name carries the scramble prefix any more.
            while any(f.startswith(ComicPath.PREFIX_SCRAMBLE) for f in os.listdir(chapter_dir)):
                for img in os.listdir(chapter_dir):
                    if img.startswith(ComicPath.PREFIX_SCRAMBLE):
                        cls.encode_scramble_image(os.path.join(chapter_dir, img))
                        scramble_count += 1
        logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}")
        return scramble_count
@classmethod
def deScrambleImagesByPath(cls, img_path, img_save=None):
if os.path.basename(img_path).\
startswith(ComicPath.PREFIX_SCRAMBLE) and os.path.exists(img_path):
img_path = imageUtils.encode_scramble_image(img_path, img_save)
return img_path
    @classmethod
    def encodeImage(cls, str_en):
        # Derive the scramble block count from the md5 of the base64-decoded token:
        # the last digest byte modulo 10, shifted into the range 5..14.
        enc = base64.b64decode(str_en)
        md5 = hashlib.md5(enc).digest()
        try:
            blocks = md5[-1] % 10 + 5
        except TypeError:
            blocks = 5
        return blocks
    @classmethod
    def scrambleImage(cls, file_path):
        # Restore a scrambled image whose file name encodes the block count, e.g. "...=10_29.jpg".
        # Partially downloaded files are discarded immediately.
        if str(file_path).endswith(".downloads"):
            os.remove(file_path)
            return None
        file_str = str(file_path).split("=")
        base_dir = file_str[0].replace("scramble", "")
        base_fn = file_str[-1].split("_")       # e.g. ["10", "29.jpg"]
        blocks = int(base_fn[0])                # number of horizontal strips
        save_name = base_fn[1]                  # target file name, e.g. "29.jpg"
        save_stem = save_name.split(".")[0]
        save_file_path = os.path.join(base_dir, save_name)
        if os.path.exists(save_file_path):
            print("already descrambled, skipped:", save_file_path)
            return None
        try:
            img = Image.open(file_path)
        except Exception as e:
            print(f"error opening image {file_path}: {e}")
            return None
        width, height = img.size
        block_height = int(height / blocks)
        split_path = os.path.join(base_dir, save_stem + "split")
        # Cut the image into `blocks` strips, then paste them back in reverse order.
        is_split = cls.splitimage(file_path, blocks, 1, split_path)
        if is_split is not None:
            cls.image_compose(split_path, blocks, 1, save_file_path, block_height, width)
        else:
            if os.path.exists(split_path):
                shutil.rmtree(split_path)
            if os.path.exists(file_path):
                shutil.move(file_path, save_file_path)
        return file_path
    @classmethod
    def splitimage(cls, src, rownum, colnum, dstpath):
        # Split src into rownum x colnum tiles saved as 000.ext, 001.ext, ...
        img = Image.open(src)
        w, h = img.size
        if rownum <= h and colnum <= w:
            s = os.path.split(src)
            if dstpath == '':
                dstpath = s[0]
            if not os.path.exists(dstpath):
                os.makedirs(dstpath)
            ext = s[1].split('.')[-1]
            rowheight = h // rownum
            colwidth = w // colnum
            num = 0
            for r in range(rownum):
                for c in range(colnum):
                    box = (c * colwidth, r * rowheight, (c + 1) * colwidth, (r + 1) * rowheight)
                    file_path = os.path.join(dstpath, "{:0>3d}.{}".format(num, ext))
                    img.crop(box).save(file_path)
                    num += 1
            return True
        print("splitimage: the row/column count exceeds the image size, skipped")
        return None
    @classmethod
    def image_compose(cls, src, row, column, save_path, image_height, image_width):
        # Stitch the tiles produced by splitimage() back into one image.
        # The tile list is pasted in reverse order, which undoes the scrambling.
        img_list = os.listdir(src)
        img_list.sort(key=lambda x: int(x[:-4]))  # file names are zero-padded numbers
        image_names = [os.path.join(src, name) for name in img_list][::-1]
        if len(image_names) < row * column:
            raise ValueError("not enough tiles for the requested row/column layout")
        to_image = Image.new('RGB', (column * image_width, row * image_height))
        for y in range(1, row + 1):
            for x in range(1, column + 1):
                from_image = Image.open(image_names[column * (y - 1) + x - 1])
                to_image.paste(from_image, ((x - 1) * image_width, (y - 1) * image_height))
                from_image.close()
        to_image.save(save_path)
        print("image merged:", save_path)
        shutil.rmtree(src)  # remove the temporary tile directory
    @classmethod
    def getScrambleImage(cls, path):
        # Descramble via scrambleImage() and drop the leftover scrambled source file.
        scramble_file_cache = cls.scrambleImage(path)
        if scramble_file_cache is not None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
    @classmethod
    def encode_scramble_image(cls, img_path, img_save=None):
        # Rebuild a scrambled page whose file name encodes the block count ("...=10_29.jpg"):
        # the image is cut into `blocks` horizontal strips and pasted back in reverse order.
        if not os.path.exists(img_path):
            return
        image = Image.open(img_path)
        w, h = image.size
        base_fn = str(img_path).split("=")[-1].split("_")
        blocks = int(base_fn[0])
        if img_save is None:
            save_path = os.path.join(os.path.dirname(img_path), ComicPath.getFileScrambleImageSave(img_path))
        else:
            save_path = img_save
        if blocks:
            s = blocks                 # number of strips
            l = h % s                  # leftover rows that do not divide evenly
            box_list = []
            for i in range(s):
                c = math.floor(h / s)  # strip height
                g = i * c
                h2 = h - c * (i + 1) - l
                if i == 0:
                    c += l             # the first strip absorbs the leftover rows
                else:
                    g += l
                box_list.append((0, h2, w, h - g))
            newimage = Image.new("RGB", (w, h))
            newh = 0
            for piece in (image.crop(box) for box in box_list):
                b_w, b_h = piece.size
                newimage.paste(piece, (0, newh))
                newh += b_h
            newimage.save(save_path)
            logging.info(f"descrambled: {save_path}")
        if os.path.exists(img_path):
            os.remove(img_path)
            logging.debug(f"removed {img_path}")
        return save_path
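# Illustrative usage sketch of the descrambling helpers: scrambled pages are assumed to
# follow the "<PREFIX_SCRAMBLE>...=<blocks>_<page>.jpg" naming that the methods above parse;
# the comic/chapter names below are hypothetical.
def _demo_descramble_chapter():
    chapter_dir = os.path.join(IMAGES_STORE, "SomeComic", "chapter_001")
    fixed = imageUtils.descramble_images_by_dir(chapter_dir)  # rewrites scrambled pages in place
    logging.info(f"descrambled {fixed} images in {chapter_dir}")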
class CBZUtils:
@classmethod
def readDirsOrFiles(cls, dir, type):
data = []
files = os.listdir(dir)
for file in files:
path = os.path.join(dir, file)
if type == "files" and os.path.isfile(path):
data.append(path)
if type == "dirs" and os.path.isdir(path):
data.append(path)
return data
    @classmethod
    def zip_compression(cls, source_dir=None, target_file=None, remove=True):
        # Zip source_dir into target_file, preserving the directory layout.
        target_dir = os.path.dirname(target_file)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        if not os.path.exists(target_file) and source_dir is not None:
            with ZipFile(target_file, mode='w') as zf:
                for path, dir_names, filenames in os.walk(source_dir):
                    path = Path(path)
                    arc_dir = path.relative_to(source_dir)
                    for y, filename in enumerate(filenames, start=1):
                        print(f"packing {y}/{len(filenames)}:", path.joinpath(filename))
                        zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
        logging.info(f"archive created: {target_file}")
    @classmethod
    def packComicChapterCBZ(cls, comic, chapter, comic_info_images, remove=True):
        # Descramble any remaining pages, zip the chapter into a .CBZ and validate it.
        images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
        cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
        if os.path.exists(images_chapter_path):
            for file in os.listdir(images_chapter_path):
                if file.startswith(ComicPath.PREFIX_SCRAMBLE):
                    try:
                        imageUtils.deScrambleImagesByPath(os.path.join(images_chapter_path, file))
                    except Exception as e:
                        print(f"failed to descramble {file}: {e}, skipped")
                        return False
            cls.zip_compression(images_chapter_path, cbz_chapter_path)
            time.sleep(0.1)
            if remove: shutil.rmtree(images_chapter_path)
            # validate the page count against the ComicInfo image list
            cls.cbz_validate(cbz_chapter_path, comic_info_images)
        return True
    @classmethod
    def replaceZip(cls, filepath, unpack_dir=None):
        # Re-check archives older than the threshold in compareFileDate():
        # drop archives containing truncated JPEGs, otherwise bump their mtime.
        if not cls.compareFileDate(filepath): return None
        if unpack_dir is None:
            unpack_dir = str(filepath).split(".")[0]
        incomplete = False
        with ZipFile(filepath, 'r') as fz:
            for file in fz.namelist():
                if file.endswith(".jpg") and len(fz.read(file)) < 500:
                    incomplete = True
                    break
        if incomplete and os.path.exists(filepath):
            os.remove(filepath)
            print(f"archive incomplete, removed: {filepath}")
        if cls.compareFileDate(filepath):
            os.utime(filepath)
            print(f"updated file mtime: {filepath}")
        if os.path.exists(unpack_dir):
            shutil.rmtree(unpack_dir)
        # Leftover note: optionally copy a replacement file into the unpacked
        # directory and call cls.zip_compression() again before removing it.
    @classmethod
    def compareFileDate(cls, filepath):
        # Return True only for files whose mtime is older than the 2023-01-16 03:00 threshold.
        if not os.path.exists(filepath):
            return False
        mtime = datetime.fromtimestamp(int(os.path.getmtime(filepath)))
        file_stamp = int(mtime.strftime("%Y%m%d%H"))
        return file_stamp < 2023011603
    @classmethod
    def zip_info(cls, path, filter=True):
        # List the archive entries, optionally hiding the ComicInfo.xml metadata file.
        result = None
        try:
            with ZipFile(path, "r") as zip_file:
                result = zip_file.namelist()
            if filter and COMIC_INFO_XML_FILE in result:
                result.remove(COMIC_INFO_XML_FILE)
        except Exception as e:
            logging.error(f"zip_info failed for {path}: {e}")
        return result
    @classmethod
    def cbz_validate(cls, zip_path, comic_info_images):
        # A chapter archive is valid when its page count matches the ComicInfo image list.
        pages = cls.zip_info(zip_path)
        if pages is not None and len(pages) == len(comic_info_images):
            logging.info(f"validation succeeded === {zip_path}")
        else:
            if os.path.exists(zip_path):
                os.remove(zip_path)
            logging.error(f"validation failed === {zip_path}")

View File

@ -1,16 +1,16 @@
class OldUtils:
old_comic_name=None
old_chapter = None
@classmethod
def setOldComicName(cls,value): cls.old_comic_name = value
@classmethod
def setOldChapter(cls,value): cls.old_chapter=value
@classmethod
def getOldComicName(cls): return cls.old_comic_name
@classmethod
def getOldChapter(cls): return cls.old_chapter

8
run.py
View File

@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from scrapy import cmdline
cmdline.execute("scrapy crawl rm_comic".split())

View File

@ -1,11 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = Comics.settings
[deploy]
#url = http://localhost:6800/
project = Comics