This commit is contained in:
caiwx86 2023-05-21 03:10:51 +08:00
parent 5884f1e92c
commit 8f55a51140
16 changed files with 489 additions and 492 deletions

.idea/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

.idea/ComicScrapy.iml Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="stable_vscode" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

.idea/inspectionProfiles/profiles_settings.xml Normal file

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

.idea/misc.xml Normal file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="stable_vscode" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ComicScrapy.iml" filepath="$PROJECT_DIR$/.idea/ComicScrapy.iml" />
</modules>
</component>
</project>

.idea/vcs.xml Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

Comics/exporters.py Normal file

@@ -0,0 +1,98 @@
import os.path
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
from Comics.utils.Constant import ComicPath
from scrapy.utils.python import is_listlike
class ItemExporter(PythonItemExporter):
    def convert(self, data):
        # recursively decode bytes and normalise nested containers
        if isinstance(data, bytes): return data.decode("utf-8")
        if isinstance(data, dict): return {self.convert(k): self.convert(v) for k, v in data.items()}
        if isinstance(data, tuple): return tuple(self.convert(i) for i in data)
        if isinstance(data, list): return [self.convert(i) for i in data]
        return data
def export_obj(self, obj_item):
self.start_exporting()
obj_item = self.convert(self.export_item(obj_item))
self.finish_exporting()
return obj_item
class ComicInfoXmlItemExporter(XmlItemExporter):
custom_root_element = "ComicInfo"
def __init__(self, comic, chapter):
file_path = os.path.join(COMIC_INFO_XML_STORE, comic,
chapter, f"{self.custom_root_element}.xml")
dir_path = os.path.dirname(file_path)
if not os.path.exists(dir_path): os.makedirs(dir_path)
self.xml_file = open(file_path, "wb")
super(ComicInfoXmlItemExporter, self).__init__(self.xml_file,
root_element=self.custom_root_element,
indent=1,fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)
def serialize_field(self, field, name, value):
        # convert Traditional Chinese to Simplified during serialization
value = ComicPath.chinese_convert(value)
return super().serialize_field(field, name, value)
def start_exporting(self):
self.xg.startDocument()
self.xg.startElement(self.custom_root_element, {})
def comic_to_info_item(self, comic_item):
comic_info = {}
        comic_info_dict = getattr(ComicItem, "data", {})
for key, value in ComicItem(comic_item).items():
new_key = comic_info_dict.get(key)
if new_key is not None:
comic_info[new_key] = value
return ItemExporter().export_obj(ComicInfoItem(comic_info))
def export_item(self, item):
comic_info = self.comic_to_info_item(item)
child_element = "Page"
self._beautify_indent(depth=1)
self._beautify_newline()
for name, value in self._get_serialized_fields(comic_info, default_value=""):
            if name == "Pages":
                value = str(value).split(',')
            if value is not None and value != "":
                self._export_xml_field(name, value, depth=2, child_element=child_element)
self._beautify_indent(depth=1)
return comic_info
def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
self._beautify_indent(depth=depth)
self.xg.startElement(name, {})
if hasattr(serialized_value, "items"):
self._beautify_newline()
for subname, value in serialized_value.items():
self._export_xml_field(subname, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif is_listlike(serialized_value):
self._beautify_newline()
for value in serialized_value:
self._export_xml_field(child_element, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif isinstance(serialized_value, str):
self.xg.characters(serialized_value)
else:
self.xg.characters(str(serialized_value))
self.xg.endElement(name)
self._beautify_newline()
def finish_exporting(self):
self.xg.endElement(self.custom_root_element)
self.xg.endDocument()
self.xml_file.close()
def export_xml(self, item):
self.start_exporting()
comic_info = self.export_item(item)
self.finish_exporting()
return comic_info
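For orientation, here is a minimal usage sketch of this exporter. The item values are invented placeholders; the output location follows COMIC_INFO_XML_STORE from Comics/settings.py:

    # hypothetical usage sketch, not part of this commit
    from Comics.items import ComicItem
    from Comics.exporters import ComicInfoXmlItemExporter

    item = ComicItem(name="SomeComic", chapter="ch-001", author="someone",
                     tags="tag", dep="summary", genre="genre", index=1,
                     images_name="001,002,003", age_rating="R18+")
    # writes images/SomeComic/ch-001/ComicInfo.xml with one <Page> per name in Pages
    ComicInfoXmlItemExporter(comic=item["name"], chapter=item["chapter"]).export_xml(item)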

Comics/items.py

@@ -1,52 +1,93 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from scrapy.loader.processors import TakeFirst
class ComicsItem(scrapy.Item):
    # comic title
    name = scrapy.Field()
    # link
    link = scrapy.Field()
def setinfo(**kwds):
    # class decorator: attach the given ComicItem-field -> ComicInfo-node
    # mapping to the decorated class as its `data` attribute
    def decorate(f):
        setattr(f, "data", dict(kwds))
        return f
    return decorate
class ComicItem(scrapy.Item):
    name = scrapy.Field()
    chapter = scrapy.Field()
    list_img = scrapy.Field()
    author = scrapy.Field()
    icon = scrapy.Field()
    tags = scrapy.Field()
    dep = scrapy.Field()
    date = scrapy.Field()
    chapters = scrapy.Field()
    chapter_href = scrapy.Field()
    genre = scrapy.Field()
    age_rating = scrapy.Field()
def serialize_to_chinese(value):
return ComicPath.chinese_convert(value)
class ImageItem(scrapy.Item):
image_name = scrapy.Field()
image_url = scrapy.Field()
image_path = scrapy.Field()
def serialize_to_fix_file(value):
file = ComicPath.chinese_convert(value)
return ComicPath.fix_file_name(file)
class ComicInfoItem(scrapy.Item):
    Title = scrapy.Field()        # chapter title, required
    Series = scrapy.Field()       # comic title, required
    Number = scrapy.Field()       # number, required
    SeriesGroup = scrapy.Field()  # aliases, optional
    Summary = scrapy.Field()      # summary, required
    Year = scrapy.Field()         # year, optional
    Month = scrapy.Field()        # month, optional
    Day = scrapy.Field()          # day, optional
    Writer = scrapy.Field()       # author, required
    Publisher = scrapy.Field()    # publisher, optional
    Genre = scrapy.Field()        # genre, required
    Tags = scrapy.Field()         # tags, required
    Web = scrapy.Field()          # homepage, optional
    PageCount = scrapy.Field()    # total page count, required
    LanguageISO = scrapy.Field()  # language, required
    AgeRating = scrapy.Field()    # age rating, optional
    Pages = scrapy.Field()        # page list, required
# ComicInfo.xml and ComicChapter.json end
class ComicOItem(Item):
name = Field()
chapterItem = Field()
@setinfo(name="Series", chapter="Title",
author="Writer", tags="Tags",
dep="Summary", genre="Genre",
index="Number", images_name="Pages",
age_rating="AgeRating")
class ComicItem(Item):
    # number
    index = Field()
    # comic title
    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
    # chapter title
    chapter = Field(serializer=serialize_to_fix_file)
    # image links
    list_img = Field()
    # author
    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
    # cover link
    icon = Field()
    # tags
    tags = Field(serializer=serialize_to_chinese)
    # summary
    dep = Field(serializer=serialize_to_chinese)
    # date
    date = Field()
    # genre
    genre = Field()
    # age rating
    age_rating = Field()
    images = Field()
    images_name = Field()
class ImageItem(Item):
image_name = Field()
image_url = Field()
image_path = Field()
def serializer_info_writer(value):
    # normalise an author string: treat "&" as a separator and join the names with commas
    return ",".join(str(value).replace("&", " ").split())
class ComicInfoItem(Item):
    Title = Field()                                     # chapter title, required
    Series = Field()                                    # comic title, required
    Number = Field()                                    # number, required
    SeriesGroup = Field()                               # aliases, optional
    Summary = Field()                                   # summary, required
    Year = Field()                                      # year, optional
    Month = Field()                                     # month, optional
    Day = Field()                                       # day, optional
    Writer = Field(serializer=serializer_info_writer)   # author, required
    Publisher = Field()                                 # publisher, optional
    Genre = Field()                                     # genre, required
    Tags = Field()                                      # tags, required
    Web = Field()                                       # homepage, optional
    PageCount = Field()                                 # total page count, required
    LanguageISO = Field()                               # language, required
    AgeRating = Field()                                 # age rating, optional
    Pages = Field()                                     # page list, required
    Page = Field()
# ComicInfo.xml and ComicChapter.json end
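To make the decorator's contract concrete, a small illustration of how the @setinfo mapping is consumed (this mirrors comic_to_info_item in Comics/exporters.py; the values are invented):

    # hypothetical illustration, not part of this commit
    mapping = getattr(ComicItem, "data", {})  # {"name": "Series", "chapter": "Title", ...}
    comic = ComicItem(name="SomeComic", chapter="ch-001")
    info = {mapping[k]: v for k, v in comic.items() if k in mapping}
    # info == {"Series": "SomeComic", "Title": "ch-001"}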

Comics/pipelines.py

@@ -8,93 +8,74 @@
import os,requests,re,scrapy,logging
from Comics import settings
from Comics.utils.FileUtils import imageUtils
from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImageItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import XmlItemExporter
from itemadapter import ItemAdapter
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.exporters import ItemExporter
from Comics.utils.CBZUtils import CBZUtils
class ComicsPipeline:
def open_spider(self,spider):
self.fp = open('book.json','w',encoding='utf-8')
def open_spider(self, spider):
pass
    # `item` is the object yielded by the spider
def process_item(self, item, spider):
self.fp.write(str(item))
if isinstance(item, ComicItem):
item = ComicItem(ItemExporter().export_obj(item))
file = os.path.join("json",item['name'],item['chapter'])
fileUtils.save_file(f"{file}.json",item)
return item
    # image parsing
def close_spider(self,spider):
self.fp.close()
pass
class ImageParsePipeline:
def process_item(self, item, spider):
if isinstance(item, ComicItem):
list_img = item['list_img']
count = 1
scramble_count = 0
list_image_item = []
for image in list_img:
images_item = []
for image in item['list_img']:
(image_src,scramble) = [image.get("src"),image.get("scramble")]
count_image = "{:0>3d}".format(count)
image_src_suffix = "."+str(image_src).split(".")[-1]
image_file_name = count_image+image_src_suffix
suffix = "."+str(image_src).split(".")[-1]
image_name = count_image + suffix
if scramble:
de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
de_str = str(image_src).split("/")[-1].replace(suffix,"==")
blocks_num = imageUtils.encodeImage(de_str)
scramble_image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
scramble_count += 1
image_path = os.path.join(item['name'],item['chapter'],scramble_image_file_name)
image_path = ComicPath.ChineseConvert(image_path)
list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_path))
count+=1
return list_image_item
image_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=suffix)
image_path = os.path.join(item['name'],item['chapter'], image_name)
images_item.append(ImageItem(image_name=count_image + suffix,image_url=image_src,image_path=image_path))
count += 1
item['images'] = images_item
return item
class ImgDownloadPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
image = request.meta['item']
image_path = image['image_path']
en_image_path = os.path.join(os.path.dirname(image_path),image['image_name'])
if os.path.exists(en_image_path): return en_image_path
else: return image_path
en_image_path = os.path.join(os.path.dirname(image_path), image['image_name'])
if os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)):
return en_image_path
else:
return image_path
def get_media_requests(self, item, info):
for image in item:
host = re.sub(r'(http://|https://)', '', image['image_url']).split('/')[0]
for image in item['images']:
            yield scrapy.Request(url=image['image_url'], meta={'item': image})
def item_completed(self, results, item, info):
if len(results) == len(item):
for image in results:
success = image[0]
img = image[1]
img_path = os.path.join(settings.IMAGES_STORE,img['path'])
                # descramble the image
imageUtils.deScrambleImagesByPath(img_path)
return item
class ComicInfoXmlPipeline:
def open_spider(self, spider):
self.xml_exporter = {}
def close_spider(self, spider):
for exporter, xml_file in self.xml_exporter.values():
exporter.finish_exporting()
xml_file.close()
def _exporter_for_item(self, item):
adapter = ItemAdapter(item)
xml_file = open("ComicInfo.xml", "wb")
exporter = XmlItemExporter(xml_file)
exporter.start_exporting()
self.xml_exporter = (exporter, xml_file)
return self.xml_exporter
def process_item(self, item, spider):
exporter = self._exporter_for_item(item)
exporter.export_item(item)
return item
info_img = []
for success, img in results:
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
            # descramble the image
img_path = imageUtils.deScrambleImagesByPath(img_path)
info_img.append(os.path.basename(img_path).split('.')[0])
item['images_name'] = ",".join(info_img)
        #return item
        # generate ComicInfo.xml
        ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
        # pack the chapter into a CBZ archive
CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], remove= False)
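Read together with ITEM_PIPELINES in Comics/settings.py, a ComicItem now flows through the chain roughly like this (a sketch of the call order, not literal code from the commit):

    # hypothetical trace of one ComicItem through the pipeline chain
    item = ComicsPipeline().process_item(item, spider)      # 300: dump the item to json/<name>/<chapter>.json
    item = ImageParsePipeline().process_item(item, spider)  # 400: fill item['images'] with ImageItem entries
    # 500: ImgDownloadPipeline downloads item['images'], descrambles each file,
    #      writes ComicInfo.xml via ComicInfoXmlItemExporter and packs the chapter CBZ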

Comics/settings.py

@@ -18,17 +18,17 @@ NEWSPIDER_MODULE = 'Comics.spiders'
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
USER_AGENT = UserAgent().random
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOWED_CODES = [200, 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
IMAGES_URLS_FIELD = "image_url"
IMAGES_RESULT_FIELD = "image_path"
IMAGES_STORE = 'images'
COMIC_INFO_XML_STORE = 'images'
DOWNLOAD_DELAY = 20
# retry settings
RETRY_ENABLED = True
@@ -66,7 +66,7 @@ COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
# 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
'Comics.middlewares.ProxyMiddleware' : 100,
'Comics.middlewares.ProxyMiddleware': 100,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
@@ -82,7 +82,6 @@ ITEM_PIPELINES = {
'Comics.pipelines.ComicsPipeline': 300,
'Comics.pipelines.ImageParsePipeline': 400,
'Comics.pipelines.ImgDownloadPipeline': 500,
'Comics.pipelines.ComicInfoXmlPipeline': 600,
}
# Enable and configure the AutoThrottle extension (disabled by default)
@@ -103,5 +102,28 @@ AUTOTHROTTLE_DEBUG = False
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404, 403]
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
CBZ_EXPORT_PATH = "CBZ"
# export fields for ComicInfo.xml, in output order
COMIC_INFO_XML_FILE = "ComicInfo.xml"
COMIC_INFO_FIELDS_TO_EXPORT = [
"Title",
"Series",
"Number",
"SeriesGroup",
"Summary",
"Year",
"Month",
"Day",
"Writer",
"Publisher",
"Genre",
"Tags",
"Web",
"PageCount",
"LanguageISO",
"AgeRating",
"Pages"
]
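This list feeds the fields_to_export argument of ComicInfoXmlItemExporter, so it fixes the element order of every generated ComicInfo.xml. With invented values, the output is shaped like:

    <ComicInfo>
      <Title>ch-001</Title>
      <Series>SomeComic</Series>
      ...
      <Pages>
        <Page>001</Page>
        <Page>002</Page>
      </Pages>
    </ComicInfo>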

Comics/spiders/rm_comic.py

@@ -1,26 +1,9 @@
import urllib.parse
import scrapy,json,requests
from Comics.items import ComicItem
from Comics.utils.FileUtils import CommonUtils
import threading
import toml
class ErrorLog:
def __init__(self) -> None:
self.lock = threading.Lock()
def err_ls(self, dic):
self.lock.acquire()
with open('error.toml', 'r+t') as f:
data = toml.load('error.toml')
f.seek(0, 0)
f.truncate()
dic_name = f'err_{len(data)}'
data[dic_name] = dic
_ = toml.dump(data, f)
self.lock.release()
error_logger = ErrorLog()
from scrapy.loader import ItemLoader
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
@@ -29,45 +12,51 @@ class RmComicSpider(scrapy.Spider):
#start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
def start_requests(self):
yield scrapy.Request(self.main_url + '/books/0a7e8bd1-4cfa-481a-b067-1df663fb2017', callback=self.parse_comic)
yield scrapy.Request('https://rm01.xyz'
'/books/306ec1e2-f701-4fda-bb78-041ad6ec4020', callback=self.parse_comic)
def parse_comic(self, response):
comic = ComicItem()
# comic_item = ItemLoader(item=ComicItem(), response=response)
comic['name'] = response.xpath('//div[@class="col"]/h5/text()').extract_first()
comic['icon'] = response.xpath('//img[@class="img-thumbnail"]/@src').extract_first()
comic['author'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[1]/text()').extract()[1]
comic['tags'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()').extract_first()
comic['dep'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[4]/text()').extract()[1]
comic['date'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()').extract()[1]
comic['chapters'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()').extract()
comic['chapter_href'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href').extract()
for link in comic['chapter_href']:
yield scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err)
def err(self):
print("Error=====")
comic['genre'] = "韩漫"
comic['age_rating'] = "R18+"
chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
'//div[contains(@class,"bookid_chapter")]/a/text()').extract()
chapter_href = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
'//div[contains(@class,"bookid_chapter")]/a/@href').extract()
#for chapter, link in zip(chapters, chapter_href):
for i, link in enumerate(chapter_href, start=1):
yield scrapy.Request(self.main_url+link, meta={'item' : comic, 'number': i}, callback=self.parse_chapter)
def parse_chapter(self, response):
item = response.meta['item']
number = response.meta['number']
data = response.xpath('//script[@id="__NEXT_DATA__"]/text()').extract_first()
str_exec="props.pageProps."
comic_name = CommonUtils.parseExec(data,str_exec+"bookName")
chapterName = CommonUtils.parseExec(data,str_exec+"chapterName")
description = CommonUtils.parseExec(data,str_exec+"description")
images = CommonUtils.parseExec(data,str_exec+"images")
chapter_api_url = CommonUtils.parseExec(data,str_exec+"chapterAPIPath")
str_exec = "props.pageProps."
comic_name = CommonUtils.parseExec(data, str_exec+"bookName")
chapterName = CommonUtils.parseExec(data, str_exec+"chapterName")
description = CommonUtils.parseExec(data, str_exec+"description")
images = CommonUtils.parseExec(data, str_exec+"images")
chapter_api_url = CommonUtils.parseExec(data, str_exec+"chapterAPIPath")
item['chapter'] = chapterName
item['list_img'] = images
item['index'] = number
        if chapter_api_url is not None:
yield scrapy.Request(url=self.main_url+chapter_api_url,meta={'item' : item}, callback=self.parse_chapter_api, errback=self.err)
            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': item}, callback=self.parse_chapter_api)
else:
item['list_img'] = images
yield item
def parse_chapter_api(self,response,item):
data = response.meta['item']
print(item)
return response
def parse_chapter_api(self, response):
item = response.meta['item']
item['chapter'] = CommonUtils.parseExec(response.text, "chapter.name")
item['list_img'] = CommonUtils.parseExec(response.text, "chapter.images")
yield item
def parse(self, response):
raise NotImplementedError
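CommonUtils.parseExec walks a dotted key path through the parsed __NEXT_DATA__ JSON; the same pattern appears as Comic.parseExec in the deleted Comics/utils/Comic.py below. A self-contained sketch of the idea:

    import json

    # minimal re-implementation sketch of the dotted-path lookup
    def parse_exec(data, path):
        if not isinstance(data, dict):
            data = json.loads(data)
        for key in path.split("."):
            if data is None:
                return None
            data = data.get(key)
        return data

    raw = '{"props": {"pageProps": {"bookName": "SomeComic"}}}'
    assert parse_exec(raw, "props.pageProps.bookName") == "SomeComic"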

Comics/utils/CBZUtils.py Normal file

@@ -0,0 +1,105 @@
import os, shutil, time, logging
from datetime import datetime
from pathlib import Path
from zipfile import ZipFile
from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
class CBZUtils:
    @classmethod
    def readDirsOrFiles(cls, dir, type):
        # list immediate children of `dir`, filtered to files or directories
        data = []
        for file in os.listdir(dir):
            path = os.path.join(dir, file)
            if type == "files" and os.path.isfile(path):
                data.append(path)
            if type == "dirs" and os.path.isdir(path):
                data.append(path)
        return data
    @classmethod
    def zip_compression(cls, source_dir=None, target_file=None, remove=True):
        target_dir = os.path.dirname(target_file)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        if not os.path.exists(target_file) and source_dir is not None:
            with ZipFile(target_file, mode='w') as zf:
                for path, dir_names, filenames in os.walk(source_dir):
                    path = Path(path)
                    arc_dir = path.relative_to(source_dir)
                    for y, filename in enumerate(filenames, start=1):
                        print(f"packing: {y}/{len(filenames)}", path.joinpath(filename))
                        zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
            logging.info(f"packing finished: {target_file}")
    @classmethod
    def packComicChapterCBZ(cls, comic, chapter, remove=True):
        images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
        cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
        if os.path.exists(images_chapter_path):
            for file in os.listdir(images_chapter_path):
                # leftover scrambled originals must not end up in the archive
                if file.startswith("scramble="):
                    try:
                        os.remove(os.path.join(images_chapter_path, file))
                    except OSError:
                        print(f"failed to remove {file}, skipped")
                        return False
            cls.zip_compression(images_chapter_path, cbz_chapter_path)
            time.sleep(0.1)
            if remove: shutil.rmtree(images_chapter_path)
            return True
    @classmethod
    def replaceZip(cls, filepath, unpack_dir=None):
        if not cls.compareFileDate(filepath): return None
        if unpack_dir is None:
            unpack_dir = str(filepath).split(".")[0]
        fz = ZipFile(filepath, 'r')
        for file in fz.namelist():
            if file.endswith(".jpg"):
                data = fz.read(file)
                if len(data) < 500 and os.path.exists(filepath):
                    os.remove(filepath)
                    print(f"incomplete data, deleted: {filepath}")
        if cls.compareFileDate(filepath):
            os.utime(filepath)
            print(f"updated file mtime {filepath}")
        if os.path.exists(unpack_dir):
            shutil.rmtree(unpack_dir)
        # delete the main.ftl file
        #delete_filename = ''
        #if os.path.exists(delete_filename):
        #    os.remove(delete_filename)
        #    time.sleep(60)
        # shutil.copy(src, dst): copy main.ftl into the directory to be compressed
        #cls.zip_compression()
    # runs only for files older than the cutoff below
    @classmethod
    def compareFileDate(cls, filepath):
        # True when the file's mtime is older than the cutoff 2023-01-16 03:00
        if not os.path.exists(filepath):
            return False
        mtime = datetime.fromtimestamp(int(os.path.getmtime(filepath)))
        return int(mtime.strftime("%Y%m%d%H")) < 2023011603
@classmethod
def zip_info(cls, path, filter=True):
result = None
try:
with ZipFile(path, "r") as zip_file:
result = zip_file.namelist()
if filter:
result.remove(COMIC_INFO_XML_FILE)
except Exception as e:
print(e)
return result
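A minimal usage sketch; the paths follow the IMAGES_STORE and CBZ_EXPORT_PATH values from Comics/settings.py, and the comic/chapter names are invented:

    # hypothetical usage, not part of this commit
    from Comics.utils.CBZUtils import CBZUtils

    # zips images/SomeComic/ch-001/ into CBZ/SomeComic/ch-001.CBZ
    ok = CBZUtils.packComicChapterCBZ(comic="SomeComic", chapter="ch-001", remove=False)
    if ok:
        # page names inside the archive, with ComicInfo.xml filtered out
        print(CBZUtils.zip_info("CBZ/SomeComic/ch-001.CBZ"))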

Comics/utils/Comic.py

@@ -1,248 +0,0 @@
import json,re
from opencc import OpenCC
from queue import Queue
from utils.OldUtils import OldUtils
class Comic:
    # ComicInfo.xml and ComicChapter.json begin
    # each dict_* list holds [value, origin, node, description, required]
dict_chapter = [None,None,"Title","章节名",True]
dict_comic_name = [None,None,"Series","漫画名",True]
dict_number = [None,None,"Number","编号",True]
dict_comic_names = [None,None,"SeriesGroup","别名",False]
dict_dep = [None,None,"Summary","概述",True]
dict_year = [None,None,"Year","",False]
dict_month = [None,None,"Month","",False]
dict_day = [None,None,"Day","",False]
dict_author = [None,None,"Writer","作者",True]
dict_cbs = [None,None,"Publisher","出版社",False]
dict_genre = [None,None,"Genre","流派",True]
dict_tags = [None,None,"Tags","标签",True]
dict_homepage = [None,None,"Web","主页",False]
dict_page_count = [None,None,"PageCount","总页数",True]
dict_language = [None,None,"LanguageISO","语言",True]
dict_agerating = [None,None,"AgeRating","年龄分级",False]
dict_pages = [None,None,"Pages","页码",True]
CURRENT_DOWN_LINK = None
# ComicInfo.xml and ComicChapter.json end
dict_icon = [None,None,"Icon","图标",True]
dict_chapter_imgs = [None,None,"ChapterImgs","图像",True]
    # homepage
dict_list_chapter = [None,None,"ListChapter","全部章节名",True]
(update_at,current_chapter_img,file_chapter_imgs) = [None,None,None]
    # convert Traditional Chinese to Simplified Chinese
@classmethod
def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
    # sanitize into a valid file name
@classmethod
def fixFileName(cls,filename,replace=None):
if not isinstance(filename,str): return filename
intab = r'[?*/\|.:><]'
str_replace = ""
if replace != None: str_replace = replace
filename = re.sub(intab, str_replace, filename)
count = 1
while True:
str_file = filename[0-count]
if str_file == " ": count += 1
else:
filename = filename[0:len(filename)+1-count]
break
return filename
@classmethod
def setValue(cls,value):
if value != None: value = cls.ChineseConvert(value)
return value
@classmethod
def setField(cls,field,value,origin=True,convert=True):
if value != None:
if origin:
field[1] = value
if convert: value = cls.ChineseConvert(value)
field[0] = value
return field
@classmethod
def getFieldValue(cls,field):
if field == None: return None
return field[0]
@classmethod
def setFieldOrigin(cls,filed,origin):
filed[1] = origin
return filed
@classmethod
def getFieldOrigin(cls,filed): return filed[1]
@classmethod
def getFieldNode(cls,filed): return filed[2]
@classmethod
def getValue(cls,field,exec=None):
if exec != None: return cls.parseExec(field,exec=exec)
return field
    # chapter title
@classmethod
def setChapterName(cls,value,exec=None):
value = cls.fixFileName(cls.parseExec(value,exec=exec))
OldUtils.setOldChapter(value)
cls.dict_chapter = cls.setField(cls.dict_chapter,value)
@classmethod
def getChapterName(cls): return cls.getFieldValue(cls.dict_chapter)
@classmethod
def getOriginChapterName(cls): return cls.getFieldOrigin(cls.dict_chapter)
    # comic title
@classmethod
def setComicName(cls,value,exec=None):
value = cls.fixFileName(cls.parseExec(value,exec=exec))
OldUtils.setOldComicName(value)
cls.dict_comic_name = cls.setField(cls.dict_comic_name,value)
@classmethod
def getComicName(cls): return cls.getFieldValue(cls.dict_comic_name)
@classmethod
def getOriginComicName(cls): return cls.getFieldOrigin(cls.dict_comic_name)
    # number
@classmethod
def setNumber(cls,value): cls.dict_number = cls.setField(cls.dict_number,value)
@classmethod
def getNumber(cls): return cls.getFieldValue(cls.dict_number)
    # summary
@classmethod
def setDep(cls,value,exec=None):
cls.dict_dep = cls.setField(cls.dict_dep,cls.parseExec(value,exec=exec))
@classmethod
def getDep(cls): return cls.getFieldValue(cls.dict_dep)
    # author
@classmethod
def setAuthor(cls,value): cls.dict_author = cls.setField(cls.dict_author,value)
@classmethod
def getAuthor(cls): return cls.getFieldValue(cls.dict_author)
    # genre
@classmethod
def setGenre(cls,value): cls.dict_genre = cls.setField(cls.dict_genre,value)
@classmethod
def getGenre(cls): return cls.getFieldValue(cls.dict_genre)
    # language
@classmethod
def setLanguage(cls,value): cls.dict_language = cls.setField(cls.dict_language,value)
@classmethod
def getLanguage(cls): return cls.getFieldValue(cls.dict_language)
    # age rating
@classmethod
def setAgeRating(cls,value): cls.dict_agerating = cls.setField(cls.dict_agerating,value)
@classmethod
def getAgeRating(cls): return cls.getFieldValue(cls.dict_agerating)
    # tags
@classmethod
def setTags(cls,value): cls.dict_tags = cls.setField(cls.dict_tags,value)
@classmethod
def getTags(cls): return cls.getFieldValue(cls.dict_tags)
    # total page count
@classmethod
def setPageCount(cls,value): cls.dict_page_count = cls.setField(cls.dict_page_count,value)
@classmethod
def getPageCount(cls): return cls.getFieldValue(cls.dict_page_count)
#------------------------------------------------------------------------
@classmethod
def parseExec(cls,data,exec,item=True):
if data !=None and exec != None:
dots = str(exec).split(".")
if not isinstance(data,dict): data = json.loads(data)
for dot in dots:
data = data.get(dot)
return data
@classmethod
def setHomePage(cls,value): cls.dict_homepage = cls.setField(cls.dict_homepage,value)
@classmethod
def getHomePage(cls): return cls.getFieldValue(cls.dict_homepage)
@classmethod
def setIcon(cls,value): cls.dict_icon = cls.setField(cls.dict_icon,value,convert=False)
@classmethod
def getIcon(cls): return cls.getFieldValue(cls.dict_icon)
@classmethod
def setListChapter(cls,value): cls.dict_list_chapter = cls.setField(cls.dict_list_chapter,value,convert=False)
@classmethod
def getListChapter(cls): return cls.getFieldValue(cls.dict_list_chapter)
@classmethod
def getLenChapters(cls): return len(cls.getListChapter())
@classmethod
def setChapterImgs(cls,value,exec=None,item=None):
cls.dict_chapter_imgs = cls.setField(cls.dict_chapter_imgs,cls.parseExec(value,exec=exec,item=item),convert=False)
@classmethod
def getChapterImgs(cls): return cls.getFieldValue(cls.dict_chapter_imgs)
@classmethod
def setUpdateAt(cls,value): cls.update_at = value
@classmethod
def getUpdateAt(cls): return cls.update_at
@classmethod
def setCurrentChapterImg(cls,value): cls.current_chapter_img = value
@classmethod
def getCurrentChapterImg(cls): return cls.current_chapter_img
@classmethod
def setChapterFilesName(cls,value): cls.file_chapter_imgs= value
@classmethod
def getChapterFilesName(cls): return cls.file_chapter_imgs
@classmethod
def setCurrentDownLink(cls,value): cls.CURRENT_DOWN_LINK = value
@classmethod
def getCurrentDownLink(cls): return cls.CURRENT_DOWN_LINK
class ListComic:
LIST_COMIC_QUEUE = Queue()
(LIST_COMIC_NAME,LIST_COMIC_LINK,LIST_COMIC_UPDATEAT) = [None,None,None]
@classmethod
def setListComicsLinksUpdateAt(cls,names,links,update_at):
if isinstance(names,list) and isinstance(links,list) and isinstance(update_at,list):
for x in range(0,len(names)):
cls.LIST_COMIC_QUEUE.put([names[x],links[x],update_at[x]])
@classmethod
def getListComicsLinksUpdateAt(cls):
result = None
if cls.LIST_COMIC_NAME != None and cls.LIST_COMIC_LINK != None:
cls.setListComicsLinksUpdateAt(cls.LIST_COMIC_NAME,cls.LIST_COMIC_LINK,cls.LIST_COMIC_UPDATEAT)
(cls.LIST_COMIC_NAME,cls.LIST_COMIC_LINK,cls.LIST_COMIC_UPDATEAT) = [None,None,None]
if not cls.LIST_COMIC_QUEUE.empty(): result = cls.LIST_COMIC_QUEUE.get(False)
return result
@classmethod
def addListComicChapterLink(cls,name,link,update_at):
if name != None and link != None:
cls.LIST_COMIC_QUEUE.put(name,link,update_at)
@classmethod
def getListValue(cls,result,type,start_add=None,result_type="list"):
if result == None: return None
if type == None: return result
if result_type == "list" and type != None:
data = []
for x in range(0, len(result)):
if start_add != None:
data.append(start_add+result[x].get(type))
else:
data.append(result[x].get(type))
return data
return result
@classmethod
def setListComicName(cls,value,type=None): cls.LIST_COMIC_NAME = cls.getListValue(value,type)
@classmethod
def getListComicName(cls): return cls.LIST_COMIC_NAME
@classmethod
def setListComicChapterLink(cls,value,type=None,start_add=None): cls.LIST_COMIC_LINK = cls.getListValue(value,type,start_add)
@classmethod
def getListComicChapterLink(cls): return cls.LIST_COMIC_LINK
@classmethod
def setListComicUpdateAt(cls,value,type=None): cls.LIST_COMIC_UPDATEAT = cls.getListValue(value,type)
@classmethod
def getListComicUpdateAt(cls): return cls.LIST_COMIC_UPDATEAT
@classmethod
def getListComicChapterLink(cls): return cls.LIST_COMIC_QUEUE.get(False)
#domain end....

Comics/utils/ComicInfo.py

@@ -1,41 +1,14 @@
import json,os
import logging
from xml.dom.minidom import Document
from Comics.utils.Comic import Comic
from Comics.utils.Constant import ComicPath
from itemadapter import is_item, ItemAdapter
class ComicInfoEntity:
@classmethod
def getNodes(cls):
return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_number,Comic.dict_comic_names,
Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author,
Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count,
Comic.dict_language,Comic.dict_agerating,Comic.dict_pages]
@classmethod
def getJsonNodes(cls):
return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_icon,Comic.dict_number,
Comic.dict_comic_names,
Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author,
Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count,
Comic.dict_language,Comic.dict_agerating,Comic.dict_pages,
Comic.dict_list_chapter,Comic.dict_chapter_imgs]
class ComicInfo:
IS_NEW_ICON = False
document = Document()
path_comic_info = None
@classmethod
def parseExec(cls,data,exec,start_add=None,item=True):
if data !=None and exec != None:
dots = str(exec).split(".")
if not isinstance(data,dict): data = json.loads(data)
for dot in dots:
data = data.get(dot)
if start_add != None and data != None:
data = start_add+data
return data
@classmethod
def setNodeAndValue(cls,node,value):
if value != None:
@ -50,12 +23,12 @@ class ComicInfo:
    # page count
@classmethod
def setPages(cls,values=None):
if values == None: values = Comic.getChapterFilesName()
#if values == None: values = Comic.getChapterFilesName()
if values != None and isinstance(values,list):
suffix = "."+str(values[0]).split(".")[-1]
join_list=",".join(values).replace(suffix,"")
values = join_list.split(",")
Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values))
#Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values))
root_node = cls.document.createElement("Pages")
if cls.IS_NEW_ICON:
            # add the cover
@ -68,12 +41,12 @@ class ComicInfo:
page = page.split("_")[-1]
c_node.setAttribute("Image",page)
root_node.appendChild(c_node)
Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False)
#Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False)
@classmethod
def getBaseUrl(cls,url=None):
if url == None:
url = Comic.getHomePage()
#if url == None:
# url = Comic.getHomePage()
(num,index) = [3,0]
for x in range(0, num):
index = str(url).find("/",index)+1
@ -84,24 +57,30 @@ class ComicInfo:
def root_node(cls,root_value): return cls.document.createElement(root_value)
@classmethod
def add_nodes(cls,root,list_value):
if len(list_value) == 0: return list_value
for value in list_value:
#Comic.chapter
if value[0] == None and value[4]:
                #value[0] is empty but the field is required (value[4])
                msg = f"empty value for required field: key={value[3]} value[0]={value[0]} required={value[4]}"
logger.error(msg)
exit()
if value[0] != None: root.appendChild(cls.setNodeAndValue(value[2],value[0]))
    def add_nodes(cls, root, item):
        item = ItemAdapter(item)
        # append one child node per populated item field
        for name, value in item.items():
            if value is not None:
                root.appendChild(cls.setNodeAndValue(name, value))
        #if len(list_value) == 0: return list_value
        #for value in list_value:
        #    #Comic.chapter
        #    if value[0] == None and value[4]:
        #        #value[0] is empty but the field is required (value[4])
        #        msg = f"empty value for required field: key={value[3]} value[0]={value[0]} required={value[4]}"
        #        logging.error(msg)
        #        exit()
        #    if value[0] != None: root.appendChild(cls.setNodeAndValue(value[2],value[0]))
@classmethod
def initComicInfoXML(cls):
cls.setPages()
@classmethod
def writeComicInfoXML(cls,overlay=False):
save_path = ComicPath.getPathComicInfoXML()
def writeComicInfoXML(cls,item,overlay=False):
#save_path = ComicPath.getPathComicInfoXML()
save_path = "ComicInfo.xml"
if os.path.exists(save_path):
if overlay:
os.remove(save_path)
@ -113,44 +92,8 @@ class ComicInfo:
root = cls.root_node("ComicInfo")
new_document = Document()
new_document.appendChild(root)
cls.add_nodes(root,ComicInfoEntity.getNodes())
cls.add_nodes(root, item)
        with open(save_path, "w", encoding="utf-8") as fo:
            new_document.writexml(fo, indent='', addindent='\t', newl='\n', encoding="utf-8")
        logging.info(f"generated file... {save_path}")
@classmethod
def setComicInfo(cls,comicname=None,homepage=None,alias=None,author=None,icon=None,tags=None,
dep=None,genre=None,lang=None,age_rating=None,chapters=None,current_chapter_img=None):
author = ",".join(set(str(str(author).replace("&",",").replace(" ",",")).split(",")))
Comic.setHomePage(homepage)
Comic.setIcon(icon)
Comic.setListChapter(chapters)
#Comic.setUpdateAt(update_at)
Comic.setComicName(str(comicname))
#if alias != None: comicInfo.setComicNames(alias)
Comic.setAuthor(author)
Comic.setTags(tags)
Comic.setDep(dep)
#comicInfo.setCBS("韩漫")
if genre != None: Comic.setGenre(genre)
Comic.setLanguage(lang)
Comic.setAgeRating(age_rating)
Comic.setCurrentChapterImg(current_chapter_img)
@classmethod
def writeJson(cls):
dict_data = {}
nodes = ComicInfoEntity.getJsonNodes()
for node in nodes:
key = Comic.getFieldNode(node)
value = Comic.getFieldOrigin(node)
if isinstance(value,list):
value = ",".join(value)
if key != None and isinstance(value,str):
child_dict = { key : value}
dict_data.update(child_dict)
s = json.dumps(dict_data,ensure_ascii=True)
logging.debug(f"json={s}")
with open(ComicPath.getPathConfComicChapterJson(mkdir=True),"w") as fs:
fs.write(s)
logging.info(f"已生成文件... {save_path}")
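For reference, the <Pages> block that setPages assembles reduces to this standalone minidom pattern (page names invented):

    from xml.dom.minidom import Document

    # standalone sketch of building the <Pages> node
    document = Document()
    pages = document.createElement("Pages")
    for image in ["001", "002", "003"]:
        page = document.createElement("Page")
        page.setAttribute("Image", image.split("_")[-1])
        pages.appendChild(page)
    print(pages.toxml())  # <Pages><Page Image="001"/><Page Image="002"/><Page Image="003"/></Pages>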

Comics/utils/Constant.py

@@ -1,5 +1,7 @@
import os.path
import re
from opencc import OpenCC
from Comics.settings import IMAGES_STORE
class ComicPath:
@classmethod
def getDirComicChapter(cls):
@@ -13,4 +15,24 @@ class ComicPath:
    # convert Traditional Chinese to Simplified Chinese
@classmethod
def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
def chinese_convert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
    # sanitize into a valid file name
    @classmethod
    def fix_file_name(cls, filename, replace=None):
        if not isinstance(filename, str):
            return filename
        # strip characters that are illegal or awkward in file names
        in_tab = r'[?*/\|.:><]'
        str_replace = replace if replace is not None else ""
        filename = re.sub(in_tab, str_replace, filename)
        # trim any trailing spaces left after substitution
        filename = filename.rstrip(" ")
return filename
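Two hypothetical examples of the sanitizer's effect:

    # invented example values
    from Comics.utils.Constant import ComicPath
    print(ComicPath.fix_file_name("ch.1: side*story? "))   # -> "ch1 sidestory"
    print(ComicPath.fix_file_name("a/b\\c", replace="_"))  # -> "a_b_c"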

Comics/utils/FileUtils.py

@@ -2,7 +2,17 @@ import base64,hashlib,os,shutil
import math,time,json,datetime,logging
from PIL import Image
from tinydb import TinyDB, Query
from Comics.spiders.utils.Constant import ComicPath
from Comics.utils.Constant import ComicPath
class fileUtils:
    @classmethod
    def save_file(cls, path, data):
        # create the parent directory on demand, then write `data` as text
        dir_path = os.path.dirname(path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(path, 'w', encoding='utf-8') as fs:
            fs.write(str(data))
class CommonUtils:
@classmethod
@@ -31,11 +41,9 @@ class imageUtils:
@classmethod
def deScrambleImagesByPath(cls,img_path,img_save=None):
if os.path.basename(img_path).startswith("scramble="):
imageUtils.encode_scramble_image(img_path,img_save)
return True
else:
return False
img_path = imageUtils.encode_scramble_image(img_path,img_save)
return img_path
@classmethod
def encodeImage(cls,str_en):
#print("en",str_en)
@@ -223,4 +231,5 @@ class imageUtils:
print("解密成功=",save_path)
if os.path.exists(imgpath):
os.remove(imgpath)
print("remove=",imgpath)
print("remove=",imgpath)
return save_path
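With this change deScrambleImagesByPath always returns a usable path, so callers can collect page names uniformly; compare item_completed in Comics/pipelines.py. A sketch of that calling pattern (the paths and the scramble file name are invented):

    import os
    from Comics.utils.FileUtils import imageUtils

    # hypothetical caller, mirroring item_completed in Comics/pipelines.py
    downloaded = ["images/SomeComic/ch-001/001.jpg",
                  "images/SomeComic/ch-001/scramble=10_002.jpg"]
    pages = []
    for path in downloaded:
        final_path = imageUtils.deScrambleImagesByPath(path)  # plain files pass through unchanged
        pages.append(os.path.basename(final_path).split('.')[0])
    images_name = ",".join(pages)  # e.g. "001,002" once the scrambled file is decoded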