diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/ComicScrapy.iml b/.idea/ComicScrapy.iml
new file mode 100644
index 0000000..8ac55ea
--- /dev/null
+++ b/.idea/ComicScrapy.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..9ef495e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..23f68eb
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/ComicScrapy.iml" filepath="$PROJECT_DIR$/.idea/ComicScrapy.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Comics/exporters.py b/Comics/exporters.py
new file mode 100644
index 0000000..77faca6
--- /dev/null
+++ b/Comics/exporters.py
@@ -0,0 +1,98 @@
+import os.path
+
+from Comics.items import ComicInfoItem, ComicItem
+from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT, COMIC_INFO_XML_STORE
+from Comics.utils.Constant import ComicPath
+from scrapy.exporters import PythonItemExporter, XmlItemExporter
+from scrapy.utils.python import is_listlike
+
+class ItemExporter(PythonItemExporter):
+    def convert(self, data):
+        # recursively decode bytes and rebuild containers into plain Python data
+        if isinstance(data, bytes):
+            return data.decode("utf-8")
+        if isinstance(data, dict):
+            return {self.convert(k): self.convert(v) for k, v in data.items()}
+        if isinstance(data, (list, tuple)):
+            return [self.convert(i) for i in data]
+        return data
+
+    def export_obj(self, obj_item):
+        self.start_exporting()
+        obj_item = self.convert(self.export_item(obj_item))
+        self.finish_exporting()
+        return obj_item
+
+class ComicInfoXmlItemExporter(XmlItemExporter):
+    custom_root_element = "ComicInfo"
+
+    def __init__(self, comic, chapter):
+        file_path = os.path.join(COMIC_INFO_XML_STORE, comic,
+                                 chapter, f"{self.custom_root_element}.xml")
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        self.xml_file = open(file_path, "wb")
+        super().__init__(self.xml_file,
+                         root_element=self.custom_root_element,
+                         indent=1, fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)
+
+    def serialize_field(self, field, name, value):
+        # convert Traditional Chinese to Simplified while serializing
+        value = ComicPath.chinese_convert(value)
+        return super().serialize_field(field, name, value)
+
+ def start_exporting(self):
+ self.xg.startDocument()
+ self.xg.startElement(self.custom_root_element, {})
+
+    def comic_to_info_item(self, comic_item):
+        # rename ComicItem fields to ComicInfo elements using the @setinfo mapping
+        comic_info = {}
+        comic_info_dict = getattr(ComicItem, "data", {})
+        for key, value in ComicItem(comic_item).items():
+            new_key = comic_info_dict.get(key)
+            if new_key is not None:
+                comic_info[new_key] = value
+        return ItemExporter().export_obj(ComicInfoItem(comic_info))
+
+    def export_item(self, item):
+        comic_info = self.comic_to_info_item(item)
+        child_element = "Page"
+        self._beautify_indent(depth=1)
+        self._beautify_newline()
+        for name, value in self._get_serialized_fields(comic_info, default_value=""):
+            if name == "Pages":
+                value = str(value).split(',')
+            if value is not None and value != "":
+                self._export_xml_field(name, value, depth=2, child_element=child_element)
+        self._beautify_indent(depth=1)
+        return comic_info
+
+ def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
+ self._beautify_indent(depth=depth)
+ self.xg.startElement(name, {})
+ if hasattr(serialized_value, "items"):
+ self._beautify_newline()
+ for subname, value in serialized_value.items():
+ self._export_xml_field(subname, value, depth=depth + 1)
+ self._beautify_indent(depth=depth)
+ elif is_listlike(serialized_value):
+ self._beautify_newline()
+ for value in serialized_value:
+ self._export_xml_field(child_element, value, depth=depth + 1)
+ self._beautify_indent(depth=depth)
+ elif isinstance(serialized_value, str):
+ self.xg.characters(serialized_value)
+ else:
+ self.xg.characters(str(serialized_value))
+ self.xg.endElement(name)
+ self._beautify_newline()
+
+ def finish_exporting(self):
+ self.xg.endElement(self.custom_root_element)
+ self.xg.endDocument()
+ self.xml_file.close()
+
+ def export_xml(self, item):
+ self.start_exporting()
+ comic_info = self.export_item(item)
+ self.finish_exporting()
+ return comic_info
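For reference, a minimal usage sketch of the new exporter (names and field values below are illustrative, not from this patch):

    from Comics.exporters import ComicInfoXmlItemExporter
    from Comics.items import ComicItem

    # a ComicItem as the spider would populate it (values made up)
    item = ComicItem(name="my_comic", chapter="ch_001", author="anon",
                     tags="tag_a tag_b", dep="a summary", index=1,
                     images_name="001,002")
    exporter = ComicInfoXmlItemExporter(comic=item["name"], chapter=item["chapter"])
    exporter.export_xml(item)  # writes images/my_comic/ch_001/ComicInfo.xml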
diff --git a/Comics/items.py b/Comics/items.py
index 08bddc2..7700738 100644
--- a/Comics/items.py
+++ b/Comics/items.py
@@ -1,52 +1,93 @@
# Define here the models for your scraped items
#
# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
-import scrapy
+# https://docs.scrapy.org/en/latest/topics/items.html
+from scrapy.item import Item, Field
+from Comics.utils.Constant import ComicPath
+from scrapy.loader.processors import TakeFirst
-class ComicsItem(scrapy.Item):
- #漫画名
- name = scrapy.Field()
- #链接
- link = scrapy.Field()
+def setinfo(**kwds):
+    # attach the ComicItem-field -> ComicInfo-element mapping to the decorated
+    # Item class as its `data` attribute (read back via getattr in exporters.py)
+    def decorate(f):
+        setattr(f, "data", dict(kwds))
+        return f
+    return decorate
-class ComicItem(scrapy.Item):
- name = scrapy.Field()
- chapter = scrapy.Field()
- list_img = scrapy.Field()
- author= scrapy.Field()
- icon = scrapy.Field()
- tags = scrapy.Field()
- dep = scrapy.Field()
- date = scrapy.Field()
- chapters = scrapy.Field()
- chapter_href= scrapy.Field()
- genre = scrapy.Field()
- age_rating = scrapy.Field()
+def serialize_to_chinese(value):
+ return ComicPath.chinese_convert(value)
-class ImageItem(scrapy.Item):
- image_name = scrapy.Field()
- image_url = scrapy.Field()
- image_path = scrapy.Field()
+def serialize_to_fix_file(value):
+ file = ComicPath.chinese_convert(value)
+ return ComicPath.fix_file_name(file)
-class ComicInfoItem(scrapy.Item):
- Title= scrapy.Field()#"章节名",True]
- Series = scrapy.Field()# ","漫画名",True]
- Number = scrapy.Field()# ","编号",True]
- SeriesGroup = scrapy.Field()# ","别名",False]
- Summary = scrapy.Field()# ","概述",True]
- Year = scrapy.Field()# ","年",False]
- Month = scrapy.Field()# ","月",False]
- Day = scrapy.Field()# ","日",False]
- Writer = scrapy.Field()# "作者",True]
- Publisher = scrapy.Field()# ","出版社",False]
- Genre = scrapy.Field()# ","流派",True]
- Tags = scrapy.Field()# ","标签",True]
- Web = scrapy.Field()# ","主页",False]
- PageCount = scrapy.Field()# ","总页数",True]
- LanguageISO = scrapy.Field()#","语言",True]
- AgeRating = scrapy.Field()#","年龄分级",False]
- Pages = scrapy.Field()#","页码",True]
- # ComicInfo.xml and ComicChapter.json end
\ No newline at end of file
+class ComicOItem(Item):
+ name = Field()
+ chapterItem = Field()
+
+@setinfo(name="Series", chapter="Title",
+ author="Writer", tags="Tags",
+ dep="Summary", genre="Genre",
+ index="Number", images_name="Pages",
+ age_rating="AgeRating")
+class ComicItem(Item):
+    # number within the series
+    index = Field()
+    # comic name
+    name = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
+    # chapter title
+    chapter = Field(serializer=serialize_to_fix_file)
+    # image links
+    list_img = Field()
+    # author
+    author = Field(serializer=serialize_to_chinese, output_processor=TakeFirst())
+    # cover link
+    icon = Field()
+    # tags
+    tags = Field(serializer=serialize_to_chinese)
+    # summary
+    dep = Field(serializer=serialize_to_chinese)
+    # date
+    date = Field()
+    # genre
+    genre = Field()
+    # age rating
+    age_rating = Field()
+
+    images = Field()
+    images_name = Field()
+
+class ImageItem(Item):
+ image_name = Field()
+ image_url = Field()
+ image_path = Field()
+
+def serializer_info_writer(value):
+    # normalize author separators ("&" and whitespace) to a comma-separated list
+    value = str(value).replace("&", " ")
+    return ",".join(v for v in value.split() if v)
+
+class ComicInfoItem(Item):
+    Title = Field()        # chapter title, required
+    Series = Field()       # comic name, required
+    Number = Field()       # number, required
+    SeriesGroup = Field()  # alias, optional
+    Summary = Field()      # summary, required
+    Year = Field()         # year, optional
+    Month = Field()        # month, optional
+    Day = Field()          # day, optional
+    Writer = Field(serializer=serializer_info_writer)  # author, required
+    Publisher = Field()    # publisher, optional
+    Genre = Field()        # genre, required
+    Tags = Field()         # tags, required
+    Web = Field()          # homepage, optional
+    PageCount = Field()    # total page count, required
+    LanguageISO = Field()  # language, required
+    AgeRating = Field()    # age rating, optional
+    Pages = Field()        # page list, required
+    Page = Field()
+    # ComicInfo.xml and ComicChapter.json end
\ No newline at end of file
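The @setinfo mapping above is what comic_to_info_item in exporters.py consults when renaming fields; illustratively:

    >>> ComicItem.data["name"], ComicItem.data["chapter"], ComicItem.data["images_name"]
    ('Series', 'Title', 'Pages')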
diff --git a/Comics/pipelines.py b/Comics/pipelines.py
index 208df52..a114f8e 100644
--- a/Comics/pipelines.py
+++ b/Comics/pipelines.py
@@ -8,93 +8,74 @@
import os,requests,re,scrapy,logging
from Comics import settings
from Comics.utils.FileUtils import imageUtils
+from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImageItem
from scrapy.pipelines.images import ImagesPipeline
-from scrapy.exporters import XmlItemExporter
-from itemadapter import ItemAdapter
-
+from Comics.exporters import ComicInfoXmlItemExporter
+from Comics.exporters import ItemExporter
+from Comics.utils.CBZUtils import CBZUtils
class ComicsPipeline:
- def open_spider(self,spider):
- self.fp = open('book.json','w',encoding='utf-8')
-
+ def open_spider(self, spider):
+ pass
    # item is the object yielded by the spider
def process_item(self, item, spider):
- self.fp.write(str(item))
+        if isinstance(item, ComicItem):
+            item = ComicItem(ItemExporter().export_obj(item))
+            file = os.path.join("json", item['name'], item['chapter'])
+            fileUtils.save_file(f"{file}.json", item)
return item
    # image parsing
def close_spider(self,spider):
- self.fp.close()
+ pass
class ImageParsePipeline:
def process_item(self, item, spider):
if isinstance(item, ComicItem):
- list_img = item['list_img']
count = 1
- scramble_count = 0
- list_image_item = []
- for image in list_img:
+ images_item = []
+ for image in item['list_img']:
(image_src,scramble) = [image.get("src"),image.get("scramble")]
count_image = "{:0>3d}".format(count)
- image_src_suffix = "."+str(image_src).split(".")[-1]
- image_file_name = count_image+image_src_suffix
+ suffix = "."+str(image_src).split(".")[-1]
+ image_name = count_image + suffix
if scramble:
- de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
+ de_str = str(image_src).split("/")[-1].replace(suffix,"==")
blocks_num = imageUtils.encodeImage(de_str)
- scramble_image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
- scramble_count += 1
- image_path = os.path.join(item['name'],item['chapter'],scramble_image_file_name)
- image_path = ComicPath.ChineseConvert(image_path)
- list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_path))
- count+=1
- return list_image_item
+                    image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
+                image_path = os.path.join(item['name'], item['chapter'], image_name)
+                images_item.append(ImageItem(image_name=count_image + suffix, image_url=image_src, image_path=image_path))
+                count += 1
+ item['images'] = images_item
+ return item
class ImgDownloadPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
image = request.meta['item']
image_path = image['image_path']
- en_image_path = os.path.join(os.path.dirname(image_path),image['image_name'])
- if os.path.exists(en_image_path): return en_image_path
- else: return image_path
+ en_image_path = os.path.join(os.path.dirname(image_path), image['image_name'])
+ if os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)):
+ return en_image_path
+ else:
+ return image_path
def get_media_requests(self, item, info):
- for image in item:
- host = re.sub(r'(http://|https://)', '', image['image_url']).split('/')[0]
+ for image in item['images']:
yield scrapy.Request(url= image['image_url'], meta= {'item' : image})
def item_completed(self, results, item, info):
- if len(results) == len(item):
- for image in results:
- success = image[0]
- img = image[1]
- img_path = os.path.join(settings.IMAGES_STORE,img['path'])
- #解密图片
- imageUtils.deScrambleImagesByPath(img_path)
- return item
-
-class ComicInfoXmlPipeline:
-
- def open_spider(self, spider):
- self.xml_exporter = {}
-
- def close_spider(self, spider):
- for exporter, xml_file in self.xml_exporter.values():
- exporter.finish_exporting()
- xml_file.close()
-
- def _exporter_for_item(self, item):
- adapter = ItemAdapter(item)
- xml_file = open("ComicInfo.xml", "wb")
- exporter = XmlItemExporter(xml_file)
- exporter.start_exporting()
- self.xml_exporter = (exporter, xml_file)
- return self.xml_exporter
-
- def process_item(self, item, spider):
- exporter = self._exporter_for_item(item)
- exporter.export_item(item)
- return item
-
+        info_img = []
+        for success, img in results:
+            if not success:
+                continue
+            img_path = os.path.join(settings.IMAGES_STORE, img['path'])
+            # de-scramble the downloaded image
+            img_path = imageUtils.deScrambleImagesByPath(img_path)
+            info_img.append(os.path.basename(img_path).split('.')[0])
+        item['images_name'] = ",".join(info_img)
+        # generate the chapter's ComicInfo.xml
+        ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
+        # pack the chapter into a CBZ archive
+        CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], remove=False)
+        return item
\ No newline at end of file
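For context on item_completed: Scrapy's ImagesPipeline hands it a list of (success, info) pairs, which is why the success guard above is needed; a runnable sketch with made-up values:

    # shape of the `results` argument (values illustrative)
    results = [
        (True, {'url': 'https://example.com/img/001.webp',
                'path': 'my_comic/ch_001/001.webp',
                'checksum': 'd41d8cd98f00b204'}),
        (False, None),  # a failed download arrives as (False, Failure)
    ]
    image_paths = [info['path'] for ok, info in results if ok]
    assert image_paths == ['my_comic/ch_001/001.webp']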
diff --git a/Comics/settings.py b/Comics/settings.py
index 51e79ef..4d04dd7 100644
--- a/Comics/settings.py
+++ b/Comics/settings.py
@@ -18,17 +18,17 @@ NEWSPIDER_MODULE = 'Comics.spiders'
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
USER_AGENT = UserAgent().random
# Obey robots.txt rules
-ROBOTSTXT_OBEY = False
+ROBOTSTXT_OBEY = False
+HTTPERROR_ALLOWED_CODES = [200, 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-IMAGES_URLS_FIELD = "image_url"
-IMAGES_RESULT_FIELD = "image_path"
IMAGES_STORE = 'images'
+COMIC_INFO_XML_STORE = 'images'
DOWNLOAD_DELAY = 20
# retry
RETRY_ENABLED = True
@@ -66,7 +66,7 @@ COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
# 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
- 'Comics.middlewares.ProxyMiddleware' : 100,
+ 'Comics.middlewares.ProxyMiddleware': 100,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
@@ -82,7 +82,6 @@ ITEM_PIPELINES = {
'Comics.pipelines.ComicsPipeline': 300,
'Comics.pipelines.ImageParsePipeline': 400,
'Comics.pipelines.ImgDownloadPipeline': 500,
- 'Comics.pipelines.ComicInfoXmlPipeline': 600,
}
# Enable and configure the AutoThrottle extension (disabled by default)
@@ -103,5 +102,28 @@ AUTOTHROTTLE_DEBUG = False
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
-HTTPCACHE_IGNORE_HTTP_CODES = []
+HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404, 403]
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+CBZ_EXPORT_PATH = "CBZ"
+# ComicInfo exporter: field export order
+COMIC_INFO_XML_FILE = "ComicInfo.xml"
+COMIC_INFO_FIELDS_TO_EXPORT = [
+ "Title",
+ "Series",
+ "Number",
+ "SeriesGroup",
+ "Summary",
+ "Year",
+ "Month",
+ "Day",
+ "Writer",
+ "Publisher",
+ "Genre",
+ "Tags",
+ "Web",
+ "PageCount",
+ "LanguageISO",
+ "AgeRating",
+ "Pages"
+]
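With this field order, ComicInfoXmlItemExporter emits elements in exactly this sequence; an illustrative ComicInfo.xml (values made up, indentation per indent=1):

    <?xml version="1.0" encoding="utf-8"?>
    <ComicInfo>
      <Title>ch_001</Title>
      <Series>my_comic</Series>
      <Number>1</Number>
      <Writer>anon</Writer>
      <Pages>
       <Page>001</Page>
       <Page>002</Page>
      </Pages>
    </ComicInfo>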
diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py
index 4c7ba47..86fef07 100644
--- a/Comics/spiders/rm_comic.py
+++ b/Comics/spiders/rm_comic.py
@@ -1,26 +1,9 @@
import scrapy,json,requests
from Comics.items import ComicItem
from Comics.utils.FileUtils import CommonUtils
-import threading
-import toml
-
-class ErrorLog:
- def __init__(self) -> None:
- self.lock = threading.Lock()
-
- def err_ls(self, dic):
- self.lock.acquire()
- with open('error.toml', 'r+t') as f:
- data = toml.load('error.toml')
- f.seek(0, 0)
- f.truncate()
- dic_name = f'err_{len(data)}'
- data[dic_name] = dic
- _ = toml.dump(data, f)
- self.lock.release()
-
-
-error_logger = ErrorLog()
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
@@ -29,45 +12,51 @@ class RmComicSpider(scrapy.Spider):
#start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
def start_requests(self):
- yield scrapy.Request(self.main_url + '/books/0a7e8bd1-4cfa-481a-b067-1df663fb2017', callback=self.parse_comic)
+        yield scrapy.Request(self.main_url + '/books/306ec1e2-f701-4fda-bb78-041ad6ec4020',
+                             callback=self.parse_comic)
def parse_comic(self, response):
comic = ComicItem()
comic['name'] = response.xpath('//div[@class="col"]/h5/text()').extract_first()
comic['icon'] = response.xpath('//img[@class="img-thumbnail"]/@src').extract_first()
comic['author'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[1]/text()').extract()[1]
comic['tags'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()').extract_first()
comic['dep'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[4]/text()').extract()[1]
comic['date'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()').extract()[1]
- comic['chapters'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()').extract()
- comic['chapter_href'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href').extract()
- for link in comic['chapter_href']:
- yield scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err)
-
- def err(self):
- print("Error=====")
-
+ comic['genre'] = "韩漫"
+ comic['age_rating'] = "R18+"
+ chapters = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
+ '//div[contains(@class,"bookid_chapter")]/a/text()').extract()
+ chapter_href = response.xpath('//div[contains(@class,"bookid_chapterBox")]'
+ '//div[contains(@class,"bookid_chapter")]/a/@href').extract()
+        for i, link in enumerate(chapter_href, start=1):
+            yield scrapy.Request(self.main_url + link, meta={'item': comic, 'number': i},
+                                 callback=self.parse_chapter)
+
def parse_chapter(self, response):
item = response.meta['item']
+ number = response.meta['number']
data = response.xpath('//script[@id="__NEXT_DATA__"]/text()').extract_first()
- str_exec="props.pageProps."
- comic_name = CommonUtils.parseExec(data,str_exec+"bookName")
- chapterName = CommonUtils.parseExec(data,str_exec+"chapterName")
- description = CommonUtils.parseExec(data,str_exec+"description")
- images = CommonUtils.parseExec(data,str_exec+"images")
- chapter_api_url = CommonUtils.parseExec(data,str_exec+"chapterAPIPath")
+ str_exec = "props.pageProps."
+ comic_name = CommonUtils.parseExec(data, str_exec+"bookName")
+ chapterName = CommonUtils.parseExec(data, str_exec+"chapterName")
+ description = CommonUtils.parseExec(data, str_exec+"description")
+ images = CommonUtils.parseExec(data, str_exec+"images")
+ chapter_api_url = CommonUtils.parseExec(data, str_exec+"chapterAPIPath")
item['chapter'] = chapterName
item['list_img'] = images
+ item['index'] = number
if chapter_api_url != None:
- yield scrapy.Request(url=self.main_url+chapter_api_url,meta={'item' : item}, callback=self.parse_chapter_api, errback=self.err)
+            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': item},
+                                 callback=self.parse_chapter_api)
else:
- item['list_img'] = images
yield item
-
- def parse_chapter_api(self,response,item):
- data = response.meta['item']
- print(item)
- return response
-
+
+ def parse_chapter_api(self, response):
+ item = response.meta['item']
+ item['chapter'] = CommonUtils.parseExec(response.text, "chapter.name")
+ item['list_img'] = CommonUtils.parseExec(response.text, "chapter.images")
+ yield item
+
def parse(self, response):
raise NotImplementedError
\ No newline at end of file
diff --git a/Comics/utils/CBZUtils.py b/Comics/utils/CBZUtils.py
new file mode 100644
index 0000000..3eba3ff
--- /dev/null
+++ b/Comics/utils/CBZUtils.py
@@ -0,0 +1,105 @@
+import os, shutil, time, logging
+from datetime import datetime
+from pathlib import Path
+from zipfile import ZipFile
+from Comics.settings import COMIC_INFO_XML_FILE,CBZ_EXPORT_PATH,IMAGES_STORE
+
+class CBZUtils:
+
+ @classmethod
+    def readDirsOrFiles(cls, dir, type):
+        data = []
+        for file in os.listdir(dir):
+            path = os.path.join(dir, file)
+ if type == "files" and os.path.isfile(path):
+ data.append(path)
+ if type == "dirs" and os.path.isdir(path):
+ data.append(path)
+ return data
+
+ @classmethod
+    def zip_compression(cls, source_dir=None, target_file=None, remove=True):
+        target_dir = os.path.dirname(target_file)
+        if target_dir and not os.path.exists(target_dir):
+            os.makedirs(target_dir)
+        if not os.path.exists(target_file) and source_dir is not None:
+            with ZipFile(target_file, mode='w') as zf:
+                for path, dir_names, filenames in os.walk(source_dir):
+                    path = Path(path)
+                    arc_dir = path.relative_to(source_dir)
+                    for y, filename in enumerate(filenames, start=1):
+                        print("打包中:" + str(y) + "/" + str(len(filenames)), path.joinpath(filename))
+                        zf.write(path.joinpath(filename), arc_dir.joinpath(filename))
+            logging.info(f"打包完成:{target_file}")
+
+ @classmethod
+    def packComicChapterCBZ(cls, comic, chapter, remove=True):
+        images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
+        cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
+        if os.path.exists(images_chapter_path):
+            for file in os.listdir(images_chapter_path):
+                # drop any still-scrambled images before packing
+                if file.startswith("scramble="):
+                    try:
+                        os.remove(os.path.join(images_chapter_path, file))
+                    except OSError:
+                        print(f"删除 {file} 发生错误,已跳过")
+            cls.zip_compression(images_chapter_path, cbz_chapter_path)
+            time.sleep(0.1)
+            if remove:
+                shutil.rmtree(images_chapter_path)
+            return True
+        return False
+
+ @classmethod
+    def replaceZip(cls, filepath, unpack_dir=None):
+        if not cls.compareFileDate(filepath):
+            return None
+        if unpack_dir is None:
+            unpack_dir = str(filepath).split(".")[0]
+        # scan the archive; a truncated jpg entry marks it as incomplete
+        incomplete = False
+        with ZipFile(filepath, 'r') as fz:
+            for file in fz.namelist():
+                if file.endswith(".jpg") and len(fz.read(file)) < 500:
+                    incomplete = True
+                    break
+        if incomplete and os.path.exists(filepath):
+            os.remove(filepath)
+            print(f"数据不完整,已删除:{filepath}")
+        if cls.compareFileDate(filepath):
+            os.utime(filepath)
+            print(f"已更新文件时间 {filepath}")
+        if os.path.exists(unpack_dir):
+            shutil.rmtree(unpack_dir)
+ @classmethod
+    def compareFileDate(cls, filepath):
+        # True when the file's mtime is older than the 2023-01-16 03:00 cutoff
+        if not os.path.exists(filepath):
+            return False
+        mtime = datetime.fromtimestamp(int(os.path.getmtime(filepath)))
+        file_stamp = int(f"{mtime.year}{mtime.month:0>2d}{mtime.day:0>2d}{mtime.hour:0>2d}")
+        return file_stamp < 2023011603
+
+ @classmethod
+    def zip_info(cls, path, filter=True):
+        result = None
+        try:
+            with ZipFile(path, "r") as zip_file:
+                result = zip_file.namelist()
+            if filter and COMIC_INFO_XML_FILE in result:
+                result.remove(COMIC_INFO_XML_FILE)
+        except Exception as e:
+            print(e)
+        return result
\ No newline at end of file
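A usage sketch for the packer (names illustrative): it zips images/<comic>/<chapter>/ into CBZ/<comic>/<chapter>.CBZ and, when remove=True, deletes the source directory:

    from Comics.utils.CBZUtils import CBZUtils

    # images/my_comic/ch_001/*.webp  ->  CBZ/my_comic/ch_001.CBZ
    CBZUtils.packComicChapterCBZ(comic="my_comic", chapter="ch_001", remove=False)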
diff --git a/Comics/utils/Comic.py b/Comics/utils/Comic.py
deleted file mode 100644
index 7a11e38..0000000
--- a/Comics/utils/Comic.py
+++ /dev/null
@@ -1,248 +0,0 @@
-import json,re
-from opencc import OpenCC
-from queue import Queue
-from utils.OldUtils import OldUtils
-
-class Comic:
- # ComicInfo.xml and ComicChapter.json bengin
- # value origin node dep required
- dict_chapter = [None,None,"Title","章节名",True]
- dict_comic_name = [None,None,"Series","漫画名",True]
- dict_number = [None,None,"Number","编号",True]
- dict_comic_names = [None,None,"SeriesGroup","别名",False]
- dict_dep = [None,None,"Summary","概述",True]
- dict_year = [None,None,"Year","年",False]
- dict_month = [None,None,"Month","月",False]
- dict_day = [None,None,"Day","日",False]
- dict_author = [None,None,"Writer","作者",True]
- dict_cbs = [None,None,"Publisher","出版社",False]
- dict_genre = [None,None,"Genre","流派",True]
- dict_tags = [None,None,"Tags","标签",True]
- dict_homepage = [None,None,"Web","主页",False]
- dict_page_count = [None,None,"PageCount","总页数",True]
- dict_language = [None,None,"LanguageISO","语言",True]
- dict_agerating = [None,None,"AgeRating","年龄分级",False]
- dict_pages = [None,None,"Pages","页码",True]
- CURRENT_DOWN_LINK = None
- # ComicInfo.xml and ComicChapter.json end
- dict_icon = [None,None,"Icon","图标",True]
- dict_chapter_imgs = [None,None,"ChapterImgs","图像",True]
- #主页
- dict_list_chapter = [None,None,"ListChapter","全部章节名",True]
- (update_at,current_chapter_img,file_chapter_imgs) = [None,None,None]
-
-
- #繁体中文转简体中文
- @classmethod
- def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
- #处理成符合规定的文件名
- @classmethod
- def fixFileName(cls,filename,replace=None):
- if not isinstance(filename,str): return filename
- intab = r'[?*/\|.:><]'
- str_replace = ""
- if replace != None: str_replace = replace
- filename = re.sub(intab, str_replace, filename)
- count = 1
- while True:
- str_file = filename[0-count]
- if str_file == " ": count += 1
- else:
- filename = filename[0:len(filename)+1-count]
- break
- return filename
-
- @classmethod
- def setValue(cls,value):
- if value != None: value = cls.ChineseConvert(value)
- return value
-
- @classmethod
- def setField(cls,field,value,origin=True,convert=True):
- if value != None:
- if origin:
- field[1] = value
- if convert: value = cls.ChineseConvert(value)
- field[0] = value
- return field
-
- @classmethod
- def getFieldValue(cls,field):
- if field == None: return None
- return field[0]
- @classmethod
- def setFieldOrigin(cls,filed,origin):
- filed[1] = origin
- return filed
- @classmethod
- def getFieldOrigin(cls,filed): return filed[1]
- @classmethod
- def getFieldNode(cls,filed): return filed[2]
-
- @classmethod
- def getValue(cls,field,exec=None):
- if exec != None: return cls.parseExec(field,exec=exec)
- return field
- #章节名
- @classmethod
- def setChapterName(cls,value,exec=None):
- value = cls.fixFileName(cls.parseExec(value,exec=exec))
- OldUtils.setOldChapter(value)
- cls.dict_chapter = cls.setField(cls.dict_chapter,value)
-
- @classmethod
- def getChapterName(cls): return cls.getFieldValue(cls.dict_chapter)
- @classmethod
- def getOriginChapterName(cls): return cls.getFieldOrigin(cls.dict_chapter)
-
- #漫画名
- @classmethod
- def setComicName(cls,value,exec=None):
- value = cls.fixFileName(cls.parseExec(value,exec=exec))
- OldUtils.setOldComicName(value)
- cls.dict_comic_name = cls.setField(cls.dict_comic_name,value)
-
- @classmethod
- def getComicName(cls): return cls.getFieldValue(cls.dict_comic_name)
- @classmethod
- def getOriginComicName(cls): return cls.getFieldOrigin(cls.dict_comic_name)
- #编号
- @classmethod
- def setNumber(cls,value): cls.dict_number = cls.setField(cls.dict_number,value)
- @classmethod
- def getNumber(cls): return cls.getFieldValue(cls.dict_number)
- #概述
- @classmethod
- def setDep(cls,value,exec=None):
- cls.dict_dep = cls.setField(cls.dict_dep,cls.parseExec(value,exec=exec))
- @classmethod
- def getDep(cls): return cls.getFieldValue(cls.dict_dep)
- #作者
- @classmethod
- def setAuthor(cls,value): cls.dict_author = cls.setField(cls.dict_author,value)
- @classmethod
- def getAuthor(cls): return cls.getFieldValue(cls.dict_author)
- #流派
- @classmethod
- def setGenre(cls,value): cls.dict_genre = cls.setField(cls.dict_genre,value)
- @classmethod
- def getGenre(cls): return cls.getFieldValue(cls.dict_genre)
- #语言
- @classmethod
- def setLanguage(cls,value): cls.dict_language = cls.setField(cls.dict_language,value)
- @classmethod
- def getLanguage(cls): return cls.getFieldValue(cls.dict_language)
- #年龄分级
- @classmethod
- def setAgeRating(cls,value): cls.dict_agerating = cls.setField(cls.dict_agerating,value)
- @classmethod
- def getAgeRating(cls): return cls.getFieldValue(cls.dict_agerating)
- #标签
- @classmethod
- def setTags(cls,value): cls.dict_tags = cls.setField(cls.dict_tags,value)
- @classmethod
- def getTags(cls): return cls.getFieldValue(cls.dict_tags)
- #总页数
- @classmethod
- def setPageCount(cls,value): cls.dict_page_count = cls.setField(cls.dict_page_count,value)
- @classmethod
- def getPageCount(cls): return cls.getFieldValue(cls.dict_page_count)
-
- #------------------------------------------------------------------------
- @classmethod
- def parseExec(cls,data,exec,item=True):
- if data !=None and exec != None:
- dots = str(exec).split(".")
- if not isinstance(data,dict): data = json.loads(data)
- for dot in dots:
- data = data.get(dot)
- return data
- @classmethod
- def setHomePage(cls,value): cls.dict_homepage = cls.setField(cls.dict_homepage,value)
- @classmethod
- def getHomePage(cls): return cls.getFieldValue(cls.dict_homepage)
- @classmethod
- def setIcon(cls,value): cls.dict_icon = cls.setField(cls.dict_icon,value,convert=False)
- @classmethod
- def getIcon(cls): return cls.getFieldValue(cls.dict_icon)
- @classmethod
- def setListChapter(cls,value): cls.dict_list_chapter = cls.setField(cls.dict_list_chapter,value,convert=False)
- @classmethod
- def getListChapter(cls): return cls.getFieldValue(cls.dict_list_chapter)
- @classmethod
- def getLenChapters(cls): return len(cls.getListChapter())
- @classmethod
- def setChapterImgs(cls,value,exec=None,item=None):
- cls.dict_chapter_imgs = cls.setField(cls.dict_chapter_imgs,cls.parseExec(value,exec=exec,item=item),convert=False)
- @classmethod
- def getChapterImgs(cls): return cls.getFieldValue(cls.dict_chapter_imgs)
- @classmethod
- def setUpdateAt(cls,value): cls.update_at = value
- @classmethod
- def getUpdateAt(cls): return cls.update_at
- @classmethod
- def setCurrentChapterImg(cls,value): cls.current_chapter_img = value
- @classmethod
- def getCurrentChapterImg(cls): return cls.current_chapter_img
- @classmethod
- def setChapterFilesName(cls,value): cls.file_chapter_imgs= value
- @classmethod
- def getChapterFilesName(cls): return cls.file_chapter_imgs
- @classmethod
- def setCurrentDownLink(cls,value): cls.CURRENT_DOWN_LINK = value
- @classmethod
- def getCurrentDownLink(cls): return cls.CURRENT_DOWN_LINK
-
-class ListComic:
- LIST_COMIC_QUEUE = Queue()
- (LIST_COMIC_NAME,LIST_COMIC_LINK,LIST_COMIC_UPDATEAT) = [None,None,None]
-
- @classmethod
- def setListComicsLinksUpdateAt(cls,names,links,update_at):
- if isinstance(names,list) and isinstance(links,list) and isinstance(update_at,list):
- for x in range(0,len(names)):
- cls.LIST_COMIC_QUEUE.put([names[x],links[x],update_at[x]])
- @classmethod
- def getListComicsLinksUpdateAt(cls):
- result = None
- if cls.LIST_COMIC_NAME != None and cls.LIST_COMIC_LINK != None:
- cls.setListComicsLinksUpdateAt(cls.LIST_COMIC_NAME,cls.LIST_COMIC_LINK,cls.LIST_COMIC_UPDATEAT)
- (cls.LIST_COMIC_NAME,cls.LIST_COMIC_LINK,cls.LIST_COMIC_UPDATEAT) = [None,None,None]
- if not cls.LIST_COMIC_QUEUE.empty(): result = cls.LIST_COMIC_QUEUE.get(False)
- return result
-
- @classmethod
- def addListComicChapterLink(cls,name,link,update_at):
- if name != None and link != None:
- cls.LIST_COMIC_QUEUE.put(name,link,update_at)
-
- @classmethod
- def getListValue(cls,result,type,start_add=None,result_type="list"):
- if result == None: return None
- if type == None: return result
- if result_type == "list" and type != None:
- data = []
- for x in range(0, len(result)):
- if start_add != None:
- data.append(start_add+result[x].get(type))
- else:
- data.append(result[x].get(type))
- return data
- return result
-
- @classmethod
- def setListComicName(cls,value,type=None): cls.LIST_COMIC_NAME = cls.getListValue(value,type)
- @classmethod
- def getListComicName(cls): return cls.LIST_COMIC_NAME
- @classmethod
- def setListComicChapterLink(cls,value,type=None,start_add=None): cls.LIST_COMIC_LINK = cls.getListValue(value,type,start_add)
- @classmethod
- def getListComicChapterLink(cls): return cls.LIST_COMIC_LINK
- @classmethod
- def setListComicUpdateAt(cls,value,type=None): cls.LIST_COMIC_UPDATEAT = cls.getListValue(value,type)
- @classmethod
- def getListComicUpdateAt(cls): return cls.LIST_COMIC_UPDATEAT
- @classmethod
- def getListComicChapterLink(cls): return cls.LIST_COMIC_QUEUE.get(False)
-
- #domain end....
\ No newline at end of file
diff --git a/Comics/utils/ComicInfo.py b/Comics/utils/ComicInfo.py
index 03c6755..aa5e22f 100644
--- a/Comics/utils/ComicInfo.py
+++ b/Comics/utils/ComicInfo.py
@@ -1,41 +1,14 @@
import json,os
import logging
from xml.dom.minidom import Document
-from Comics.utils.Comic import Comic
from Comics.utils.Constant import ComicPath
+from itemadapter import ItemAdapter
-class ComicInfoEntity:
- @classmethod
- def getNodes(cls):
- return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_number,Comic.dict_comic_names,
- Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author,
- Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count,
- Comic.dict_language,Comic.dict_agerating,Comic.dict_pages]
- @classmethod
- def getJsonNodes(cls):
- return [Comic.dict_chapter,Comic.dict_comic_name,Comic.dict_icon,Comic.dict_number,
- Comic.dict_comic_names,
- Comic.dict_dep,Comic.dict_year,Comic.dict_month,Comic.dict_day,Comic.dict_author,
- Comic.dict_cbs,Comic.dict_genre,Comic.dict_tags,Comic.dict_page_count,
- Comic.dict_language,Comic.dict_agerating,Comic.dict_pages,
- Comic.dict_list_chapter,Comic.dict_chapter_imgs]
-
class ComicInfo:
IS_NEW_ICON = False
document = Document()
path_comic_info = None
- @classmethod
- def parseExec(cls,data,exec,start_add=None,item=True):
- if data !=None and exec != None:
- dots = str(exec).split(".")
- if not isinstance(data,dict): data = json.loads(data)
- for dot in dots:
- data = data.get(dot)
- if start_add != None and data != None:
- data = start_add+data
- return data
-
@classmethod
def setNodeAndValue(cls,node,value):
if value != None:
@@ -50,12 +23,12 @@ class ComicInfo:
#页数
@classmethod
def setPages(cls,values=None):
- if values == None: values = Comic.getChapterFilesName()
+ #if values == None: values = Comic.getChapterFilesName()
if values != None and isinstance(values,list):
suffix = "."+str(values[0]).split(".")[-1]
join_list=",".join(values).replace(suffix,"")
values = join_list.split(",")
- Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values))
+ #Comic.setPageCount(len(values)+1 if cls.IS_NEW_ICON else len(values))
root_node = cls.document.createElement("Pages")
if cls.IS_NEW_ICON:
#添加封面
@@ -68,12 +41,12 @@ class ComicInfo:
page = page.split("_")[-1]
c_node.setAttribute("Image",page)
root_node.appendChild(c_node)
- Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False)
+ #Comic.dict_pages = Comic.setField(Comic.dict_pages,root_node,convert=False)
@classmethod
def getBaseUrl(cls,url=None):
- if url == None:
- url = Comic.getHomePage()
+ #if url == None:
+ # url = Comic.getHomePage()
(num,index) = [3,0]
for x in range(0, num):
index = str(url).find("/",index)+1
@@ -84,24 +57,30 @@ class ComicInfo:
def root_node(cls,root_value): return cls.document.createElement(root_value)
@classmethod
- def add_nodes(cls,root,list_value):
- if len(list_value) == 0: return list_value
- for value in list_value:
- #Comic.chapter
- if value[0] == None and value[4]:
- #数据为空 value[0], 但不允许为空value[4] = False
- msg = f"#数据为空 key={value[3]} value[0]={value[0]}, 但不允许为空value[4]={value[4]}"
- logger.error(msg)
- exit()
- if value[0] != None: root.appendChild(cls.setNodeAndValue(value[2],value[0]))
+    def add_nodes(cls, root, item):
+        # append one child element per populated item field
+        item = ItemAdapter(item)
+        for key, value in item.items():
+            if value is not None:
+                root.appendChild(cls.setNodeAndValue(key, value))
@classmethod
def initComicInfoXML(cls):
cls.setPages()
@classmethod
- def writeComicInfoXML(cls,overlay=False):
- save_path = ComicPath.getPathComicInfoXML()
+ def writeComicInfoXML(cls,item,overlay=False):
+ #save_path = ComicPath.getPathComicInfoXML()
+ save_path = "ComicInfo.xml"
if os.path.exists(save_path):
if overlay:
os.remove(save_path)
@@ -113,44 +92,8 @@ class ComicInfo:
root = cls.root_node("ComicInfo")
new_document = Document()
new_document.appendChild(root)
- cls.add_nodes(root,ComicInfoEntity.getNodes())
+ cls.add_nodes(root, item)
with open(save_path, "w", encoding="utf-8") as fo:
new_document.writexml(fo, indent='', addindent='\t', newl='\n', encoding="utf-8")
fo.close()
- logging.info(f"已生成文件... {save_path}")
-
- @classmethod
- def setComicInfo(cls,comicname=None,homepage=None,alias=None,author=None,icon=None,tags=None,
- dep=None,genre=None,lang=None,age_rating=None,chapters=None,current_chapter_img=None):
- author = ",".join(set(str(str(author).replace("&",",").replace(" ",",")).split(",")))
- Comic.setHomePage(homepage)
- Comic.setIcon(icon)
- Comic.setListChapter(chapters)
- #Comic.setUpdateAt(update_at)
- Comic.setComicName(str(comicname))
- #if alias != None: comicInfo.setComicNames(alias)
- Comic.setAuthor(author)
- Comic.setTags(tags)
- Comic.setDep(dep)
- #comicInfo.setCBS("韩漫")
- if genre != None: Comic.setGenre(genre)
- Comic.setLanguage(lang)
- Comic.setAgeRating(age_rating)
- Comic.setCurrentChapterImg(current_chapter_img)
-
- @classmethod
- def writeJson(cls):
- dict_data = {}
- nodes = ComicInfoEntity.getJsonNodes()
- for node in nodes:
- key = Comic.getFieldNode(node)
- value = Comic.getFieldOrigin(node)
- if isinstance(value,list):
- value = ",".join(value)
- if key != None and isinstance(value,str):
- child_dict = { key : value}
- dict_data.update(child_dict)
- s = json.dumps(dict_data,ensure_ascii=True)
- logging.debug(f"json={s}")
- with open(ComicPath.getPathConfComicChapterJson(mkdir=True),"w") as fs:
- fs.write(s)
\ No newline at end of file
+ logging.info(f"已生成文件... {save_path}")
\ No newline at end of file
diff --git a/Comics/utils/Constant.py b/Comics/utils/Constant.py
index 4530c41..f178afb 100644
--- a/Comics/utils/Constant.py
+++ b/Comics/utils/Constant.py
@@ -1,5 +1,7 @@
+import os.path
+import re
from opencc import OpenCC
-
+from Comics.settings import IMAGES_STORE
class ComicPath:
@classmethod
def getDirComicChapter(cls):
@@ -13,4 +15,24 @@ class ComicPath:
#繁体中文转简体中文
@classmethod
- def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
\ No newline at end of file
+    def chinese_convert(cls, text, convert='t2s'):
+        return OpenCC(convert).convert(str(text))
+
+    # sanitize a string into a valid file name
+    @classmethod
+    def fix_file_name(cls, filename, replace=None):
+        if not isinstance(filename, str):
+            return filename
+        in_tab = r'[?*/\|.:><]'
+        str_replace = replace if replace is not None else ""
+        filename = re.sub(in_tab, str_replace, filename)
+        # strip any trailing spaces left behind by the substitution
+        return filename.rstrip(" ")
\ No newline at end of file
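Illustrative behavior of the two helpers (doctest-style; OpenCC's t2s converts Traditional to Simplified Chinese):

    >>> ComicPath.chinese_convert("漫畫")
    '漫画'
    >>> ComicPath.fix_file_name("ch: 01?")
    'ch 01'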
diff --git a/Comics/utils/FileUtils.py b/Comics/utils/FileUtils.py
index dfc1235..4824c6b 100644
--- a/Comics/utils/FileUtils.py
+++ b/Comics/utils/FileUtils.py
@@ -2,7 +2,17 @@ import base64,hashlib,os,shutil
import math,time,json,datetime,logging
from PIL import Image
from tinydb import TinyDB, Query
-from Comics.spiders.utils.Constant import ComicPath
+from Comics.utils.Constant import ComicPath
+
+class fileUtils:
+    @classmethod
+    def save_file(cls, path, data):
+        dir = os.path.dirname(path)
+        if dir and not os.path.exists(dir):
+            os.makedirs(dir)
+        with open(path, 'w', encoding='utf-8') as fs:
+            fs.write(str(data))
class CommonUtils:
@classmethod
@@ -31,11 +41,9 @@ class imageUtils:
@classmethod
def deScrambleImagesByPath(cls,img_path,img_save=None):
if os.path.basename(img_path).startswith("scramble="):
- imageUtils.encode_scramble_image(img_path,img_save)
- return True
- else:
- return False
-
+            img_path = imageUtils.encode_scramble_image(img_path, img_save)
+        return img_path
+
@classmethod
def encodeImage(cls,str_en):
#print("en",str_en)
@@ -223,4 +231,5 @@ class imageUtils:
print("解密成功=",save_path)
if os.path.exists(imgpath):
os.remove(imgpath)
- print("remove=",imgpath)
\ No newline at end of file
+ print("remove=",imgpath)
+ return save_path
\ No newline at end of file
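save_file is what ComicsPipeline uses to dump each chapter item; note that it writes str(data), so the output is a Python repr rather than strict JSON. A tiny sketch (path illustrative):

    from Comics.utils.FileUtils import fileUtils

    # creates json/my_comic/ if missing, then writes str(data) to the file
    fileUtils.save_file("json/my_comic/ch_001.json", {"name": "my_comic"})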