This commit is contained in:
cwx 2023-10-05 04:48:45 +08:00
parent 76c7aa69a8
commit 38deb27d7e
8 changed files with 74 additions and 37 deletions

View File

@ -1,5 +1,4 @@
import os.path,json,ast
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
@ -11,6 +10,16 @@ from Comics.utils.Constant import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter
class ItemImport():
    """Loads previously exported data back from disk."""

    def import_obj(self, file):
        """Return the raw text content of *file*, or [] when it does not exist.

        NOTE(review): the success path returns a str while the missing-file
        path returns a list — callers appear to eval/json-parse the string.
        Kept as-is for backward compatibility; confirm before unifying.
        """
        # Missing file: treat as "no data exported yet".
        if not os.path.exists(file):
            return []
        # The context manager closes the handle; the explicit close() the
        # original called inside the with-block was redundant.
        with open(file, "r", encoding="utf-8") as fs:
            return fs.read()
class CommonExporter():
def getPath(self, file , sufix=None):
sufix = "."+sufix

View File

@ -9,20 +9,14 @@ from Comics.utils.FileUtils import imageUtils
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.spiders import Spider
def current_project():
    """Name of the spider class currently in use."""
    return Spider.name
def serialize_to_chinese(value):
    """Run *value* through ComicPath's Chinese text conversion."""
    return ComicPath.chinese_convert(value)
def serialize_to_fix_file(value):
    """Chinese-convert *value*, then sanitise it into a safe file name."""
    converted = ComicPath.chinese_convert(value)
    return ComicPath.fix_file_name(converted)
def _serialize_to_images(value, result_type=None):
count = 1
images_item = []
image_urls = []
(count, images_item, image_urls) = [1,[],[]]
for image in value:
try:
(image_src, scramble) = [image.get("src"), image.get("scramble")]
@ -52,12 +46,6 @@ def serialize_to_images(value): return _serialize_to_images(value)
def serialize_to_image_urls(value):
    """Serialize *value* via the shared image helper, keeping only the URLs."""
    return _serialize_to_images(value, result_type="image_urls")
class ListComicItem(Item):
    # Minimal item scraped from comic listing pages:
    # display name plus the link to the comic's detail page.
    name = Field()
    link = Field()
class ComicItem(Item):
# 工程
current_project = Field()
@ -103,8 +91,7 @@ class ImagesItem(Item):
comic = Field()
def serializer_info_writer(value):
    """Serialize *value* into a comma-separated list of unique tokens.

    '&' separators are normalised to spaces first, then tokens are
    de-duplicated. Output is sorted so it is stable across runs — the
    original iterated a bare set, whose order changes with Python's
    hash randomisation.
    """
    tokens = set(str(value).replace("&", " ").split(" "))
    return ",".join(sorted(tokens))
@ -139,11 +126,9 @@ def serializer_info_images(value): return _serialize_info_images(value)
def serializer_info_images_count(value): return _serialize_info_images(value, "count")
def serializer_info_images_completed(value):
    """Serialize completed image names via the shared info-image helper."""
    return _serialize_info_images(value, result_type='name')
def serializer_info_images_count(value):
    """Serialize the image count via the shared info-image helper."""
    return _serialize_info_images(value, result_type='len')
class ComicInfoItem(Item):

View File

@ -11,10 +11,11 @@ from Comics.settings import CBZ_EXPORT_PATH,OUTPUT_DIR,PROJECT_KEY
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter,JsonExport,ItemExporter
from Comics.exporters import ComicInfoXmlItemExporter,JsonExport,ItemExporter, ItemImport
from Comics.utils.FileUtils import CBZUtils
from Comics.utils.FileUtils import fileUtils as fu
from Comics.loader import ComicEntity
from Comics.utils.ComicUtils import checkUtils
class ComicsPipeline:
def open_spider(self, spider):
@ -22,6 +23,9 @@ class ComicsPipeline:
# `item` here is the object yielded by the spider
def process_item(self, item, spider):
checkUtils().export_error(item)
if isinstance(item, ComicItem):
# 'output/rm_comic/json/壞X/第1話 壞X'
if os.path.exists(ComicPath.CBZ(item=item)):
@ -112,7 +116,9 @@ class ImgDownloadPipeline(ImagesPipeline):
comic_info_images= comic_info['Pages'], remove=True):
self.update_icon(item)
self.pack_icon(item)
# CBZ validation failed
else:
checkUtils().export_error(item)
#sleep_time = random.randint(3,15)
#print(f'等待{sleep_time}秒后进行下一章节')
#time.sleep(int(sleep_time))

View File

@ -1,10 +1,8 @@
import scrapy,logging,time,os
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.items import ListComicItem
from Comics.utils.Constant import ComicPath
from Comics.settings import PROJECT_KEY
import skip
from Comics.utils.ComicUtils import checkUtils
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
@ -47,11 +45,12 @@ class RmComicSpider(scrapy.Spider):
comic_item.chapter(value=chapter)
item = comic_item.load_item()
cbz_path = ComicPath.get_file_path(item=item, result_type="cbz", convert=True)
if os.path.exists(cbz_path):
logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
yield item
else:
yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
if not checkUtils().is_error(item):
if os.path.exists(cbz_path):
logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
yield item
else:
yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
# Read all images under a given chapter

View File

@ -1,7 +1,6 @@
import scrapy,logging,time,os
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.items import ListComicItem
from Comics.utils.Constant import ComicPath
from Comics.settings import PROJECT_KEY
import skip

View File

@ -0,0 +1,32 @@
import os,json
from Comics.settings import CBZ_EXPORT_PATH,OUTPUT_DIR,PROJECT_KEY
from Comics.utils.Constant import ComicPath
from Comics.exporters import ComicInfoXmlItemExporter,JsonExport,ItemExporter, ItemImport
from Comics.utils.FileUtils import fileUtils as fu
from Comics.loader import ComicEntity
class checkUtils:
    """Tracks chapters that failed to export, persisted per project."""

    def _error_file(self, item):
        # Per-project error log: <OUTPUT_DIR>/<project>/error_comics.json
        return os.path.join(OUTPUT_DIR, item[PROJECT_KEY][0], "error_comics.json")

    def _load_errors(self, item):
        # ItemImport returns the raw file text (or [] when the file is
        # missing). The log is written with json.dumps, so parse it with
        # json.loads — the original used eval() on file contents, which is
        # unsafe and unnecessary here.
        raw = ItemImport().import_obj(self._error_file(item))
        if isinstance(raw, list):
            return raw
        try:
            data = json.loads(raw)
        except (ValueError, TypeError):
            return []
        return data if isinstance(data, list) else []

    def export_error(self, item):
        """Append a failure record for *item*'s chapter to the error log.

        Fixed: the original appended directly to import_obj()'s return value,
        which is a str once the log file exists (AttributeError).
        """
        file = self._error_file(item)
        error_comic = self._load_errors(item)
        error_comic.append({"name": item['name'], "chapter": item['chapter'], "date": ComicPath().getYearMonthDay()})
        fu.save_file(file, json.dumps(error_comic))

    def is_error(self, item):
        """Return True when *item*'s name+chapter already has a failure record.

        Fixed: the original had `return False` in an else-branch inside the
        loop, so only the FIRST logged record was ever compared.
        """
        try:
            for record in self._load_errors(item):
                if item['name'] == record['name'] and item['chapter'] == record['chapter']:
                    return True
        except Exception:
            # Corrupt/unexpected log entry: treat as "no recorded error",
            # matching the original's blanket fallback.
            return False
        return False

View File

@ -1,9 +1,16 @@
import os.path,logging
import re,requests,time
from datetime import date
from Comics import settings
from opencc import OpenCC
class ComicPath:
    """Helpers around comic path/name handling (class continues below)."""

    PREFIX_SCRAMBLE = "scramble="

    @classmethod
    def getYearMonthDay(cls):
        """Today's date as a compact YYYYMMDD string (e.g. '20231005')."""
        # Format as year-month-day with no separators.
        return date.today().strftime("%Y%m%d")
@classmethod
def getDirComicChapter(cls):

View File

@ -10,9 +10,9 @@ from Comics.utils.Constant import ntfy
class fileUtils:
    """File-system helpers (class may continue with further methods)."""

    @classmethod
    def save_file(cls, path, data):
        """Write str(data) to *path*, creating parent directories as needed."""
        root_dir = os.path.dirname(path)
        # exist_ok avoids the check-then-create race of the original
        # exists()+makedirs() pair; the guard also handles bare filenames,
        # where dirname() is "" and makedirs("") would raise.
        if root_dir:
            os.makedirs(root_dir, exist_ok=True)
        # The context manager closes the file — the original's trailing
        # explicit close() was redundant.
        with open(path, 'w', encoding='utf-8') as fs:
            fs.write(str(data))