caiwx86 2024-03-31 20:07:13 +08:00
parent 3298bcc287
commit faee8328a4
2 changed files with 17 additions and 36 deletions

View File

@@ -1,28 +0,0 @@
import logging, os
from Comics.utils import Conf
from Comics.utils import ComicPath
from Comics.loader import ComicLoader
from Comics.items import ComicItem

class baseSpider:
    def parse_comic_data(self, project, response):
        data = []
        # Initialize the comic data: read the config file named after the project and let it drive the parsing automatically
        comic_item = Conf().comic(project, ComicLoader(ComicItem(), response))
        # Iterate over the chapter names and chapter links the config-driven parser injected
        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
            # Package the item data for export
            item = comic_item.load_item(chapter=chapter)
            # Resolve the final path where the CBZ will be stored
            cbz_path = ComicPath(item=item).PATH_CBZ()
            # Check whether the Traditional/Simplified Chinese CBZ path exists
            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
            if cbz_path is not None and os.path.exists(cbz_path):
                logging.info(f"Comic {cbz_path} already exists, skipping...")
                data.append({"success": False, "item": item})
            else:
                # Visit the chapter link and continue in self.parse_chapter
                # yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
                data.append({"success": True, "item": item, "link": link})
        return data
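
The deleted mixin returned plain dicts ({"success": bool, "item": ..., "link": ...}) instead of yielding requests itself, so any spider mixing it in had to translate the flag back into Scrapy requests. A minimal sketch of that consumer pattern as it existed before this commit (ExampleSpider and its domain are hypothetical; everything else follows the old RmComicSpider shown in the diff below):

    import scrapy
    from Comics.spiders.base import baseSpider

    class ExampleSpider(scrapy.Spider, baseSpider):  # hypothetical spider for illustration
        name = 'example_comic'
        main_url = 'https://example.invalid'

        def parse_comic(self, response):
            # scrapy.Spider defines no parse_comic_data, so super()
            # falls through the MRO to baseSpider.parse_comic_data.
            for comic in super().parse_comic_data(self.name, response):
                if comic.get("success"):
                    # Unvisited chapter: schedule the download request.
                    yield scrapy.Request(self.main_url + comic["link"],
                                         meta={'item': comic["item"]},
                                         callback=self.parse_chapter)
                else:
                    # CBZ already on disk: emit the item unchanged.
                    yield comic["item"]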

View File

@@ -1,10 +1,10 @@
 import scrapy,logging,time,os,skip
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.spiders.base import baseSpider
+from Comics.utils import ComicPath
 from Comics.utils import Conf
-class RmComicSpider(scrapy.Spider, baseSpider):
+class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
     allowed_domains = ['roum12.xyz']
     main_url = 'https://'+allowed_domains[0]
@@ -27,13 +27,22 @@ class RmComicSpider(scrapy.Spider, baseSpider):
     # Fetch the data for a single comic
     # Once multiple chapter links are collected, move on to the next stage
     def parse_comic(self, response):
-        for comic in super().parse_comic_data(self.name, response):
-            if comic.get("success"):
-                # Visit the chapter link and continue in self.parse_chapter
-                yield scrapy.Request(self.main_url+comic.get("link"), meta={'item': comic.get("item")}, callback=self.parse_chapter)
-            else:
-                yield comic.get("item")
+        # Initialize the comic data: read the config file named after the project and let it drive the parsing automatically
+        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
+        # Iterate over the chapter names and chapter links the config-driven parser injected
+        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
+            # Package the item data for export
+            item = comic_item.load_item(chapter=chapter)
+            # Resolve the final path where the CBZ will be stored
+            cbz_path = ComicPath(item=item).PATH_CBZ()
+            # Check whether the Traditional/Simplified Chinese CBZ path exists
+            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
+            if cbz_path is not None and os.path.exists(cbz_path):
+                logging.info(f"Comic {cbz_path} already exists, skipping...")
+                yield item
+            else:
+                # Visit the chapter link and continue in self.parse_chapter
+                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
     # Read all images in a chapter
     def parse_chapter(self, response):
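
The request above carries the item through meta, and Scrapy copies request.meta onto the response it delivers to the callback. A minimal sketch of the receiving side (the CSS selector and the image_urls field are assumptions for illustration; the project's real extraction rules are config-driven and not part of this diff):

    def parse_chapter(self, response):  # method of the spider, shown standalone
        # Scrapy exposes request.meta on the response, so the item
        # built in parse_comic() is available here.
        item = response.meta['item']
        # Hypothetical extraction; the actual selector lives in the config.
        item['image_urls'] = response.css('img::attr(src)').getall()
        yield item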