caiwx86 2024-03-31 19:38:40 +08:00
parent 7499cd78c7
commit e61ab9173b
2 changed files with 40 additions and 24 deletions

View File

@@ -0,0 +1,28 @@
+import logging, os
+from Comics.utils import Conf
+from Comics.utils import ComicPath
+from Comics.loader import ComicLoader
+from Comics.items import ComicItem
+
+class baseSpider:
+    def parse_comic_data(self, project, response):
+        data = []
+        # Initialize the comic data: read the config file for the given project name and parse it automatically
+        comic_item = Conf().comic(project, ComicLoader(ComicItem(), response))
+        # Iterate over the chapter names and chapter links that the config-driven parser injected
+        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
+            # Pack the item data for export
+            item = comic_item.load_item(chapter=chapter)
+            # Resolve the final path where the CBZ will be stored
+            cbz_path = ComicPath(item=item).PATH_CBZ()
+            # Check whether the CBZ path (Traditional or Simplified Chinese) already exists
+            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
+            if cbz_path is not None and os.path.exists(cbz_path):
+                logging.info(f"Comic {cbz_path} already exists, skipping...")
+                data.append({"success": False, "item": item})
+            else:
+                # Visit the chapter link and jump to self.parse_chapter
+                # yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
+                data.append({"success": True, "item": item, "link": link})
+        return data

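The dicts that parse_comic_data returns act as a small contract between the base class and its spiders: success=True carries a chapter link still to be crawled, while success=False marks a chapter whose CBZ already exists on disk. A minimal consumer sketch under that assumption; the spider name, domain, and placeholder parse_chapter are hypothetical, and the real usage is the RmComicSpider change below:

# Sketch only, not part of this commit.
import scrapy
from Comics.spiders.baseSpider import baseSpider

class ExampleComicSpider(scrapy.Spider, baseSpider):
    name = 'example_comic'                      # hypothetical project name
    allowed_domains = ['example.com']           # hypothetical domain
    main_url = 'https://' + allowed_domains[0]

    def parse_comic(self, response):
        for comic in super().parse_comic_data(self.name, response):
            if comic.get("success"):
                # Chapter not on disk yet: follow its link
                yield scrapy.Request(self.main_url + comic.get("link"),
                                     meta={'item': comic.get("item")},
                                     callback=self.parse_chapter)
            else:
                # CBZ already exists: emit the item as-is
                yield comic.get("item")

    def parse_chapter(self, response):
        yield response.meta['item']             # placeholder downstream step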
View File

@@ -1,10 +1,10 @@
 import scrapy,logging,time,os,skip
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.utils import ComicPath
+from Comics.spiders.baseSpider import baseSpider
 from Comics.utils import Conf
-class RmComicSpider(scrapy.Spider):
+class RmComicSpider(scrapy.Spider, baseSpider):
     name = 'rm_comic'
     allowed_domains = ['roum12.xyz']
     main_url = 'https://'+allowed_domains[0]
@@ -27,32 +27,21 @@ class RmComicSpider(scrapy.Spider):
     # Fetch the data for a given comic
     # After collecting the chapter links, move on to the next stage
     def parse_comic(self, response):
-        # Initialize the comic data: read the config file for the project name and parse it automatically
-        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
-        # Iterate over the chapter names and chapter links that the config-driven parser injected
-        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
-            # Pack the item data for export
-            item = comic_item.load_item(chapter=chapter)
-            # Resolve the final path where the CBZ is stored
-            cbz_path = ComicPath(item=item).PATH_CBZ()
-            # Check whether the CBZ path (Traditional or Simplified Chinese) already exists
-            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
-            if cbz_path !=None and os.path.exists(cbz_path):
-                logging.info(f"Comic {cbz_path} already exists, skipping...")
-                yield item
-            else:
-                # Visit the chapter link and jump to self.parse_chapter
-                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
+        for comic in super().parse_comic_data(self.name, response):
+            if comic.get("success"):
+                # Visit the chapter link and jump to self.parse_chapter
+                yield scrapy.Request(self.main_url+comic.get("link"), meta={'item': comic.get("item")}, callback=self.parse_chapter)
+            else:
+                yield comic.get("item")
     # Read all the images under a chapter
     def parse_chapter(self, response):
         # Retrieve the comic item passed in
-        comic_item = ComicLoader(item=response.meta['item'], response=response)
-        data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-        item: ComicLoader = Conf().parse_chapter(item=comic_item, value=data)
-        comic = item.load_item()
-        chapter_api_url = item.get_chapter_api()
+        ci = ComicLoader(item=response.meta['item'], response=response)
+        # Parse the data fetched via XPath again and store the result back into ci (ComicItem)
+        item: ComicLoader = Conf().parse_chapter(item=ci, value=ci.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0])
+        comic, chapter_api_url = item.load_item(), item.get_chapter_api()
         if chapter_api_url is not None and len(chapter_api_url) != 0 :
             try:
                 yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
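parse_chapter feeds the raw text of the page's __NEXT_DATA__ script tag into Conf().parse_chapter. On a Next.js site that tag holds the full page payload as a JSON string, so the config-driven parser presumably decodes it along these lines; a sketch under that assumption, with key names that are not confirmed by this project:

# Sketch only, not part of this commit.
import json

def extract_next_data(raw_script_text: str) -> dict:
    """Decode the JSON payload embedded in a Next.js __NEXT_DATA__ script tag."""
    payload = json.loads(raw_script_text)
    # Next.js conventionally nests the page's data under props.pageProps;
    # the chapter fields would be mapped from somewhere inside that dict.
    return payload.get("props", {}).get("pageProps", {})

With a Scrapy response in hand, extract_next_data(response.xpath('//script[@id="__NEXT_DATA__"]/text()').get()) would return the dict the chapter fields are drawn from.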
@@ -64,8 +53,7 @@ class RmComicSpider(scrapy.Spider):
     # Handle the encrypted-data API
     def parse_chapter_api(self, response):
         comic_item = ComicLoader(item=response.meta['item'], response=response)
-        item: ComicLoader = Conf().parse_chapter_api(item=comic_item, value=response.text)
-        yield item.load_item()
+        return Conf().parse_chapter_api(item=comic_item, value=response.text).load_item()
     def parse(self, response):
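The last hunk swaps yield item.load_item() for a plain return. Both shapes are valid: a Scrapy callback may return a single item or Request, an iterable of them, or None. A standalone sketch of the two forms, with a hypothetical spider that is not part of this commit:

# Sketch only, not part of this commit.
import scrapy

class CallbackShapeDemo(scrapy.Spider):
    name = 'callback_shape_demo'                # hypothetical
    start_urls = ['https://example.com']

    def parse(self, response):
        # Generator form: yield any number of items and follow-up requests
        yield {"url": response.url}
        yield scrapy.Request(response.url, callback=self.parse_once,
                             dont_filter=True)

    def parse_once(self, response):
        # Plain-return form: a single item is also accepted by the engine
        return {"status": response.status}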