From faee8328a47b7807ac1abae8226663fc218ddef7 Mon Sep 17 00:00:00 2001
From: caiwx86
Date: Sun, 31 Mar 2024 20:07:13 +0800
Subject: [PATCH] fix

---
 Comics/spiders/base.py     | 28 ----------------------------
 Comics/spiders/rm_comic.py | 25 +++++++++++++++++--------
 2 files changed, 17 insertions(+), 36 deletions(-)
 delete mode 100644 Comics/spiders/base.py

diff --git a/Comics/spiders/base.py b/Comics/spiders/base.py
deleted file mode 100644
index 3f2b16a..0000000
--- a/Comics/spiders/base.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import logging, os
-from Comics.utils import Conf
-from Comics.utils import ComicPath
-from Comics.loader import ComicLoader
-from Comics.items import ComicItem
-
-class baseSpider:
-
-    def parse_comic_data(self, project, response):
-        data = []
-        # Initialize the Comic data: read the config file named after the project and parse it automatically
-        comic_item = Conf().comic(project, ComicLoader(ComicItem(), response))
-        # Iterate over the chapter names and chapter links the config-driven parser injected
-        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
-            # Pack up the item data for export
-            item = comic_item.load_item(chapter=chapter)
-            # Resolve the final path where the CBZ is stored
-            cbz_path = ComicPath(item=item).PATH_CBZ()
-            # Check whether the Traditional/Simplified Chinese CBZ path already exists
-            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
-            if cbz_path !=None and os.path.exists(cbz_path):
-                logging.info(f"Comic {cbz_path} already exists, skipping...")
-                data.append({ "success" : False, "item" : item})
-            else:
-                # Visit the chapter link and hand off to self.parse_chapter
-                # yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
-                data.append({ "success" : True, "item" : item, "link" : link})
-        return data
\ No newline at end of file
diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py
index 2a66df5..4ca5345 100644
--- a/Comics/spiders/rm_comic.py
+++ b/Comics/spiders/rm_comic.py
@@ -1,10 +1,10 @@
 import scrapy,logging,time,os,skip
 from Comics.items import ComicItem
 from Comics.loader import ComicLoader
-from Comics.spiders.base import baseSpider
+from Comics.utils import ComicPath
 from Comics.utils import Conf
 
-class RmComicSpider(scrapy.Spider, baseSpider):
+class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
     allowed_domains = ['roum12.xyz']
     main_url = 'https://'+allowed_domains[0]
@@ -27,13 +27,22 @@ class RmComicSpider(scrapy.Spider):
     # Fetch the data for a given comic
     # After collecting the chapter links, move on to the next stage
     def parse_comic(self, response):
-        for comic in super().parse_comic_data(self.name, response):
-            if comic.get("success"):
-                # Visit the chapter link and hand off to self.parse_chapter
-                yield scrapy.Request(self.main_url+comic.get("link"), meta={'item': comic.get("item")}, callback=self.parse_chapter)
+        # Initialize the Comic data: read the config file named after the project and parse it automatically
+        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
+        # Iterate over the chapter names and chapter links the config-driven parser injected
+        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
+            # Pack up the item data for export
+            item = comic_item.load_item(chapter=chapter)
+            # Resolve the final path where the CBZ is stored
+            cbz_path = ComicPath(item=item).PATH_CBZ()
+            # Check whether the Traditional/Simplified Chinese CBZ path already exists
+            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
+            if cbz_path !=None and os.path.exists(cbz_path):
+                logging.info(f"Comic {cbz_path} already exists, skipping...")
+                yield item
             else:
-                yield comic.get("item")
-
+                # Visit the chapter link and hand off to self.parse_chapter
+                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
 
     # Read all the images in a chapter
     def parse_chapter(self, response):
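
Note on the refactor (not part of the patch itself): the deleted baseSpider.parse_comic_data collected every chapter into a list and returned it, so the spider had to finish scanning all chapters before Scrapy could schedule a single request. Inlining the loop into parse_comic as a generator, as this patch does, lets each item or Request be yielded as soon as it is ready. If other spiders still needed the shared helper, an alternative would have been to keep the mixin but make the method itself a generator and delegate with yield from. A minimal sketch of that variant, reusing the project's own Conf/ComicLoader/ComicPath helpers; BaseSpider is an illustrative name, and the mixin assumes the concrete spider defines main_url and parse_chapter:

    import logging, os

    import scrapy
    from Comics.items import ComicItem
    from Comics.loader import ComicLoader
    from Comics.utils import ComicPath, Conf

    class BaseSpider:
        # Generator variant of the removed parse_comic_data: instead of
        # collecting {"success": ..., "item": ...} dicts in a list, yield
        # each finished item (or the Request for the next stage) as soon
        # as it is produced. Assumes the concrete spider defines main_url
        # and parse_chapter.
        def parse_comic_data(self, project, response):
            comic_item = Conf().comic(project, ComicLoader(ComicItem(), response))
            for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
                item = comic_item.load_item(chapter=chapter)
                cbz_path = ComicPath(item=item).PATH_CBZ()
                if cbz_path is not None and os.path.exists(cbz_path):
                    logging.info(f"Comic {cbz_path} already exists, skipping...")
                    yield item
                else:
                    # Visit the chapter link and hand off to the spider's parse_chapter
                    yield scrapy.Request(self.main_url + link,
                                         meta={'item': item},
                                         callback=self.parse_chapter)

With that in place, RmComicSpider(scrapy.Spider, BaseSpider) could keep a one-line callback, "def parse_comic(self, response): yield from self.parse_comic_data(self.name, response)", since Scrapy iterates over whatever iterable a callback returns.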
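
A smaller observation: the existence check now lives in two forms, the live os.path.exists test and the commented-out checkUtils().is_error variant. Folding it into one predicate would keep parse_comic short; a hypothetical helper, not part of this patch:

    import os

    from Comics.utils import ComicPath

    def cbz_already_exists(item):
        # PATH_CBZ() can evidently return None (the patched code guards
        # with "cbz_path !=None"), so check for None before touching the
        # filesystem; "is not None" is the idiomatic spelling of that guard.
        cbz_path = ComicPath(item=item).PATH_CBZ()
        return cbz_path is not None and os.path.exists(cbz_path)

The branch in parse_comic would then read "if cbz_already_exists(item): yield item", with the else branch yielding the scrapy.Request for the chapter as before.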