import logging
import os

import scrapy
import skip

from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.utils import ComicPath, Conf


class RmComicSpider(scrapy.Spider):
    name = 'rm_comic'
    allowed_domains = ['roum12.xyz']
    main_url = 'https://' + allowed_domains[0]
    books_url = main_url + '/books'

    # Walk the site's paginated book listing
    def start_requests(self):
        for page in range(60):
            yield scrapy.Request(self.books_url + "?page=" + str(page),
                                 callback=self.books_comic)

    # Collect the comics listed on one page
    def books_comic(self, response):
        comics = ComicLoader(item=ComicItem(), response=response)
        # Pull the JSON embedded in //script[@id="__NEXT_DATA__"], extract
        # props.pageProps.books, and iterate over the resulting list
        for book in comics.get_exec(
                comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0],
                str_exec="props.pageProps.books"):
            # Skip comics whose names are on the exclusion list
            if book['name'] not in skip.skip_comic:
                yield scrapy.Request(url=self.books_url + "/" + book['id'],
                                     callback=self.parse_comic)

    # Fetch one comic's details; once the chapter links are collected,
    # hand each one off to the next stage
    def parse_comic(self, response):
        # Initialize the comic item; Conf reads the config file named after
        # this spider and parses the response automatically
        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
        # Iterate over the chapter names and chapter links that the
        # config-driven parser injected into the item
        for chapter, link in zip(comic_item.get_chapters(),
                                 comic_item.get_chapter_href()):
            # Pack the item data for export
            item = comic_item.load_item(chapter=chapter)
            # Resolve the final path where the CBZ will be stored
            # cbz_path = ComicPath(item=item).file_path(ComicPath.PATH_CBZ, convert=True, chapter=chapter)
            cbz_path = ComicPath(item=item).PATH_CBZ()
            # Check whether a CBZ already exists at the Traditional or
            # Simplified Chinese path
            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
            if cbz_path is not None and os.path.exists(cbz_path):
                logging.info(f"Comic {cbz_path} already exists, skipping...")
                yield item
            else:
                # Request the chapter link and continue in self.parse_chapter
                yield scrapy.Request(self.main_url + link,
                                     meta={'item': item},
                                     callback=self.parse_chapter)

    # Read all images under one chapter
    def parse_chapter(self, response):
        # Wrap the comic item passed along via the request meta
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        item: ComicLoader = Conf().parse_chapter(item=comic_item, value=data)
        comic = item.load_item()
        chapter_api_url = item.get_chapter_api()
        if chapter_api_url:
            try:
                yield scrapy.Request(self.main_url + chapter_api_url,
                                     meta={'item': comic},
                                     callback=self.parse_chapter_api)
            except Exception:
                logging.warning(
                    f"Failed to build chapter API request for "
                    f"{self.main_url + chapter_api_url}")
        else:
            yield comic

    # Handle the encrypted-data API response
    def parse_chapter_api(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        item: ComicLoader = Conf().parse_chapter(item=comic_item,
                                                 value=response.text)
        yield item.load_item()

    def parse(self, response):
        raise NotImplementedError

    def error_parse(self, response):
        raise NotImplementedError
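
# ---------------------------------------------------------------------------
# Note: the exclusion list used in books_comic comes from the companion
# `skip` module imported above. A minimal sketch of what it might contain
# (hypothetical; the real skip.py may differ, since the spider only tests
# membership with `in` on `skip.skip_comic`):
#
#     # skip.py
#     skip_comic = {
#         "comic name to exclude",
#     }
#
# With the project installed, the spider runs via the standard Scrapy CLI:
#
#     scrapy crawl rm_comic
# ---------------------------------------------------------------------------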