import logging
import os

import scrapy

import skip
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.settings import PROJECT_KEY
from Comics.utils.Constant import ComicPath


class RmComicSpider(scrapy.Spider):
    name = 'rm_comic'
    allowed_domains = ['roum1.xyz']
    main_url = 'https://' + allowed_domains[0]
    # Single entry URL. start_requests below is overridden, so Scrapy's
    # default handling of start_urls (normally a list) is bypassed.
    start_urls = main_url + '/books'

    def start_requests(self):
        yield scrapy.Request(self.start_urls, callback=self.books_comic)

    # Collect the comic list from the /books index page.
    def books_comic(self, response):
        comics = ComicLoader(item=ComicItem(), response=response)
        # The Next.js page state is embedded as JSON in the __NEXT_DATA__ script tag.
        data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        for book in comics.get_exec(data, str_exec="props.pageProps.books"):
            comics.add_value('link', self.start_urls + "/" + book['id'])
            if book['name'] not in skip.skip_comic:
                yield scrapy.Request(url=self.start_urls + "/" + book['id'],
                                     callback=self.parse_comic)

    # Scrape one comic's metadata, then move to the next stage once the
    # chapter links have been collected.
    def parse_comic(self, response):
        comic_item = ComicLoader(item=ComicItem(), response=response)
        comic_item.add_value(PROJECT_KEY, self.name)
        comic_item.add_xpath('name', '//div[@class="col"]/h5/text()')
        comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src')
        comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
        comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
        comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
        comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
        comic_item.add_value('genre', "韩漫")  # "Korean manhwa"
        comic_item.add_value('age_rating', "R18+")
        chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
                                            '//div[contains(@class,"bookid_chapter")]/a/@href')
        chapters = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
                                        '//div[contains(@class,"bookid_chapter")]/a/text()')
        comic_item.add_value('chapters', chapters)
        for chapter, link in zip(chapters, chapter_href):
            comic_item.add_value('chapter', chapter)
            item = comic_item.load_item()
            cbz_path = ComicPath.get_file_path(item=item, result_type="cbz", convert=True)
            if os.path.exists(cbz_path):
                # The finished archive already exists; stop scheduling this comic.
                logging.info(f"Comic {cbz_path} already exists, skipping...")
                return
            yield scrapy.Request(self.main_url + link, meta={'item': item},
                                 callback=self.parse_chapter)

    # Read all image URLs under one chapter.
    def parse_chapter(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        str_exec = "props.pageProps."
        comic_item.add_exec('name', data, str_exec=str_exec + "bookName")
        comic_item.add_exec('dep', data, str_exec=str_exec + "description")
        comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName")
        comic_item.add_exec('image_urls', data, str_exec=str_exec + "images")
        comic_item.add_exec('images', data, str_exec=str_exec + "images")
        comic = comic_item.load_item()
        # Some chapters serve their data through a separate API endpoint
        # instead of embedding it in the page; follow it when present.
        chapter_api_url = comic_item.get_exec(data, str_exec=str_exec + "chapterAPIPath")
        if chapter_api_url is not None:
            yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic},
                                 callback=self.parse_chapter_api)
        else:
            yield comic

    # Handle the API that serves the encrypted chapter data.
    def parse_chapter_api(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        comic_item.add_exec('chapter', response.text, str_exec='chapter.name')
        comic_item.add_exec('image_urls', response.text, str_exec='chapter.images')
        comic_item.add_exec('images', response.text, str_exec='chapter.images')
        yield comic_item.load_item()

    # All parsing goes through the named callbacks above; the defaults are
    # intentionally left unimplemented.
    def parse(self, response):
        raise NotImplementedError

    def error_parse(self, response):
        raise NotImplementedError
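
# ---------------------------------------------------------------------------
# Illustrative sketch only. ComicLoader.get_exec/add_exec live in
# Comics.loader (not shown here); they presumably resolve dotted key paths
# such as "props.pageProps.books" against the JSON blob embedded in the
# page's __NEXT_DATA__ script tag. The hypothetical helper below demonstrates
# that mechanism; it is not the project's actual implementation.
#
#     import json
#
#     def _resolve_exec_path(raw_json, str_exec):
#         """Walk a dotted key path through a parsed JSON document."""
#         node = json.loads(raw_json)
#         for key in str_exec.split('.'):
#             if not isinstance(node, dict) or key not in node:
#                 return None  # broken path: assume get_exec yields None here
#             node = node[key]
#         return node
#
#     # e.g. _resolve_exec_path(data, "props.pageProps.books") -> list of book dicts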
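
# The project-local `skip` module imported above is assumed to expose
# `skip_comic`, a collection of comic names that books_comic filters out,
# roughly of this shape (hypothetical; the real module is not shown here):
#
#     skip_comic = {"some comic name", "another comic name"}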