63 lines
2.9 KiB
Python
63 lines
2.9 KiB
Python
import scrapy,logging,time,os,skip
|
|
from Comics.items import ComicItem
|
|
from Comics.loader import ComicLoader
|
|
from Comics.spiders.baseSpider import baseSpider
|
|
from Comics.utils import Conf
|
|
|
|
class RmComicSpider(scrapy.Spider, baseSpider):
    """Spider that crawls the ``/books`` listing of ``roum12.xyz``.

    Request flow, as wired by the callbacks below:

    1. ``start_requests``    -> listing pages           -> ``books_comic``
    2. ``books_comic``       -> one request per comic   -> ``parse_comic``
    3. ``parse_comic``       -> one request per chapter -> ``parse_chapter``
    4. ``parse_chapter``     -> optional chapter API    -> ``parse_chapter_api``

    The parsing itself is delegated to project helpers (``ComicLoader``,
    ``Conf`` and ``baseSpider.parse_comic_data``) that are defined outside
    this file.
    """

    name = 'rm_comic'
    allowed_domains = ['roum12.xyz']
    main_url = 'https://' + allowed_domains[0]
    # NOTE: this is a plain string (the listing base URL), not the list that
    # Scrapy's default ``start_requests`` expects; it is only consumed by the
    # overridden ``start_requests`` and by ``books_comic`` below.
    start_urls = main_url + '/books'

    def start_requests(self):
        """Yield one request per listing page (page indexes 0 through 59)."""
        for page in range(0, 60):
            yield scrapy.Request(
                self.start_urls + "?&page=" + str(page),
                callback=self.books_comic,
            )

    def books_comic(self, response):
        """Yield a detail-page request for every comic on a listing page.

        The book list is read from ``props.pageProps.books`` inside the JSON
        payload of the ``<script id="__NEXT_DATA__">`` element.
        """
        comics = ComicLoader(item=ComicItem(), response=response)
        next_data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        for book in comics.get_exec(next_data, str_exec="props.pageProps.books"):
            # Skip comics whose name is listed in skip.skip_comic.
            if book['name'] in skip.skip_comic:
                continue
            yield scrapy.Request(
                url=self.start_urls + "/" + book['id'],
                callback=self.parse_comic,
            )

    def parse_comic(self, response):
        """Parse one comic page and fan out to its chapters.

        Each entry produced by ``parse_comic_data`` is read through
        ``.get("success")``, ``.get("link")`` and ``.get("item")``. Entries
        flagged as successful become chapter requests handled by
        ``parse_chapter``; for the others the item is yielded directly.
        """
        for comic in super().parse_comic_data(self.name, response):
            if comic.get("success"):
                # Follow the chapter link, carrying the item along in meta.
                yield scrapy.Request(
                    self.main_url + comic.get("link"),
                    meta={'item': comic.get("item")},
                    callback=self.parse_chapter,
                )
            else:
                yield comic.get("item")

    def parse_chapter(self, response):
        """Parse a chapter page (its images, per the original comment).

        Yields a request to the chapter API when the parsed loader reports
        an API URL, otherwise yields the loaded item itself. If the API
        request cannot be constructed, a warning is logged and nothing is
        yielded for this chapter.
        """
        # Continue filling the item handed over by parse_comic.
        ci = ComicLoader(item=response.meta['item'], response=response)
        # Parse the __NEXT_DATA__ JSON again and store the result in the loader.
        item: ComicLoader = Conf().parse_chapter(
            item=ci,
            value=ci.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0],
        )
        comic = item.load_item()
        chapter_api_url = item.get_chapter_api()

        if not chapter_api_url:
            yield comic
            return

        # Only constructing the Request can fail here, so it is built inside
        # a narrow try and yielded outside of it. A bare ``except:`` wrapped
        # around the ``yield`` would also intercept GeneratorExit,
        # KeyboardInterrupt and anything thrown into the suspended generator.
        try:
            request = scrapy.Request(
                self.main_url + chapter_api_url,
                meta={'item': comic},
                callback=self.parse_chapter_api,
            )
        except (TypeError, ValueError):
            logging.warning(
                f"yield scrapy.Request({self.main_url} + {chapter_api_url}, meta={comic}, callback=self.parse_chapter_api)",
                exc_info=True,
            )
            return
        yield request

    def parse_chapter_api(self, response):
        """Handle the chapter API response and return the loaded item.

        The original comment describes this step as "encrypted-data API
        handling"; the actual processing is done by
        ``Conf().parse_chapter_api`` on ``response.text``.
        """
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        return Conf().parse_chapter_api(item=comic_item, value=response.text).load_item()

    def parse(self, response):
        """Not used by this spider; every request sets an explicit callback."""
        raise NotImplementedError

    def error_parse(self, response):
        """Not implemented for this spider."""
        raise NotImplementedError