74 lines
3.9 KiB
Python
74 lines
3.9 KiB
Python
import scrapy,logging,time
|
|
from Comics.items import ComicItem
|
|
from Comics.loader import ComicLoader
|
|
from Comics.items import ListComicItem
|
|
|
|
class RmComicSpider(scrapy.Spider):
    """Spider for rm01.xyz comics.

    Crawl flow:
      1. ``books_comic``      — book list from the Next.js ``__NEXT_DATA__`` blob
      2. ``parse_comic``      — per-comic metadata + chapter links
      3. ``parse_chapter``    — per-chapter data; may defer to the API below
      4. ``parse_chapter_api``— encrypted-chapter API payload, yields final item
    """

    name = 'rm_comic'
    allowed_domains = ['rm01.xyz']
    main_url = 'https://rm01.xyz'
    # NOTE(review): kept as a single URL *string* (not the conventional Scrapy
    # list) because start_requests() and books_comic() concatenate onto it.
    start_urls = 'https://rm01.xyz/books'
    # Fix: the original called time.sleep(3) inside the books_comic loop,
    # which blocks the whole Twisted reactor. Let Scrapy's scheduler pace
    # requests instead.
    custom_settings = {'DOWNLOAD_DELAY': 3}

    def start_requests(self):
        """Kick off the crawl at the book-list page."""
        yield scrapy.Request(self.start_urls, callback=self.books_comic)

    def books_comic(self, response):
        """Parse the book list out of ``__NEXT_DATA__`` and schedule one
        request per comic detail page."""
        books_comic = ComicLoader(item=ListComicItem(), response=response)
        data = books_comic.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        str_exec = "props.pageProps.books"
        books = books_comic.get_exec(data, str_exec=str_exec)
        for book in books:
            books_comic.add_value('link', book['id'])
            # Pacing is handled by DOWNLOAD_DELAY in custom_settings; a
            # blocking sleep here would stall every other in-flight request.
            yield scrapy.Request(url=self.start_urls + "/" + book['id'],
                                 callback=self.parse_comic)

    # Fetch a comic's own metadata; once its chapter links are collected,
    # hand each one to the next stage.
    def parse_comic(self, response):
        """Extract comic-level fields and schedule a request per chapter,
        numbering chapters from 1 in page order."""
        comic_item = ComicLoader(item=ComicItem(), response=response)
        comic_item.add_xpath('name', '//div[@class="col"]/h5/text()')
        comic_item.add_xpath('icon', '//img[@class="img-thumbnail"]/@src')
        comic_item.add_xpath('author', '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', index=1)
        comic_item.add_xpath('tags', '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
        comic_item.add_xpath('dep', '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', index=1)
        comic_item.add_xpath('date', '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
        comic_item.add_value('genre', "韩漫")
        comic_item.add_value('age_rating', "R18+")
        chapter_href = comic_item.get_xpath('//div[contains(@class,"bookid_chapterBox")]'
                                            '//div[contains(@class,"bookid_chapter")]/a/@href')
        for i, link in enumerate(chapter_href, start=1):
            yield scrapy.Request(self.main_url + link,
                                 meta={'item': comic_item.load_item(), 'num': i},
                                 callback=self.parse_chapter)

    # Read all the images belonging to one chapter.
    def parse_chapter(self, response):
        """Merge chapter-level fields from ``__NEXT_DATA__`` into the comic
        item; if the page points at an encrypted-chapter API, follow it,
        otherwise yield the finished item directly."""
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        data = comic_item.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
        str_exec = "props.pageProps."
        comic_item.add_exec('name', data, str_exec=str_exec + "bookName")
        comic_item.add_exec('dep', data, str_exec=str_exec + "description")
        comic_item.add_value('index', response.meta['num'])
        comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName")
        # Keyword form throughout for consistency with the sibling calls
        # above (the original mixed positional and keyword styles).
        comic_item.add_exec('image_urls', data, str_exec=str_exec + "images")
        comic_item.add_exec('images', data, str_exec=str_exec + "images")
        comic = comic_item.load_item()
        chapter_api_url = comic_item.get_exec(data, str_exec=str_exec + "chapterAPIPath")
        if chapter_api_url is not None:
            # Image list lives behind an API endpoint; finish the item there.
            yield scrapy.Request(self.main_url + chapter_api_url,
                                 meta={'item': comic},
                                 callback=self.parse_chapter_api)
        else:
            yield comic

    # Handle the encrypted-data API response.
    def parse_chapter_api(self, response):
        """Extract the chapter name and image URLs from the API payload and
        yield the completed item."""
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        comic_item.add_exec('chapter', response.text, str_exec='chapter.name')
        comic_item.add_exec('image_urls', response.text, str_exec='chapter.images')
        comic_item.add_exec('images', response.text, str_exec='chapter.images')
        yield comic_item.load_item()

    def parse(self, response):
        """Unused default callback — every request names an explicit one."""
        raise NotImplementedError