# ComicScrapy/Comics/spiders/rm_comic.py
# 2023-05-15 10:45:47 +08:00
# 44 lines, 2.4 KiB, Python
import scrapy,json
from Comics.items import ComicItem
from Comics.spiders.utils.CommonUtils import CommonUtils
class RmComicSpider(scrapy.Spider):
    """Scrape one comic from rm01.xyz: book metadata, then every chapter's image list.

    Flow: start_requests -> parse_comic (book page) -> parse_chapter (one per
    chapter link), each yielding the shared ComicItem with `list_img` filled in.
    """

    name = 'rm_comic'
    allowed_domains = ['rm01.xyz']
    main_url = 'https://rm01.xyz'
    # start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']

    def start_requests(self):
        # Single hard-coded book id; the detail page is parsed by parse_comic.
        yield scrapy.Request(
            self.main_url + '/books/63b65185-f798-4c8f-a0b0-8811615908fd',
            callback=self.parse_comic,
        )

    def parse_comic(self, response):
        """Extract book-level metadata and schedule one request per chapter."""
        comic = ComicItem()
        comic['name'] = response.xpath('//div[@class="col"]/h5/text()').extract_first()
        comic['icon'] = response.xpath('//img[@class="img-thumbnail"]/@src').extract_first()
        comic['author'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[1]/text()').extract()[1]
        comic['tags'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()').extract_first()
        comic['dep'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[4]/text()').extract()[1]
        comic['date'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()').extract()[1]
        comic['chapters'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()').extract()
        comic['chapter_href'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href').extract()
        # BUG FIX: the original did `yield list_img.append(scrapy.Request(...))`.
        # list.append returns None, so the generator yielded None and Scrapy
        # never scheduled a single chapter request. Yield the Request itself.
        for link in comic['chapter_href']:
            yield scrapy.Request(
                self.main_url + link,
                meta={'item': comic},
                callback=self.parse_chapter,
                errback=self.err,
            )

    def err(self, failure):
        # BUG FIX: Scrapy calls an errback with the Failure as argument; the
        # original zero-argument signature raised TypeError whenever an error
        # actually happened. The Failure is accepted (and currently ignored).
        print("Error=====")

    def parse_chapter(self, response):
        """Pull the chapter's image URLs out of the Next.js __NEXT_DATA__ blob."""
        # NOTE(review): the same ComicItem instance is shared by all chapter
        # responses, so `list_img` is overwritten per chapter as items are
        # yielded — confirm the pipeline consumes each yield immediately.
        item = response.meta['item']
        data = response.xpath('//script[@id="__NEXT_DATA__"]/text()').extract_first()
        prefix = "props.pageProps."
        # Only the image list is consumed downstream; the original also parsed
        # bookName/chapterName/description/chapterAPIPath into unused locals.
        item['list_img'] = CommonUtils.parseExec(data, prefix + "images")
        yield item

    def parse(self, response):
        # Never used: every request carries an explicit callback.
        raise NotImplementedError