44 lines
2.4 KiB
Python
44 lines
2.4 KiB
Python
import scrapy,json
|
|
from Comics.items import ComicItem
|
|
from Comics.spiders.utils.CommonUtils import CommonUtils
|
|
|
|
class RmComicSpider(scrapy.Spider):
    """Scrape one book page on rm01.xyz and the image list of each chapter.

    Flow: ``start_requests`` fetches the book page, ``parse_comic`` fills a
    ``ComicItem`` from it and requests every chapter link found there, and
    ``parse_chapter`` stores the chapter's image list on the item and yields it.
    """

    name = 'rm_comic'
    allowed_domains = ['rm01.xyz']
    main_url = 'https://rm01.xyz'

    def start_requests(self):
        """Yield the single request for the book page handled by ``parse_comic``."""
        yield scrapy.Request(
            self.main_url + '/books/63b65185-f798-4c8f-a0b0-8811615908fd',
            callback=self.parse_comic,
        )

    @staticmethod
    def _text_at(response, query, index=1):
        """Return the ``index``-th string matched by ``query``, or None if absent."""
        matches = response.xpath(query).extract()
        return matches[index] if len(matches) > index else None

    def parse_comic(self, response):
        """Build a ``ComicItem`` from the book page and request each chapter.

        Yields one ``scrapy.Request`` per chapter link; each request carries its
        own copy of the item in ``meta['item']`` for ``parse_chapter`` to complete.
        """
        info = '//div[contains(@class,"bookid_bookInfo")]'
        chapter_anchor = (
            '//div[contains(@class,"bookid_chapterBox")]'
            '//div[contains(@class,"bookid_chapter")]/a'
        )

        comic = ComicItem()
        comic['name'] = response.xpath('//div[@class="col"]/h5/text()').extract_first()
        comic['icon'] = response.xpath('//img[@class="img-thumbnail"]/@src').extract_first()
        # These fields take the SECOND text node of their paragraph; a missing
        # node yields None instead of raising IndexError and dropping the comic.
        comic['author'] = self._text_at(response, info + '/p[1]/text()')
        comic['tags'] = response.xpath(info + '/p[3]/b/text()').extract_first()
        comic['dep'] = self._text_at(response, info + '/p[4]/text()')
        comic['date'] = self._text_at(response, info + '/p[5]/small/text()')
        comic['chapters'] = response.xpath(chapter_anchor + '/text()').extract()
        comic['chapter_href'] = response.xpath(chapter_anchor + '/@href').extract()

        for link in comic['chapter_href']:
            # Yield the Request itself so Scrapy schedules it, and hand every
            # chapter a shallow copy so callbacks do not overwrite each other's
            # 'list_img' on one shared item.
            yield scrapy.Request(
                self.main_url + link,
                meta={'item': comic.copy()},
                callback=self.parse_chapter,
                errback=self.err,
            )

    def err(self, failure=None):
        """Errback for chapter requests: log the failure passed in by Scrapy.

        ``failure`` defaults to None so existing zero-argument calls still work.
        """
        self.logger.error("Error===== %r", failure)

    def parse_chapter(self, response):
        """Attach the chapter's image list to the item from ``meta`` and yield it.

        The list is read from the page's ``__NEXT_DATA__`` script via
        ``CommonUtils.parseExec`` using the path ``props.pageProps.images``.
        """
        item = response.meta['item']
        data = response.xpath('//script[@id="__NEXT_DATA__"]/text()').extract_first()
        item['list_img'] = CommonUtils.parseExec(data, "props.pageProps." + "images")
        yield item

    def parse(self, response):
        """Unused default callback; every request names its callback explicitly."""
        raise NotImplementedError