import logging
import os
from copy import deepcopy

import scrapy

from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.utils import ComicPath
import skip  # skip.skip_comic: names of comics to exclude (used by the disabled books_comic below)

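# Crawl flow: start_requests() -> parse_comic() (metadata + chapter list)
# -> parse_chapter() (per-chapter image URLs). Chapters whose CBZ archive
# already exists on disk are yielded without re-downloading.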
class RmComicSpider(scrapy.Spider):
    name = 'yh_comic'
    allowed_domains = ['www.shuanglilock.com.cn']
    main_url = 'https://' + allowed_domains[0]
    # Scrapy convention: start_urls is a list. start_requests() below
    # overrides the default entry point, so this mainly serves as the
    # base listing URL for the (disabled) books_comic() flow.
    start_urls = [main_url + '/info']

    def start_requests(self):
        # for x in range(0, 60):
        yield scrapy.Request("https://www.shuanglilock.com.cn/info/27145/", callback=self.parse_comic)

    # Fetch the listing of multiple comics and queue one request per comic
    # (currently disabled; start_requests() targets a single comic instead).
    # def books_comic(self, response):
    #     comics = ComicLoader(item=ComicItem(), response=response)
    #     data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
    #     for book in comics.get_exec(data, str_exec="props.pageProps.books"):
    #         comics.add_value('link', self.start_urls[0] + "/" + book['id'])
    #         if book['name'] not in skip.skip_comic:
    #             yield scrapy.Request(url=self.start_urls[0] + "/" + book['id'], callback=self.parse_comic)

    # Scrape one comic's metadata, then follow every chapter link that does
    # not already have a CBZ archive on disk.
    def parse_comic(self, response):
        comic_item = ComicLoader(item=ComicItem(), response=response)
        comic_item.project_name(self.name)
        comic_item.name(xpath='//div[@class="comics-detail__info"]/h1[@class="comics-detail__title"]/text()')
        comic_item.icon(xpath='//div[@class="pure-u-1-1 pure-u-sm-1-3 pure-u-md-1-6"]/img/@src')
        comic_item.author(xpath='//div[@class="comics-detail__info"]/h2[@class="comics-detail__author"]/text()')
        comic_item.tags(xpath='//div[@class="tag-list"]/a[@class="tag"]/text()')
        comic_item.dep(xpath='//p[contains(@class,"comics-detail__desc")]/text()')
        # comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
        comic_item.genre(value="樱花漫画")
        # comic_item.age_rating(value="R18+")

        chapter_href = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
                                            '//a[@class="comics-chapters__item"]/@href')
        chapters = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
                                        '//a[@class="comics-chapters__item"]//span/text()')

        # The full chapter list is the same for every chapter, so set it once
        # instead of re-adding it on each loop iteration.
        comic_item.chapters(value=chapters)
        for chapter, link in zip(chapters, chapter_href):
            comic_item.chapter(value=chapter)  # assumed to replace, not append, the field
            # load_item() returns the loader's single underlying item, so take
            # a deep copy per chapter; otherwise every yielded item and request
            # would share (and overwrite) the same object.
            item = deepcopy(comic_item.load_item())
            cbz_path = ComicPath(item).get_file_path(result_type="cbz", convert=True)
            if os.path.exists(cbz_path):
                logging.info(f"Comic {cbz_path} already exists, skipping...")
                yield item
            else:
                # urljoin handles both absolute and site-relative hrefs.
                yield scrapy.Request(response.urljoin(link), meta={'item': item}, callback=self.parse_chapter)

    # Collect every page image of one chapter.
    def parse_chapter(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        # The pages are lazy-loaded, so the real image URL lives in @data-original.
        comic_item.image_urls(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
        # Note: with Scrapy's stock ImagesPipeline the images field is filled by
        # the pipeline itself; pre-filling it here presumably matches this
        # project's custom pipeline.
        comic_item.images(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
        comic = comic_item.load_item()
        yield comic

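    # Every Request above sets an explicit callback, so the default parse()
    # entry point (and the unused error_parse errback) stay unimplemented.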
    def parse(self, response):
        raise NotImplementedError

    def error_parse(self, response):
        raise NotImplementedError
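
# Usage: from the Scrapy project root, run `scrapy crawl yh_comic`.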