import scrapy, logging, time, os
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.utils import ComicPath
from Comics.settings import PROJECT_KEY
import skip


class RmComicSpider(scrapy.Spider):
    """Scrapy spider that crawls comic detail pages on shuanglilock.com.cn,
    emits one item per chapter, and follows chapter links to collect page images.

    Pipeline per comic: parse_comic (detail page -> per-chapter items/requests)
    -> parse_chapter (chapter page -> image URLs on the item).
    """

    name = 'yh_comic'
    allowed_domains = ['www.shuanglilock.com.cn']
    main_url = 'https://'+allowed_domains[0]
    # NOTE(review): Scrapy expects `start_urls` to be a list of URLs; here it is a
    # plain string used as a base URL (see the commented-out books_comic below).
    # Harmless only because start_requests() is overridden — confirm intent.
    start_urls = main_url+'/info'

    def start_requests(self):
        """Entry point: request a single hard-coded comic detail page."""
        # for x in range(0,60):
        yield scrapy.Request("https://www.shuanglilock.com.cn/info/27145/", callback=self.parse_comic)

    # Fetch info for multiple comics (currently disabled).
    # def books_comic(self, response):
    #     comics = ComicLoader(item=ComicItem(), response=response)
    #     data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
    #     for book in comics.get_exec(data, str_exec="props.pageProps.books"):
    #         comics.add_value('link', self.start_urls+"/"+book['id'])
    #         if book['name'] not in skip.skip_comic:
    #             yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)

    # Scrape one comic's metadata; after collecting the chapter links,
    # move on to the next stage (parse_chapter) for each chapter.
    def parse_comic(self, response):
        """Parse a comic detail page.

        Loads comic-level metadata (title, cover, author, tags, description)
        into a ComicLoader, then for every chapter either yields the item
        directly (when its .cbz archive already exists on disk) or yields a
        request for the chapter page carrying the item in request meta.
        """
        comic_item = ComicLoader(item=ComicItem(), response=response)
        comic_item.project_name(self.name)
        comic_item.name(xpath='//div[@class="comics-detail__info"]/h1[@class="comics-detail__title"]/text()')
        comic_item.icon(xpath='//div[@class="pure-u-1-1 pure-u-sm-1-3 pure-u-md-1-6"]/img/@src')
        comic_item.author(xpath='//div[@class="comics-detail__info"]/h2[@class="comics-detail__author"]/text()')
        comic_item.tags(xpath='//div[@class="tag-list"]/a[@class="tag"]/text()')
        comic_item.dep(xpath='//p[contains(@class,"comics-detail__desc")]/text()')
        #comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
        comic_item.genre(value="樱花漫画")
        #comic_item.age_rating(value="R18+")
        # Chapter link hrefs and their display titles, taken from the same
        # container so zip() below pairs them positionally — assumes both
        # XPaths yield equal-length, same-order lists; TODO confirm.
        chapter_href = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
                                            '//a[@class="comics-chapters__item"]/@href')
        chapters = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
                                        '//a[@class="comics-chapters__item"]//span/text()')
        for chapter, link in zip(chapters,
                                 chapter_href):
            # NOTE(review): the same loader instance is reused and load_item()-ed
            # on every iteration; whether .chapters()/.chapter() replace or
            # accumulate values depends on ComicLoader's semantics — verify.
            comic_item.chapters(value=chapters)
            comic_item.chapter(value=chapter)
            item = comic_item.load_item()
            cbz_path = ComicPath(item).get_file_path(result_type="cbz", convert=True)
            if os.path.exists(cbz_path):
                # Archive already on disk: emit the item as-is, skip downloading.
                logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
                yield item
            else:
                # Follow the chapter page, carrying the partially-built item along.
                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)

    # Read all images under one chapter.
    def parse_chapter(self, response):
        """Parse a chapter page: collect lazy-loaded image URLs (@data-original)
        into both image_urls and images fields of the item from request meta,
        then yield the completed item.
        """
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        comic_item.image_urls(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
        comic_item.images(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
        comic = comic_item.load_item()
        yield comic

    def parse(self, response):
        # Default Scrapy callback is deliberately unused; all requests set
        # explicit callbacks.
        raise NotImplementedError

    def error_parse(self, response):
        # Placeholder error callback — not wired to any Request errback here.
        raise NotImplementedError