# ComicScrapy/Comics/spiders/rm_comic.py
import logging
import os
import re

import scrapy

import skip
from Comics._utils.items import BooksItem, ComicItem
from Comics._utils.loader import BooksLoader, ComicLoader
from Comics._utils.utils import ComicPath, Conf, oldUtils
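
# `skip` above is a local project module (not shown here); a minimal sketch,
# assuming it only needs to expose an iterable of excluded titles:
#
#   # skip.py (hypothetical contents)
#   skip_comic = {"Some excluded title", "Another excluded title"}
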
class RmComicSpider(scrapy.Spider):
    name = 'rm_comic'
    allowed_domains = ['rouman5.com']
    main_url = 'https://' + allowed_domains[0]
    start_url = main_url + '/books'

    # Walk the site's paginated book listing
    def start_requests(self):
        for page in range(70):
            yield scrapy.Request(f"{self.start_url}?page={page}", callback=self.books_comic)
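
    # Example listing requests generated above (first two of the 70 pages):
    #   https://rouman5.com/books?page=0
    #   https://rouman5.com/books?page=1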

    # Collect info for the comics listed on one page
    def books_comic(self, response):
        books_item = Conf().books(self.name, BooksLoader(BooksItem(), response))
        # The page embeds JSON inside //script[@id]; the loader pulls out
        # props.pageProps.books, which is looped over and parsed here
        for book, url in zip(books_item.get_names(), books_item.get_urls()):
            # Skip comics whose titles are explicitly excluded
            if book not in skip.skip_comic:
                yield scrapy.Request(url=self.main_url + '/' + url, callback=self.parse_comic)
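
    # The site is a Next.js app, so listing data ships as embedded JSON; a
    # hedged sketch of the shape BooksLoader is assumed to read (the
    # props.pageProps.books path comes from the comment above, but the book
    # fields themselves are assumptions):
    #   {"props": {"pageProps": {"books": [{"name": "...", "id": "..."}, ...]}}}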

    # Fetch the data for one comic; once the chapter links are collected,
    # hand each new chapter off to the next stage
    def parse_comic(self, response):
        # Initialize the comic item; the config file looked up by project
        # name drives the automatic field parsing
        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
        comic_item.set_domain(self.main_url)
        path_comic = comic_item.load_item()
        cbz_dir = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_CBZ_DIR)
        move_folder = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR)
        # Work out which of the chapters (names and links auto-parsed and
        # injected per the config file) are not on disk yet
        new_chapter = oldUtils().new_files(files=comic_item.get_chapters(), folder=cbz_dir)
        # Move stale chapter files out of the CBZ directory
        oldUtils().clean_old_files(files=comic_item.get_chapters(), folder=cbz_dir, move_folder=move_folder)
        for chapter, link in zip(comic_item.get_chapters(), comic_item.get_chapter_href()):
            if ComicPath.chinese_convert(ComicPath.fix_file_name(chapter)) in new_chapter:
                # Pack the item data for export
                item = comic_item.load_item(chapter=chapter)
                # Final path where the chapter's CBZ will be stored
                cbz_path = ComicPath(item=item).PATH_CBZ()
                # Check whether a CBZ already exists under the Traditional or
                # Simplified Chinese path
                if cbz_path is not None and os.path.exists(cbz_path):
                    logging.info(f"Comic {cbz_path} already exists, skipping...")
                    yield item
                else:
                    # Request the chapter link and continue in self.parse_chapter
                    yield scrapy.Request(self.main_url + link, meta={'item': item}, callback=self.parse_chapter)

    # Read all of the images under one chapter
    def parse_chapter(self, response):
        # Rehydrate the comic item passed along via request meta
        ci = ComicLoader(item=response.meta['item'], response=response)
        result_json = None
        for data_json in ci.get_xpath('//script/text()'):
            if data_json.startswith('self.__next_f.push([1,"5'):
                result_json = data_json
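        # The matched chunk is a Next.js streaming-payload call; a hedged
        # sketch of its shape (only the prefix checked above is verified by
        # this code, the rest is an assumption):
        #   self.__next_f.push([1,"5:[...\"https://.../001.jpg?...sr:1...\"]"])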
        # Parse the captured script text and store the results into ci
        # (ComicItem); bail out early if no payload matched, since the regex
        # below would otherwise be handed None
        if result_json is None:
            logging.warning(f"No image payload found on {response.url}")
            return
        # Regex-match the .jpg links inside the payload
        jpg_links = re.findall(r'(https?://\S+\.jpg)', result_json)
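        # For reference, the pattern behaves like this (example.com URL is
        # illustrative only):
        #   re.findall(r'(https?://\S+\.jpg)', 'x https://img.example.com/1.jpg y')
        #   -> ['https://img.example.com/1.jpg']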
        images_urls = []
        # Build an image item for each extracted .jpg link
        for link in jpg_links:
            # Each link is expected to carry an sr:<digits> flag marking
            # whether the image is scrambled
            sr_value = re.search(r'sr:(\d+)', link)
            if sr_value:
                sr = sr_value.group(1)  # group(1) is the first capture group, i.e. the digits
                # Map the flag onto the string the item expects: "0" -> "False", "1" -> "True"
                images_urls.append(ci.setImageItem(url=link, scramble=sr.replace("0", "False").replace("1", "True")))
            else:
                logging.warning(f"No sr: flag found in {link}")
        ci.image_urls(value=images_urls)
        yield ci.load_item()

    def parse(self, response):
        raise NotImplementedError

    def error_parse(self, response):
        raise NotImplementedError
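
# Typical invocation from the Scrapy project root (standard Scrapy CLI):
#   scrapy crawl rm_comic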