update new Rouman5
parent 6638254416
commit 5f5b6cb6ec
@@ -3,6 +3,7 @@ from xml.dom import minidom
from typing import List
import json,os
from lxml import etree
from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY

# Define the ComicInfo and ComicPageInfo classes
class ComicInfo:
@@ -94,7 +95,7 @@ class ComicInfoXml:
        if remove:
            os.remove(xml_file)

    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo_2.1.xsd"):
    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
        """_summary_

        Args:
@@ -143,7 +144,7 @@ class ComicInfoXml:
        #xml_data = json_to_xml_with_declaration(json_data)
        #print(xml_data)

    def scrapy_xml_by_json(self, json_data, save_dir=None):
    def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=COMIC_INFO_XSD_FILE):
        comic = ComicInfo()
        comic.Title = json_data.get("chapter", "")
        comic.Series = json_data.get("name", "")
@@ -163,5 +164,5 @@ class ComicInfoXml:
            page.Image = image_name.split(".")[0].split("_")[-1]
            pages.append(page.Image)
            comic.Pages.append(page)
        self.parse_comicinfo(comic, save_dir=save_dir)
        self.parse_comicinfo(comic, save_dir=save_dir, xsd_filename=xsd_file)
        return pages
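The xsd_file plumbing above replaces hard-coded schema filenames with the COMIC_INFO_XSD_FILE setting. For reference, a minimal sketch of the lxml validation this feeds, assuming the bundled XSD path from settings (the validation call itself is not part of this hunk):

```python
from lxml import etree

def validate_comicinfo(xml_path="ComicInfo.xml", xsd_path="Comics/assets/ComicInfo_2.1.xsd"):
    # Parse the schema once, then check the generated document against it.
    schema = etree.XMLSchema(etree.parse(xsd_path))
    return schema.validate(etree.parse(xml_path))
```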
@@ -28,7 +28,8 @@ def _serialize_to_images(value, result_type=None):
    # suffix = "."+str(image_src).split(".")[-1]
    suffix = ".jpg"
    image_name = count_image + suffix
    if scramble:
    #if scramble:
    if scramble == "True":
        de_str = str(image_src).split("/")[-1].replace(suffix, "==")
        blocks_num = imageUtils.encodeImage(de_str)
        image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
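The scramble flag now arrives as the string "True" or "False" rather than a boolean (the rm_comic spider further down derives it from the sr: digit in each image URL), hence the literal string comparison. A tiny sketch of that round trip, with the call sites assumed:

```python
sr = "1"                                                   # digit captured from the URL's "sr:" marker
scramble = sr.replace("0", "False").replace("1", "True")   # -> the string "True"
assert scramble == "True"                                  # the serializer's check above
```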
@@ -93,11 +94,27 @@ class ComicItem(Item):
    # image names
    images_name = Field()

    domain = Field()
    # chapter link
    chapter_href = Field()
    # chapter API
    chapter_api = Field()

class BooksItem(Item):
    current_project = Field()
    names = Field()
    urls = Field()

class ImageItem(Item):
    image_url = Field()
    image_name = Field()
    image_path = Field()
    image_type = Field()
    isScramble = Field()

class Image():
    def setImage(self, url, scramble): return { "src" : url, "scramble": scramble}

# serializer: author
def serializer_info_writer(value):
    (list_value, value) = [[], str(value).replace("&", " ")]
@@ -1,8 +1,8 @@
import json,logging
import json,logging,os
from scrapy.loader import ItemLoader
from Comics.settings import PROJECT_KEY
from Comics.settings import PROJECT_KEY,IMAGES_STORE

class ComicLoader(ItemLoader):
class BaseLoader(ItemLoader):
    def parseExec(self,data,exec):
        if data !=None and exec != None:
            dots = str(exec).split(".")
@@ -68,15 +68,29 @@ class ComicLoader(ItemLoader):

    # set comic properties
    def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
        if value != None and sexec==None:
        if value != None:
            self.add_value(field_name=name, value=value)
        if xpath != None:
            self.add_xpath(field_name=name, xpath=xpath, index=index)
        if sexec != None:
            self.add_exec(field_name=name, value=value, str_exec=sexec)

    def get_output_value(self, field_name, skip_field=["chapter"]):
        value = super().get_output_value(field_name)
        try:
            if isinstance(value, list) and len(value) == 1:
                if field_name not in skip_field: value = value[0]
            else: value = "".join(value)
        except:
            print(f"get_output_value value={value} type={type(value)}")
        return value

    # project name
    def project_name(self, project_name): self.add_value(PROJECT_KEY, project_name)
    # project name
    def get_project_name(self): return self.get_output_value(PROJECT_KEY)

class ComicLoader(BaseLoader):
    # comic name
    def name(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('name', value, xpath, index, sexec)
    # comic cover link
@@ -101,7 +115,9 @@ class ComicLoader(ItemLoader):
    # image names
    def images(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('images', value, xpath, index, sexec)
    # image links
    def image_urls(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('image_urls', value, xpath, index, sexec)
    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('images', value, xpath, index, sexec)
        self.set_properties('image_urls', value, xpath, index, sexec)

    def get_output_value(self, field_name, skip_field=["chapter"]):
        value = super().get_output_value(field_name)
@@ -121,8 +137,6 @@ class ComicLoader(ItemLoader):
    def get_schapter(self): return self.get_output_value("s_chapter")
    # comic cover
    def get_icon(self): return self.get_output_value("icon")
    # project name
    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
    # chapter link
    def get_chapter_href(self): return self.get_output_value("chapter_href")
    # all chapters
@@ -143,6 +157,8 @@ class ComicLoader(ItemLoader):

    def set_chapter(self, value): self.set_properties('chapter', value=value)
    def set_schapter(self, value): self.set_properties('s_chapter', value=value)
    def set_domain(self, value): self.set_properties('domain', value=value)
    def get_domain(self): return self.get_output_value("domain")

    # chapter page count
    def count(self):
@@ -162,6 +178,26 @@ class ComicLoader(ItemLoader):
    def load_item(self, chapter=None):
        self.count()
        self.index()
        if not self.get_icon().startswith("http"): self.icon(self.get_domain()+ self.get_icon())
        if chapter != None: self.set_chapter(chapter)
        self.save_sname_schapter()
        return super().load_item()

    def set_image_item(self, image_url, image_path, image_name, image_scramble="False", image_type="Image"):
        return { "image_url" : image_url, "image_path" : image_path, "image_name" : image_name, "image_scramble" : image_scramble , "image_type" : image_type}

    # image link handling
    def parse_images(self):
        images_item = []
        icon_path = os.path.join(self.get_project_name(), "icons", self.get_name(), self.get_name()+".jpg")
        images_item.append(self.set_image_item(image_url= self.get_icon() , image_path = icon_path , image_name=self.get_name()+".jpg", image_scramble="False", image_type="Icon"))
        for url, name in zip(self.get_image_urls(), self.get_images()):
            image_path = os.path.join(self.get_project_name(), "images", self.get_name(), self.get_chapter(), name)
            images_item.append(self.set_image_item(image_url= url , image_path= image_path, image_name=name))
        return images_item

class BooksLoader(BaseLoader):

    def get_names(self): return self.get_output_value("names")

    def get_urls(self): return self.get_output_value("urls")
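The new parse_images folds the cover and the page images into a single download list, which is what lets the dedicated IconDownloadPipeline be retired in the pipelines below. An illustrative shape of its return value, with all names and URLs made up:

```python
images_item = [
    {"image_url": "https://example.com/cover.jpg",
     "image_path": "rm_comic/icons/SomeComic/SomeComic.jpg",
     "image_name": "SomeComic.jpg", "image_scramble": "False", "image_type": "Icon"},
    {"image_url": "https://example.com/ch1/0001.jpg",
     "image_path": "rm_comic/images/SomeComic/Chapter 1/0001.jpg",
     "image_name": "0001.jpg", "image_scramble": "False", "image_type": "Image"},
]
```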
@@ -7,11 +7,11 @@
# useful for handling different item types with a single interface
import os,scrapy,logging,shutil
from Comics import settings
from Comics.items import ComicItem
from Comics.items import ComicItem,ImageItem
from Comics.loader import ComicLoader
from Comics.utils import CBZUtils,fileUtils as fu
from Comics.utils import ComicPath
from Comics.utils import checkUtils,oldUtils
from Comics.utils import oldUtils
from Comics.exporters import JsonExport,ItemExporter
from scrapy.pipelines.images import ImagesPipeline
from Comics._utils.ComicInfo import ComicInfoXml
@@ -21,13 +21,11 @@ class ComicsPipeline():
    # item is the object yielded by the spider
    def process_item(self, item: ComicItem, spider):
        if isinstance(item, ComicItem):
            # 'output/rm_comic/json/壞X/第1話 壞X'
            # a comic CBZ file already exists: run the conversion
            result_item = None
            if fu.exists(ComicPath(item).PATH_CBZ()): result_item = ItemExporter().export_obj(item)
            # no comic CBZ file yet
            else: result_item = JsonExport(file=ComicPath(item).getDirJosnComicChapter()).export_json(ComicLoader(item).load_item(), if_return=True)
            #oldUtils().clean_old_files(files=result_item["chapters"], folder=ComicPath(item).file_path(result_type=ComicPath.MAPPING_CBZ_DIR), move_folder=ComicPath(item).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR))
            return result_item

class BaseImagesPipeline(ImagesPipeline):
@@ -60,41 +58,20 @@ class BaseImagesPipeline(ImagesPipeline):
        if len(fail_data) == 0 and len(results) != 0: is_success = True
        return is_success

# cover download pipeline
class IconDownloadPipeline(BaseImagesPipeline):

    # data handling
    def get_media_requests(self, item, info):
        comic = ComicLoader(item=item)
        # get the cover URL and its save path
        icon_url, icon_cache_path = [ comic.get_icon(), super().get_file_path(item, result_type="icon_cache") ]
        # cover already exists
        if fu.exists(icon_cache_path): return False
        else: yield scrapy.Request(url=icon_url, meta={'path': icon_cache_path })

    def item_completed(self, results, item, info):
        if super().success_completed(item, results):
            print(" icon download success")
            # move the cover into the Icon folder
            super().update_icon(item)
        return item

class ImgDownloadPipeline(BaseImagesPipeline):

    def get_media_requests(self, item, info):
        comic = ComicLoader(item=item)
        self.image_urls, self.images = [ comic.get_image_urls(), comic.get_images() ]
        # add the cover's download info to the download list
        # self.add_download_icon(item)
        for image_url,image in zip(self.image_urls,self.images):
            if_down, image_path = [ True, super().get_file_path(item, image)]
        images_item = comic.parse_images()
        for image_item in images_item:
            if_down = True
            image_url = image_item["image_url"]
            image_path = image_item["image_path"]
            if image_item["image_type"] == "Icon":
                image_path = super().get_file_path(item, result_type="icon_cache")
                if fu.exists(image_path): return False
            # the image (scrambled variant included) already exists
            if super().image_scramble_exits(item, image_path):
                #if image_path == self.get_file_path(item, result_type="icon_cache"):
                #    logging.info(f"icon file exists: IMAGE_STORE {image_path}")
                #else:
                if_down = False
                logging.info(f"file exists: IMAGE_STORE {image_path}")
            if if_down:
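BaseImagesPipeline's helpers (get_file_path, image_scramble_exits, update_icon, success_completed) are not part of this diff. For orientation, here is a minimal sketch of the Scrapy ImagesPipeline contract the class builds on; routing the target path through request meta and the images_item field are assumptions modeled on the requests above:

```python
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class PathMetaImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # One request per image; the caller-chosen path travels in meta.
        # item["images_item"] is an assumed field holding dicts shaped like
        # the output of ComicLoader.parse_images() above.
        for entry in item.get("images_item", []):
            yield scrapy.Request(entry["image_url"], meta={"path": entry["image_path"]})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Store each file exactly where get_media_requests decided.
        return request.meta["path"]

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) pairs.
        if results and all(success for success, _ in results):
            print("all files stored")
        return item
```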
@@ -122,10 +99,12 @@ class ImgDownloadPipeline(BaseImagesPipeline):
            item (_type_): Comic item data
            info (_type_): info
        """
        if super().success_completed(item, results): super().update_icon(item)

        cbz_path = super().get_file_path(item, result_type="cbz")
        chapter_dir = ComicPath(item=item).file_path(result_type=ComicPath().MAPPING_IMAGES_DIR)
        images_file = oldUtils().old_images(folder=chapter_dir)
        if len(images_file) != len(ComicLoader(item=item).get_image_urls()): return
        if images_file == None or len(images_file) != len(ComicLoader(item=item).get_image_urls()): return
        if fu.exists(cbz_path):
            #self.update_icon(item)
            chapter = os.path.basename(cbz_path).split(".")[0]
@@ -135,19 +114,9 @@ class ImgDownloadPipeline(BaseImagesPipeline):
            self.pack_icon(item)
        else:
            # generate the ComicInfo XML
            #comic_info = ComicInfoXmlItemExporter(dir=super().get_file_path(item=item, result_type="comic_info")).export_xml(item)
            comic_pages = ComicInfoXml().scrapy_xml_by_json(item, save_dir=super().get_file_path(item=item, result_type="images_dir"))
            #if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"),
            #                                dts_path= cbz_path,
            #                                comic_info_images= comic_info['Pages'], remove=True):
            if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"),
                                            dts_path= cbz_path,
                                            comic_info_images= comic_pages, remove=True):
                super().update_icon(item)
                self.pack_icon(item)
            # CBZ validation failed
            #else:
            #    checkUtils().export_error(item)
        #sleep_time = random.randint(3,15)
        #print(f'wait {sleep_time} seconds before the next chapter')
        #time.sleep(int(sleep_time))
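packComicChapterCBZ is project-internal and not shown in this diff. Since a .cbz is just a zip archive of the page images plus ComicInfo.xml, a hedged sketch of what the packing step amounts to (the names and the remove flag mirror the call above; the rest is assumption):

```python
import os, zipfile

def pack_cbz(src_dir, dst_path, remove=False):
    # Zip every file in the chapter folder (pages + ComicInfo.xml) into the .cbz.
    with zipfile.ZipFile(dst_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for name in sorted(os.listdir(src_dir)):
            zf.write(os.path.join(src_dir, name), arcname=name)
    if remove:
        # Mirror remove=True above: drop the loose images once they are packed.
        for name in os.listdir(src_dir):
            os.remove(os.path.join(src_dir, name))
```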
@@ -97,7 +97,7 @@ ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline' : 1,
    'Comics.pipelines.ComicsPipeline': 300,
    # 'Comics.pipelines.ImageParsePipeline': 400,
    'Comics.pipelines.IconDownloadPipeline': 400,
    # 'Comics.pipelines.IconDownloadPipeline': 400,
    'Comics.pipelines.ImgDownloadPipeline': 500,
}
@@ -131,3 +131,4 @@ CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"CBZ")
OLD_CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"Old_CBZ")
# data exporter class ordering
COMIC_INFO_XML_FILE = "ComicInfo.xml"
COMIC_INFO_XSD_FILE = "Comics/assets/ComicInfo_2.1.xsd"
@@ -1,6 +1,8 @@
import scrapy,logging,os,skip
from Comics.items import ComicItem
import scrapy,logging,os,skip,json,re
from Comics.items import ComicItem,Image
from Comics.items import BooksItem
from Comics.loader import ComicLoader
from Comics.loader import BooksLoader
from Comics.utils import ComicPath
from Comics.utils import Conf
from Comics.utils import oldUtils
@@ -9,7 +11,7 @@ class RmComicSpider(scrapy.Spider):
    name = 'rm_comic'
    allowed_domains = ['rouman5.com']
    main_url = 'https://'+allowed_domains[0]
    start_urls = main_url+'/books'
    start_urls = main_url+"/books"

    # walk the site's paginated listing
    def start_requests(self):
@@ -18,18 +20,18 @@ class RmComicSpider(scrapy.Spider):

    # collect info for multiple comics
    def books_comic(self, response):
        comics = ComicLoader(item=ComicItem(), response=response)
        books_item = Conf().books(self.name, BooksLoader(BooksItem(), response))
        # read the JSON inside //script[@id], take props.pageProps.books, and loop over it
        for book in comics.get_exec(comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0], str_exec="props.pageProps.books"):
        for book,url in zip(books_item.get_names(), books_item.get_urls()):
            # skip the comics named in the skip list
            if book['name'] not in skip.skip_comic:
                yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
            if book not in skip.skip_comic: yield scrapy.Request(url=self.main_url+"/"+url, callback=self.parse_comic)

    # fetch one comic's data
    # after collecting its chapter links, move on to the next stage
    def parse_comic(self, response):
        # initialise the Comic data: read the config file by project name and parse it automatically
        comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
        comic_item.set_domain(self.main_url)
        path_comic = comic_item.load_item()
        cbz_dir = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_CBZ_DIR)
        move_folder = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR)
@@ -44,7 +46,6 @@ class RmComicSpider(scrapy.Spider):
            # the final CBZ storage path
            cbz_path = ComicPath(item=item).PATH_CBZ()
            # check whether the Traditional/Simplified Chinese CBZ path exists
            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
            if cbz_path !=None and os.path.exists(cbz_path):
                logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
                yield item
@@ -56,22 +57,24 @@ class RmComicSpider(scrapy.Spider):
    def parse_chapter(self, response):
        # the comic item passed in via meta
        ci = ComicLoader(item=response.meta['item'], response=response)
        reuslt_json = None
        for data_json in ci.get_xpath('//script/text()'):
            if data_json.startswith('self.__next_f.push([1,"5') : reuslt_json = data_json
        # parse the fetched XPATH data again and store it back into ci (the ComicItem)
        item: ComicLoader = Conf().parse_chapter(item=ci, value=ci.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0])
        comic, chapter_api_url = [ item.load_item() ,item.get_chapter_api() ]
        if chapter_api_url is not None and len(chapter_api_url) != 0 :
            try:
                yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
            except:
                logging.warning(f"yield scrapy.Request({self.main_url} + {chapter_api_url}, meta={comic}, callback=self.parse_chapter_api)")
        # regex-match the .jpg links
        jpg_links = re.findall(r'(https?://\S+\.jpg)', reuslt_json)
        images_urls = []
        # process the extracted .jpg links
        for link in jpg_links:
            sr_value = re.search(r'sr:(\d+)', link)
            # the extracted sr: value
            if sr_value:
                sr = sr_value.group(1) # group(1) returns the first capture group, i.e. the digit part
            else:
                yield comic

    # scrambled-data API handling
    def parse_chapter_api(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        return Conf().parse_chapter_api(item=comic_item, value=response.text).load_item()

                print("No match found")
        images_urls.append(Image().setImage(url=link, scramble=sr.replace("0", "False").replace("1", "True")))
        ci.image_urls(value=images_urls)
        yield ci.load_item()

    def parse(self, response):
        raise NotImplementedError
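The reworked parse_chapter no longer round-trips through the chapter API; it pulls image URLs straight out of the Next.js flight data pushed via self.__next_f.push. A self-contained sketch of that extraction on a made-up payload (the sr:<digit> segment marking scrambled images is an assumption about the URL shape):

```python
import re

# Made-up flight-data chunk; real pages embed many, much longer push calls.
payload = ('self.__next_f.push([1,"5:...'
           'https://img.example.com/ch/sr:1/0001.jpg '
           'https://img.example.com/ch/sr:0/0002.jpg..."])')

for link in re.findall(r'(https?://\S+\.jpg)', payload):   # every .jpg URL in the chunk
    sr = re.search(r'sr:(\d+)', link)                      # scramble marker riding in the URL
    scramble = "True" if sr and sr.group(1) == "1" else "False"
    print(link, scramble)
```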
@@ -1,22 +1,29 @@
books:
  names: '//div[@class="truncate"]/text()'
  urls: '//div[@class="grid grid-cols-1 sm:grid-cols-4 md:grid-cols-6 gap-2 sm:gap-4"]//a/@href'

data:
  name: '//div[@class="col"]/h5/text()'
  icon: '//img[@class="img-thumbnail"]/@src'
  name: '//div[@class="basis-3/5 text-sm sm:text-base"]//div[@class="text-xl text-gray-900"]/text()'
  icon: '//div[@class="flex flex-row gap-3 sm:gap-4"]//div[@class="basis-2/5"]/img[@class="rounded"]/@src'
  author:
    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()'
    index: 1
  tags: '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()'
    xpath: '//div[@class="flex flex-row gap-3 sm:gap-4"]//span[@class="text-gray-800"]/text()'
    index: 0
  tags:
    xpath: '//div[@class="flex flex-row gap-3 sm:gap-4"]//span[@class="text-gray-800"]/text()'
    index: 3
  dep:
    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()'
    xpath: '//div[@class="my-2 text-gray-800 text-sm sm:text-base"]/p/text()'
    index: 1
  date:
    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()'
    xpath: '//div[@class="text-gray-500 text-sm mt-2"]/div/text()'
    index: 1
  genre:
    value: "韩漫"
  age_rating:
    value: "R18+"
  chapter_href: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href'
  chapters: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()'
  chapter_href: '//div[@class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-2 px-2 py-4"]//a/@href'
  chapters: '//div[@class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-2 px-2 py-4"]//div[@class="text truncate bg-slate-300 p-2 hover:bg-rose-100"]/text()'

parse_chapter:
  name:
@@ -1,68 +0,0 @@
import scrapy,logging,time,os
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from Comics.utils import ComicPath
from Comics.settings import PROJECT_KEY
import skip

class RmComicSpider(scrapy.Spider):
    name = 'yh_comic'
    allowed_domains = ['www.shuanglilock.com.cn']
    main_url = 'https://'+allowed_domains[0]
    start_urls = main_url+'/info'

    def start_requests(self):
        # for x in range(0,60):
        yield scrapy.Request("https://www.shuanglilock.com.cn/info/27145/", callback=self.parse_comic)

    # collect info for multiple comics
    # def books_comic(self, response):
    #     comics = ComicLoader(item=ComicItem(), response=response)
    #     data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
    #     for book in comics.get_exec(data, str_exec="props.pageProps.books"):
    #         comics.add_value('link', self.start_urls+"/"+book['id'])
    #         if book['name'] not in skip.skip_comic:
    #             yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)

    # fetch one comic's data
    # after collecting its chapter links, move on to the next stage
    def parse_comic(self, response):
        comic_item = ComicLoader(item=ComicItem(), response=response)
        comic_item.project_name(self.name)
        comic_item.name(xpath='//div[@class="comics-detail__info"]/h1[@class="comics-detail__title"]/text()')
        comic_item.icon(xpath='//div[@class="pure-u-1-1 pure-u-sm-1-3 pure-u-md-1-6"]/img/@src')
        comic_item.author(xpath='//div[@class="comics-detail__info"]/h2[@class="comics-detail__author"]/text()')
        comic_item.tags(xpath='//div[@class="tag-list"]/a[@class="tag"]/text()')
        comic_item.dep(xpath='//p[contains(@class,"comics-detail__desc")]/text()')
        #comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
        comic_item.genre(value="樱花漫画")
        #comic_item.age_rating(value="R18+")
        chapter_href = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
                                            '//a[@class="comics-chapters__item"]/@href')
        chapters = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
                                        '//a[@class="comics-chapters__item"]//span/text()')
        for chapter, link in zip(chapters, chapter_href):
            comic_item.chapters(value=chapters)
            comic_item.chapter(value=chapter)
            item = comic_item.load_item()
            cbz_path = ComicPath(item).get_file_path(result_type="cbz", convert=True)
            if os.path.exists(cbz_path):
                logging.info(f"漫画 {cbz_path} 已存在, 跳过中...")
                yield item
            else:
                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)

    # read all images of a chapter
    def parse_chapter(self, response):
        comic_item = ComicLoader(item=response.meta['item'], response=response)
        comic_item.image_urls(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
        comic_item.images(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
        comic = comic_item.load_item()
        yield comic

    def parse(self, response):
        raise NotImplementedError

    def error_parse(self, response):
        raise NotImplementedError
@@ -7,9 +7,11 @@ from opencc import OpenCC
from PIL import Image
from pathlib import Path
from zipfile import ZipFile
from Comics.settings import COMIC_INFO_XML_FILE,OUTPUT_DIR,PROJECT_KEY
from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
import yaml
from Comics.loader import BaseLoader
from Comics.loader import ComicLoader
from Comics.loader import BooksLoader
from tinydb import TinyDB, Query

# configuration class
@@ -47,7 +49,7 @@ class Conf():
        return None

    # load the parsed config data into the loader
    def comic(self, project, item: ComicLoader, child_data='data', val=None):
    def base_data(self, project, item: BaseLoader, child_data='data', val=None):
        item.project_name(project)
        data = self.get_config_value(project, child_data)
        for key, xpath_data in data.items():
@@ -59,6 +61,12 @@ class Conf():
            item.set_properties(name=key, value=value, xpath=xpath, index=index, sexec=sexec)
        return item

    def books(self, project, item: BooksLoader, child_data='books', val=None):
        return self.base_data(project, item, child_data, val)

    def comic(self, project, item: ComicLoader, child_data='data', val=None):
        return self.base_data(project, item, child_data, val)

    def parse_chapter(self,item: ComicLoader, value):
        return self.comic(item.get_project_name(), item, "parse_chapter", value)
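base_data generalizes the old comic loader so books and comic both feed from YAML (see the rm_comic config above): each field is either a bare XPath string or a mapping with optional xpath, index, and value keys. A hedged, standalone sketch of that dispatch, with the helper name made up:

```python
import yaml

def load_fields(yml_text, section):
    # Each entry is a bare XPath string, or a mapping with optional
    # 'xpath', 'index' and 'value' keys, as in the rm_comic config above.
    config = yaml.safe_load(yml_text)
    for key, spec in config[section].items():
        if isinstance(spec, dict):
            yield key, spec.get("value"), spec.get("xpath"), spec.get("index")
        else:
            yield key, None, spec, None   # bare string means: just an XPath

sample = """
data:
  name: '//h1/text()'
  author:
    xpath: '//span[@class="text-gray-800"]/text()'
    index: 0
  genre:
    value: "韩漫"
"""
for field in load_fields(sample, "data"):
    print(field)
```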
@@ -245,7 +253,7 @@ class CommonUtils:

    @classmethod
    def validate_comicinfo_xml(cls, xml_file):
        cls._validate_xml(xml_file, "ComicInfo.xsd")
        cls._validate_xml(xml_file, COMIC_INFO_XSD_FILE)

# image processing class
@@ -772,7 +780,6 @@ class ntfy:
            print("Notification sent successfully!")
        else:
            print(f"Failed to send notification. Status code: {response.status_code}")
            print(response.json())

class logger: