update new Rouman5

caiwx86 committed 2024-10-28 00:03:20 +08:00
parent 6638254416
commit 5f5b6cb6ec
10 changed files with 142 additions and 169 deletions

View File

@@ -3,6 +3,7 @@ from xml.dom import minidom
 from typing import List
 import json,os
 from lxml import etree
+from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
 # Define the ComicInfo and ComicPageInfo classes
 class ComicInfo:
@@ -94,7 +95,7 @@ class ComicInfoXml:
         if remove:
             os.remove(xml_file)
-    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo_2.1.xsd"):
+    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
         """_summary_
         Args:
@@ -143,7 +144,7 @@ class ComicInfoXml:
         #xml_data = json_to_xml_with_declaration(json_data)
         #print(xml_data)
-    def scrapy_xml_by_json(self, json_data, save_dir=None):
+    def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=COMIC_INFO_XSD_FILE):
         comic = ComicInfo()
         comic.Title = json_data.get("chapter", "")
         comic.Series = json_data.get("name", "")
@@ -163,5 +164,5 @@ class ComicInfoXml:
                 page.Image = image_name.split(".")[0].split("_")[-1]
                 pages.append(page.Image)
                 comic.Pages.append(page)
-        self.parse_comicinfo(comic, save_dir=save_dir)
+        self.parse_comicinfo(comic, save_dir=save_dir, xsd_filename=xsd_file)
         return pages
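Note: the xsd path threaded through here ends in schema validation (see validate_comicinfo_xml in utils below). As a reference, a standalone lxml check looks like this — a minimal sketch, with the file paths assumed rather than taken from this diff:

    from lxml import etree

    # Validate a generated ComicInfo.xml against the bundled XSD (paths assumed)
    schema = etree.XMLSchema(etree.parse("Comics/assets/ComicInfo_2.1.xsd"))
    doc = etree.parse("output/ComicInfo.xml")
    print(schema.validate(doc))   # True/False
    # schema.assertValid(doc)     # or raise etree.DocumentInvalid with details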

View File

@@ -28,7 +28,8 @@ def _serialize_to_images(value, result_type=None):
         # suffix = "."+str(image_src).split(".")[-1]
         suffix = ".jpg"
         image_name = count_image + suffix
-        if scramble:
+        #if scramble:
+        if scramble == "True":
             de_str = str(image_src).split("/")[-1].replace(suffix, "==")
             blocks_num = imageUtils.encodeImage(de_str)
             image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
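The string comparison above matters: the scramble flag now arrives as the string "True" or "False" (built by Image().setImage in the spider), and any non-empty string is truthy in Python, so the old check fired even for "False". A quick demonstration:

    scramble = "False"
    print(bool(scramble))      # True  -- the old `if scramble:` would still descramble
    print(scramble == "True")  # False -- the new check only fires for the string "True"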
@@ -46,7 +47,7 @@ def _serialize_to_images(value, result_type=None):
 def serialize_to_images(value): return _serialize_to_images(value)
 # Image URL handling helper
 def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls")
 # ComicItem
 class ComicItem(Item):
@@ -93,11 +94,27 @@ class ComicItem(Item):
     # Image names
     images_name = Field()
+    domain = Field()
     # Chapter link
     chapter_href = Field()
     # Chapter API
     chapter_api = Field()
+class BooksItem(Item):
+    current_project = Field()
+    names = Field()
+    urls = Field()
+class ImageItem(Item):
+    image_url = Field()
+    image_name = Field()
+    image_path = Field()
+    image_type = Field()
+    isScramble = Field()
+class Image():
+    def setImage(self, url, scramble): return { "src" : url, "scramble": scramble}
 # Serializer: author
 def serializer_info_writer(value):
     (list_value, value) = [[], str(value).replace("&", " ")]
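For orientation, the new item classes can be exercised like this (field values are invented for illustration):

    from Comics.items import BooksItem, ImageItem, Image

    book = BooksItem(names=["壞X"], urls=["/books/123"])
    img_meta = Image().setImage(url="https://example.com/0001.jpg", scramble="True")
    # img_meta == {"src": "https://example.com/0001.jpg", "scramble": "True"}
    img = ImageItem(image_url=img_meta["src"], image_name="0001.jpg",
                    image_path="rm_comic/images/壞X/第1話/0001.jpg",
                    image_type="Image", isScramble=img_meta["scramble"])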

View File

@@ -1,8 +1,8 @@
-import json,logging
+import json,logging,os
 from scrapy.loader import ItemLoader
-from Comics.settings import PROJECT_KEY
+from Comics.settings import PROJECT_KEY,IMAGES_STORE
-class ComicLoader(ItemLoader):
+class BaseLoader(ItemLoader):
     def parseExec(self,data,exec):
         if data !=None and exec != None:
             dots = str(exec).split(".")
@@ -53,7 +53,7 @@ class ComicLoader(ItemLoader):
     def get_exec(self, value, str_exec):
         return self.parseExec(value, str_exec)
     def add_value(self, field_name, value, *processors, re=None, **kw):
         if self.auto_replace_value(field_name, value):
             return super().add_value(field_name, value, *processors, re=re, **kw)
@@ -68,15 +68,29 @@ class ComicLoader(ItemLoader):
     # Set comic properties
     def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
-        if value != None and sexec==None:
+        if value != None:
             self.add_value(field_name=name, value=value)
         if xpath != None:
             self.add_xpath(field_name=name, xpath=xpath, index=index)
         if sexec != None:
             self.add_exec(field_name=name, value=value, str_exec=sexec)
+    def get_output_value(self, field_name, skip_field=["chapter"]):
+        value = super().get_output_value(field_name)
+        try:
+            if isinstance(value, list) and len(value) == 1:
+                if field_name not in skip_field: value = value[0]
+                else: value = "".join(value)
+        except:
+            print(f"get_output_value value={value} type={type(value)}")
+        return value
     # Project name
     def project_name(self, project_name): self.add_value(PROJECT_KEY, project_name)
+    # Project name
+    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
+class ComicLoader(BaseLoader):
     # Comic name
     def name(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('name', value, xpath, index, sexec)
     # Comic cover URL
@@ -101,7 +115,9 @@ class ComicLoader(ItemLoader):
     # Image names
     def images(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('images', value, xpath, index, sexec)
     # Image URLs
-    def image_urls(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('image_urls', value, xpath, index, sexec)
+    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
+        self.set_properties('images', value, xpath, index, sexec)
+        self.set_properties('image_urls', value, xpath, index, sexec)
     def get_output_value(self, field_name, skip_field=["chapter"]):
         value = super().get_output_value(field_name)
@@ -121,8 +137,6 @@ class ComicLoader(ItemLoader):
     def get_schapter(self): return self.get_output_value("s_chapter")
     # Comic cover
     def get_icon(self): return self.get_output_value("icon")
-    # Project name
-    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
     # Chapter link
     def get_chapter_href(self): return self.get_output_value("chapter_href")
     # All chapters
@@ -143,7 +157,9 @@ class ComicLoader(ItemLoader):
     def set_chapter(self, value): self.set_properties('chapter', value=value)
     def set_schapter(self, value): self.set_properties('s_chapter', value=value)
+    def set_domain(self, value): self.set_properties('domain', value=value)
+    def get_domain(self): return self.get_output_value("domain")
     # Chapter page count
     def count(self):
         len_images = len(self.get_images())
@@ -162,6 +178,26 @@ class ComicLoader(ItemLoader):
     def load_item(self, chapter=None):
         self.count()
         self.index()
+        if not self.get_icon().startswith("http"): self.icon(self.get_domain()+ self.get_icon())
         if chapter != None: self.set_chapter(chapter)
         self.save_sname_schapter()
         return super().load_item()
+    def set_image_item(self, image_url, image_path, image_name, image_scramble="False", image_type="Image"):
+        return { "image_url" : image_url, "image_path" : image_path, "image_name" : image_name, "image_scramble" : image_scramble , "image_type" : image_type}
+    # Image URL handling
+    def parse_images(self):
+        images_item = []
+        icon_path = os.path.join(self.get_project_name(), "icons", self.get_name(), self.get_name()+".jpg")
+        images_item.append(self.set_image_item(image_url= self.get_icon() , image_path = icon_path , image_name=self.get_name()+".jpg", image_scramble="False", image_type="Icon"))
+        for url, name in zip(self.get_image_urls(), self.get_images()):
+            image_path = os.path.join(self.get_project_name(), "images", self.get_name(), self.get_chapter(), name)
+            images_item.append(self.set_image_item(image_url= url , image_path= image_path, image_name=name))
+        return images_item
+class BooksLoader(BaseLoader):
+    def get_names(self): return self.get_output_value("names")
+    def get_urls(self): return self.get_output_value("urls")
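The unwrapping rule in the new BaseLoader.get_output_value is easiest to read in isolation; a standalone sketch of the same logic:

    def unwrap(value, field_name, skip_field=("chapter",)):
        # Single-element lists collapse to the element itself...
        if isinstance(value, list) and len(value) == 1:
            if field_name not in skip_field:
                return value[0]
            # ...except for protected fields, which are joined into one string
            return "".join(value)
        return value

    print(unwrap(["壞X"], "name"))       # 壞X
    print(unwrap(["第1話"], "chapter"))  # 第1話 (via join)
    print(unwrap(["a", "b"], "tags"))    # ['a', 'b'] -- multi-element lists pass through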

View File

@@ -7,11 +7,11 @@
 # useful for handling different item types with a single interface
 import os,scrapy,logging,shutil
 from Comics import settings
-from Comics.items import ComicItem
+from Comics.items import ComicItem,ImageItem
 from Comics.loader import ComicLoader
 from Comics.utils import CBZUtils,fileUtils as fu
 from Comics.utils import ComicPath
-from Comics.utils import checkUtils,oldUtils
+from Comics.utils import oldUtils
 from Comics.exporters import JsonExport,ItemExporter
 from scrapy.pipelines.images import ImagesPipeline
 from Comics._utils.ComicInfo import ComicInfoXml
@@ -21,13 +21,11 @@ class ComicsPipeline():
     # item is the object yielded by the spider
     def process_item(self, item: ComicItem, spider):
         if isinstance(item, ComicItem):
-            # 'output/rm_comic/json/壞X/第1話 壞X'
             # The comic's CBZ file already exists: run the conversion
             result_item = None
             if fu.exists(ComicPath(item).PATH_CBZ()): result_item = ItemExporter().export_obj(item)
             # No CBZ file for the comic yet
             else: result_item = JsonExport(file=ComicPath(item).getDirJosnComicChapter()).export_json(ComicLoader(item).load_item(), if_return=True)
-            #oldUtils().clean_old_files(files=result_item["chapters"], folder=ComicPath(item).file_path(result_type=ComicPath.MAPPING_CBZ_DIR), move_folder=ComicPath(item).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR))
             return result_item
 class BaseImagesPipeline(ImagesPipeline):
@@ -59,47 +57,26 @@ class BaseImagesPipeline(ImagesPipeline):
             if not result[0]: fail_data.append(result[1])
         if len(fail_data) == 0 and len(results) != 0: is_success = True
         return is_success
-# Cover download pipeline
-class IconDownloadPipeline(BaseImagesPipeline):
-    # Data handling
-    def get_media_requests(self, item, info):
-        comic = ComicLoader(item=item)
-        # Cover URL and cover cache path
-        icon_url, icon_cache_path = [ comic.get_icon(), super().get_file_path(item, result_type="icon_cache") ]
-        # Cover already exists
-        if fu.exists(icon_cache_path): return False
-        else: yield scrapy.Request(url=icon_url, meta={'path': icon_cache_path })
-    def item_completed(self, results, item, info):
-        if super().success_completed(item, results):
-            print(" icon download success")
-            # Move the cover into the Icon folder
-            super().update_icon(item)
-        return item
 class ImgDownloadPipeline(BaseImagesPipeline):
     def get_media_requests(self, item, info):
         comic = ComicLoader(item=item)
-        self.image_urls, self.images = [ comic.get_image_urls(), comic.get_images() ]
-        # Add the cover download info to the download list
-        # self.add_download_icon(item)
-        for image_url,image in zip(self.image_urls,self.images):
-            if_down, image_path = [ True, super().get_file_path(item, image)]
+        images_item = comic.parse_images()
+        for image_item in images_item:
+            if_down = True
+            image_url = image_item["image_url"]
+            image_path = image_item["image_path"]
+            if image_item["image_type"] == "Icon":
+                image_path = super().get_file_path(item, result_type="icon_cache")
+                if fu.exists(image_path): return False
             # Image (including scrambled images) already exists
             if super().image_scramble_exits(item, image_path):
-                #if image_path == self.get_file_path(item, result_type="icon_cache"):
-                #    logging.info(f"icon file exists: IMAGE_STORE {image_path}")
-                #else:
                 if_down = False
                 logging.info(f"file exists: IMAGE_STORE {image_path}")
             if if_down:
                 logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}")
                 yield scrapy.Request(url=image_url, meta={'path': image_path})
     # Pack the CBZ cover
     def pack_icon(self, item):
@@ -122,10 +99,12 @@ class ImgDownloadPipeline(BaseImagesPipeline):
             item (_type_): Comic item data
             info (_type_): info
         """
-        if super().success_completed(item, results):
+        super().update_icon(item)
         cbz_path = super().get_file_path(item, result_type="cbz")
         chapter_dir = ComicPath(item=item).file_path(result_type=ComicPath().MAPPING_IMAGES_DIR)
         images_file = oldUtils().old_images(folder=chapter_dir)
-        if len(images_file) != len(ComicLoader(item=item).get_image_urls()): return
+        if images_file == None or len(images_file) != len(ComicLoader(item=item).get_image_urls()): return
         if fu.exists(cbz_path):
             #self.update_icon(item)
             chapter = os.path.basename(cbz_path).split(".")[0]
@@ -135,19 +114,9 @@ class ImgDownloadPipeline(BaseImagesPipeline):
             self.pack_icon(item)
         else:
             # Generate the ComicInfo XML
-            #comic_info = ComicInfoXmlItemExporter(dir=super().get_file_path(item=item, result_type="comic_info")).export_xml(item)
             comic_pages = ComicInfoXml().scrapy_xml_by_json(item, save_dir=super().get_file_path(item=item, result_type="images_dir"))
-            #if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"),
-            #                                dts_path= cbz_path,
-            #                                comic_info_images= comic_info['Pages'], remove=True):
             if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"),
                                             dts_path= cbz_path,
                                             comic_info_images= comic_pages, remove=True):
                 super().update_icon(item)
                 self.pack_icon(item)
-            # CBZ validation failed
-            #else:
-            #    checkUtils().export_error(item)
-        #sleep_time = random.randint(3,15)
-        #print(f'等待{sleep_time}秒后进行下一章节')
-        #time.sleep(int(sleep_time))
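For reference, each entry produced by ComicLoader.parse_images and consumed above is a plain dict from set_image_item; the keys come from the diff, the values here are illustrative:

    image_item = {
        "image_url": "https://images.example.com/0001.jpg",
        "image_path": "rm_comic/images/壞X/第1話/0001.jpg",  # path under IMAGES_STORE
        "image_name": "0001.jpg",
        "image_scramble": "False",
        "image_type": "Image",  # the first entry is the cover, with image_type "Icon"
    }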

View File

@@ -97,7 +97,7 @@ ITEM_PIPELINES = {
     # 'scrapy.pipelines.images.ImagesPipeline' : 1,
     'Comics.pipelines.ComicsPipeline': 300,
     # 'Comics.pipelines.ImageParsePipeline': 400,
-    'Comics.pipelines.IconDownloadPipeline': 400,
+    # 'Comics.pipelines.IconDownloadPipeline': 400,
     'Comics.pipelines.ImgDownloadPipeline': 500,
 }
@@ -130,4 +130,5 @@ LOG_STDOUT = True # standardized output
 CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"CBZ")
 OLD_CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"Old_CBZ")
 # Data exporter classes, ordering
 COMIC_INFO_XML_FILE = "ComicInfo.xml"
+COMIC_INFO_XSD_FILE = "Comics/assets/ComicInfo_2.1.xsd"
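One caveat: COMIC_INFO_XSD_FILE is a path relative to the working directory. If the spiders are ever launched from elsewhere, anchoring it to the package would be more robust; a possible variant (not part of this commit, assuming settings.py lives in the Comics/ package):

    import os
    COMIC_INFO_XSD_FILE = os.path.join(os.path.dirname(__file__), "assets", "ComicInfo_2.1.xsd")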

View File

@@ -1,6 +1,8 @@
-import scrapy,logging,os,skip
+import scrapy,logging,os,skip,json,re
-from Comics.items import ComicItem
+from Comics.items import ComicItem,Image
+from Comics.items import BooksItem
 from Comics.loader import ComicLoader
+from Comics.loader import BooksLoader
 from Comics.utils import ComicPath
 from Comics.utils import Conf
 from Comics.utils import oldUtils
@@ -9,7 +11,7 @@ class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
     allowed_domains = ['rouman5.com']
     main_url = 'https://'+allowed_domains[0]
-    start_urls = main_url+'/books'
+    start_urls = main_url+"/books"
     # Walk the site's page listing
     def start_requests(self):
@@ -18,18 +20,18 @@ class RmComicSpider(scrapy.Spider):
     # Fetch info for multiple comics
     def books_comic(self, response):
-        comics = ComicLoader(item=ComicItem(), response=response)
+        books_item = Conf().books(self.name, BooksLoader(BooksItem(), response))
         # Read the JSON inside //script[@id] on the site, take props.pageProps.books and loop over it
-        for book in comics.get_exec(comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0], str_exec="props.pageProps.books"):
+        for book,url in zip(books_item.get_names(), books_item.get_urls()):
             # Skip excluded comic names
-            if book['name'] not in skip.skip_comic:
-                yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
+            if book not in skip.skip_comic: yield scrapy.Request(url=self.main_url+"/"+url, callback=self.parse_comic)
     # Fetch the data of a single comic
     # Move to the next stage once its chapter links are collected
     def parse_comic(self, response):
         # Initialize the comic data; the config file is located by project name and parsed automatically
         comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
+        comic_item.set_domain(self.main_url)
         path_comic = comic_item.load_item()
         cbz_dir = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_CBZ_DIR)
         move_folder = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR)
@@ -44,7 +46,6 @@ class RmComicSpider(scrapy.Spider):
         # Final CBZ target path
         cbz_path = ComicPath(item=item).PATH_CBZ()
         # Check whether the Traditional/Simplified Chinese CBZ path exists
-        # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
         if cbz_path !=None and os.path.exists(cbz_path):
             logging.info(f"Comic {cbz_path} already exists, skipping...")
             yield item
@@ -56,22 +57,24 @@ class RmComicSpider(scrapy.Spider):
     def parse_chapter(self, response):
         # The comic item passed in via meta
         ci = ComicLoader(item=response.meta['item'], response=response)
+        result_json = None
+        for data_json in ci.get_xpath('//script/text()'):
+            if data_json.startswith('self.__next_f.push([1,"5') : result_json = data_json
         # Parse the fetched data again and store it into ci (ComicItem)
-        item: ComicLoader = Conf().parse_chapter(item=ci, value=ci.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0])
-        comic, chapter_api_url = [ item.load_item() ,item.get_chapter_api() ]
-        if chapter_api_url is not None and len(chapter_api_url) != 0 :
-            try:
-                yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
-            except:
-                logging.warning(f"yield scrapy.Request({self.main_url} + {chapter_api_url}, meta={comic}, callback=self.parse_chapter_api)")
-        else:
-            yield comic
-    # Scrambled-data API handling
-    def parse_chapter_api(self, response):
-        comic_item = ComicLoader(item=response.meta['item'], response=response)
-        return Conf().parse_chapter_api(item=comic_item, value=response.text).load_item()
+        # Match the .jpg links with a regular expression
+        jpg_links = re.findall(r'(https?://\S+\.jpg)', result_json)
+        images_urls = []
+        for link in jpg_links:
+            # Extract the sr: scramble flag embedded in the link (1 = scrambled)
+            sr_value = re.search(r'sr:(\d+)', link)
+            scramble = "False"
+            if sr_value:
+                sr = sr_value.group(1) # group(1) is the first capture group, i.e. the digits
+                scramble = sr.replace("0", "False").replace("1", "True")
+            else:
+                print("No match found")
+            images_urls.append(Image().setImage(url=link, scramble=scramble))
+        ci.image_urls(value=images_urls)
+        yield ci.load_item()
     def parse(self, response):
         raise NotImplementedError
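The regex flow above can be checked against a fabricated payload. The URL layout (an sr: segment before the .jpg, links separated by whitespace) is an assumption for illustration; only the parsing calls mirror the spider:

    import re

    payload = 'self.__next_f.push([1,"5: https://images.example.com/sr:1/abc.jpg https://images.example.com/sr:0/def.jpg "])'
    for link in re.findall(r'(https?://\S+\.jpg)', payload):
        sr_value = re.search(r'sr:(\d+)', link)
        scramble = sr_value.group(1).replace("0", "False").replace("1", "True") if sr_value else "False"
        print(link, scramble)
    # https://images.example.com/sr:1/abc.jpg True
    # https://images.example.com/sr:0/def.jpg False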

View File

@@ -1,22 +1,29 @@
+books:
+  names: '//div[@class="truncate"]/text()'
+  urls: '//div[@class="grid grid-cols-1 sm:grid-cols-4 md:grid-cols-6 gap-2 sm:gap-4"]//a/@href'
 data:
-  name: '//div[@class="col"]/h5/text()'
-  icon: '//img[@class="img-thumbnail"]/@src'
+  name: '//div[@class="basis-3/5 text-sm sm:text-base"]//div[@class="text-xl text-gray-900"]/text()'
+  icon: '//div[@class="flex flex-row gap-3 sm:gap-4"]//div[@class="basis-2/5"]/img[@class="rounded"]/@src'
   author:
-    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()'
-    index: 1
+    xpath: '//div[@class="flex flex-row gap-3 sm:gap-4"]//span[@class="text-gray-800"]/text()'
+    index: 0
-  tags: '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()'
+  tags:
+    xpath: '//div[@class="flex flex-row gap-3 sm:gap-4"]//span[@class="text-gray-800"]/text()'
+    index: 3
   dep:
-    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()'
+    xpath: '//div[@class="my-2 text-gray-800 text-sm sm:text-base"]/p/text()'
     index: 1
   date:
-    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()'
+    xpath: '//div[@class="text-gray-500 text-sm mt-2"]/div/text()'
     index: 1
   genre:
     value: "韩漫"
   age_rating:
     value: "R18+"
-  chapter_href: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href'
-  chapters: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()'
+  chapter_href: '//div[@class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-2 px-2 py-4"]//a/@href'
+  chapters: '//div[@class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-2 px-2 py-4"]//div[@class="text truncate bg-slate-300 p-2 hover:bg-rose-100"]/text()'
 parse_chapter:
   name:
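Since the site moved to Tailwind-style class names, these XPaths are brittle; they can be sanity-checked offline against a saved page (the file name here is assumed):

    from lxml import html

    doc = html.parse("books_page.html")   # a saved copy of https://rouman5.com/books
    names = doc.xpath('//div[@class="truncate"]/text()')
    urls = doc.xpath('//div[@class="grid grid-cols-1 sm:grid-cols-4 md:grid-cols-6 gap-2 sm:gap-4"]//a/@href')
    print(len(names), len(urls))          # should be equal and non-zero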

View File

@@ -1,68 +0,0 @@
-import scrapy,logging,time,os
-from Comics.items import ComicItem
-from Comics.loader import ComicLoader
-from Comics.utils import ComicPath
-from Comics.settings import PROJECT_KEY
-import skip
-class RmComicSpider(scrapy.Spider):
-    name = 'yh_comic'
-    allowed_domains = ['www.shuanglilock.com.cn']
-    main_url = 'https://'+allowed_domains[0]
-    start_urls = main_url+'/info'
-    def start_requests(self):
-        # for x in range(0,60):
-        yield scrapy.Request("https://www.shuanglilock.com.cn/info/27145/", callback=self.parse_comic)
-    # Fetch info for multiple comics
-    # def books_comic(self, response):
-    #     comics = ComicLoader(item=ComicItem(), response=response)
-    #     data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-    #     for book in comics.get_exec(data, str_exec="props.pageProps.books"):
-    #         comics.add_value('link', self.start_urls+"/"+book['id'])
-    #         if book['name'] not in skip.skip_comic:
-    #             yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
-    # Fetch the data of a single comic
-    # Move to the next stage once its chapter links are collected
-    def parse_comic(self, response):
-        comic_item = ComicLoader(item=ComicItem(), response=response)
-        comic_item.project_name(self.name)
-        comic_item.name(xpath='//div[@class="comics-detail__info"]/h1[@class="comics-detail__title"]/text()')
-        comic_item.icon(xpath='//div[@class="pure-u-1-1 pure-u-sm-1-3 pure-u-md-1-6"]/img/@src')
-        comic_item.author(xpath='//div[@class="comics-detail__info"]/h2[@class="comics-detail__author"]/text()')
-        comic_item.tags(xpath='//div[@class="tag-list"]/a[@class="tag"]/text()')
-        comic_item.dep(xpath='//p[contains(@class,"comics-detail__desc")]/text()')
-        #comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
-        comic_item.genre(value="樱花漫画")
-        #comic_item.age_rating(value="R18+")
-        chapter_href = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
-                                            '//a[@class="comics-chapters__item"]/@href')
-        chapters = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
-                                        '//a[@class="comics-chapters__item"]//span/text()')
-        for chapter, link in zip(chapters, chapter_href):
-            comic_item.chapters(value=chapters)
-            comic_item.chapter(value=chapter)
-            item = comic_item.load_item()
-            cbz_path = ComicPath(item).get_file_path(result_type="cbz", convert=True)
-            if os.path.exists(cbz_path):
-                logging.info(f"Comic {cbz_path} already exists, skipping...")
-                yield item
-            else:
-                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
-    # Read all images of a chapter
-    def parse_chapter(self, response):
-        comic_item = ComicLoader(item=response.meta['item'], response=response)
-        comic_item.image_urls(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
-        comic_item.images(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
-        comic = comic_item.load_item()
-        yield comic
-    def parse(self, response):
-        raise NotImplementedError
-    def error_parse(self, response):
-        raise NotImplementedError

View File

@@ -7,9 +7,11 @@ from opencc import OpenCC
 from PIL import Image
 from pathlib import Path
 from zipfile import ZipFile
-from Comics.settings import COMIC_INFO_XML_FILE,OUTPUT_DIR,PROJECT_KEY
+from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
 import yaml
+from Comics.loader import BaseLoader
 from Comics.loader import ComicLoader
+from Comics.loader import BooksLoader
 from tinydb import TinyDB, Query
 # Configuration class
@@ -47,7 +49,7 @@ class Conf():
         return None
     # Load the parsed config data into the loader
-    def comic(self, project, item: ComicLoader, child_data='data', val=None):
+    def base_data(self, project, item: BaseLoader, child_data='data', val=None):
         item.project_name(project)
         data = self.get_config_value(project, child_data)
         for key, xpath_data in data.items():
@@ -59,6 +61,12 @@ class Conf():
             item.set_properties(name=key, value=value, xpath=xpath, index=index, sexec=sexec)
         return item
+    def books(self, project, item: BooksLoader, child_data='books', val=None):
+        return self.base_data(project, item, child_data, val)
+    def comic(self, project, item: ComicLoader, child_data='data', val=None):
+        return self.base_data(project, item, child_data, val)
     def parse_chapter(self,item: ComicLoader, value):
         return self.comic(item.get_project_name(), item, "parse_chapter", value)
@@ -245,7 +253,7 @@ class CommonUtils:
     @classmethod
     def validate_comicinfo_xml(cls, xml_file):
-        cls._validate_xml(xml_file, "ComicInfo.xsd")
+        cls._validate_xml(xml_file, COMIC_INFO_XSD_FILE)
 # Image processing class
@@ -772,7 +780,6 @@ class ntfy:
             print("Notification sent successfully!")
         else:
             print(f"Failed to send notification. Status code: {response.status_code}")
-            print(response.json())
 class logger: