diff --git a/Comics/_utils/ComicInfo.py b/Comics/_utils/ComicInfo.py
index 44d4f63..d57056a 100644
--- a/Comics/_utils/ComicInfo.py
+++ b/Comics/_utils/ComicInfo.py
@@ -3,6 +3,7 @@ from xml.dom import minidom
 from typing import List
 import json,os
 from lxml import etree
+from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
 
 # Define the ComicInfo and ComicPageInfo classes
 class ComicInfo:
@@ -94,7 +95,7 @@ class ComicInfoXml:
         if remove:
             os.remove(xml_file)
 
-    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo_2.1.xsd"):
+    def parse_comicinfo(self, comic: ComicInfo, save_dir=None, xml_filename="ComicInfo.xml", xsd_filename="ComicInfo.xsd"):
         """_summary_
 
         Args:
@@ -143,7 +144,7 @@ class ComicInfoXml:
         #xml_data = json_to_xml_with_declaration(json_data)
         #print(xml_data)
 
-    def scrapy_xml_by_json(self, json_data, save_dir=None):
+    def scrapy_xml_by_json(self, json_data, save_dir=None, xsd_file=COMIC_INFO_XSD_FILE):
         comic = ComicInfo()
         comic.Title = json_data.get("chapter", "")
         comic.Series = json_data.get("name", "")
@@ -163,5 +164,5 @@ class ComicInfoXml:
                 page.Image = image_name.split(".")[0].split("_")[-1]
                 pages.append(page.Image)
                 comic.Pages.append(page)
-        self.parse_comicinfo(comic, save_dir=save_dir)
+        self.parse_comicinfo(comic, save_dir=save_dir, xsd_filename=xsd_file)
         return pages
\ No newline at end of file
diff --git a/ComicInfo_2.1.xsd b/Comics/assets/ComicInfo_2.1.xsd
similarity index 100%
rename from ComicInfo_2.1.xsd
rename to Comics/assets/ComicInfo_2.1.xsd
diff --git a/Comics/items.py b/Comics/items.py
index 59b3fa9..aa0fe52 100644
--- a/Comics/items.py
+++ b/Comics/items.py
@@ -28,7 +28,8 @@ def _serialize_to_images(value, result_type=None):
         # suffix = "."+str(image_src).split(".")[-1]
         suffix = ".jpg"
         image_name = count_image + suffix
-        if scramble:
+        #if scramble:
+        if scramble == "True":
             de_str = str(image_src).split("/")[-1].replace(suffix, "==")
             blocks_num = imageUtils.encodeImage(de_str)
             image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
@@ -46,7 +47,7 @@ def _serialize_to_images(value, result_type=None):
 def serialize_to_images(value): return _serialize_to_images(value)
 # Image URL handling
-def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls") 
+def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls")
 
 # ComicItem
 class ComicItem(Item):
@@ -93,11 +94,27 @@ class ComicItem(Item):
     # Image names
     images_name = Field()
 
+    domain = Field()
     # Chapter link
     chapter_href = Field()
     # Chapter API
     chapter_api = Field()
+
+class BooksItem(Item):
+    current_project = Field()
+    names = Field()
+    urls = Field()
+class ImageItem(Item):
+    image_url = Field()
+    image_name = Field()
+    image_path = Field()
+    image_type = Field()
+    isScramble = Field()
+
+class Image():
+    def setImage(self, url, scramble): return { "src" : url, "scramble": scramble}
+
 # Serializer: author
 def serializer_info_writer(value):
     (list_value, value) = [[], str(value).replace("&", " ")]
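A note on the `if scramble == "True":` change in `_serialize_to_images`: the scramble flag travels through the item as a string, so the old truthiness test fired even for `"False"`. A minimal standalone sketch of the pitfall (the `is_scrambled` helper is hypothetical, not part of this patch):

```python
# Non-empty strings are always truthy, which is why `if scramble:` misfired:
assert bool("False") is True
assert ("False" == "True") is False  # the string comparison the patch uses instead

# A slightly more defensive variant (hypothetical helper, not in the patch):
def is_scrambled(flag) -> bool:
    return str(flag).strip().lower() == "true"

assert is_scrambled("True") and not is_scrambled("False")
```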
diff --git a/Comics/loader.py b/Comics/loader.py
index f9fefc9..a9f32ea 100644
--- a/Comics/loader.py
+++ b/Comics/loader.py
@@ -1,8 +1,8 @@
-import json,logging
+import json,logging,os
 from scrapy.loader import ItemLoader
-from Comics.settings import PROJECT_KEY
+from Comics.settings import PROJECT_KEY,IMAGES_STORE
 
-class ComicLoader(ItemLoader):
+class BaseLoader(ItemLoader):
     def parseExec(self,data,exec):
         if data !=None and exec != None:
             dots = str(exec).split(".")
@@ -53,7 +53,7 @@ class ComicLoader(ItemLoader):
 
     def get_exec(self, value, str_exec):
         return self.parseExec(value, str_exec)
-    
+
     def add_value(self, field_name, value, *processors, re=None, **kw):
         if self.auto_replace_value(field_name, value):
             return super().add_value(field_name, value, *processors, re=re, **kw)
@@ -68,15 +68,29 @@ class ComicLoader(ItemLoader):
 
     # Set comic properties
     def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
-        if value != None and sexec==None:
+        if value != None:
             self.add_value(field_name=name, value=value)
         if xpath != None:
             self.add_xpath(field_name=name, xpath=xpath, index=index)
         if sexec != None:
             self.add_exec(field_name=name, value=value, str_exec=sexec)
-    
+
+    def get_output_value(self, field_name, skip_field=["chapter"]):
+        value = super().get_output_value(field_name)
+        try:
+            if isinstance(value, list) and len(value) == 1:
+                if field_name not in skip_field: value = value[0]
+                else: value = "".join(value)
+        except:
+            print(f"get_output_value value={value} type={type(value)}")
+        return value
+
     # Project name
     def project_name(self, project_name):
         self.add_value(PROJECT_KEY, project_name)
+    # Project name
+    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
+
+class ComicLoader(BaseLoader):
     # Comic name
     def name(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('name', value, xpath, index, sexec)
     # Comic cover URL
@@ -101,7 +115,9 @@ class ComicLoader(ItemLoader):
     # Image names
     def images(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('images', value, xpath, index, sexec)
     # Image URLs
-    def image_urls(self, value=None, xpath=None, index=None, sexec=None): self.set_properties('image_urls', value, xpath, index, sexec)
+    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
+        self.set_properties('images', value, xpath, index, sexec)
+        self.set_properties('image_urls', value, xpath, index, sexec)
 
     def get_output_value(self, field_name, skip_field=["chapter"]):
         value = super().get_output_value(field_name)
@@ -121,8 +137,6 @@ class ComicLoader(ItemLoader):
     def get_schapter(self): return self.get_output_value("s_chapter")
     # Comic cover
     def get_icon(self): return self.get_output_value("icon")
-    # Project name
-    def get_project_name(self): return self.get_output_value(PROJECT_KEY)
     # Chapter link
     def get_chapter_href(self): return self.get_output_value("chapter_href")
     # All chapters
@@ -143,7 +157,9 @@ class ComicLoader(ItemLoader):
     def set_chapter(self, value): self.set_properties('chapter', value=value)
 
     def set_schapter(self, value): self.set_properties('s_chapter', value=value)
-    
+
+    def set_domain(self, value): self.set_properties('domain', value=value)
+    def get_domain(self): return self.get_output_value("domain")
+
     # Chapter page count
     def count(self):
         len_images = len(self.get_images())
@@ -162,6 +178,26 @@ class ComicLoader(ItemLoader):
     def load_item(self, chapter=None):
         self.count()
         self.index()
+        if not self.get_icon().startswith("http"): self.icon(self.get_domain() + self.get_icon())
         if chapter != None: self.set_chapter(chapter)
         self.save_sname_schapter()
-        return super().load_item()
\ No newline at end of file
+        return super().load_item()
+
+    def set_image_item(self, image_url, image_path, image_name, image_scramble="False", image_type="Image"):
+        return { "image_url" : image_url, "image_path" : image_path, "image_name" : image_name, "image_scramble" : image_scramble , "image_type" : image_type}
+
+    # Build image download entries
+    def parse_images(self):
+        images_item = []
+        icon_path = os.path.join(self.get_project_name(), "icons", self.get_name(), self.get_name()+".jpg")
+        images_item.append(self.set_image_item(image_url=self.get_icon(), image_path=icon_path, image_name=self.get_name()+".jpg", image_scramble="False", image_type="Icon"))
+        for url, name in zip(self.get_image_urls(), self.get_images()):
+            image_path = os.path.join(self.get_project_name(), "images", self.get_name(), self.get_chapter(), name)
+            images_item.append(self.set_image_item(image_url=url, image_path=image_path, image_name=name))
+        return images_item
+
+class BooksLoader(BaseLoader):
+
+    def get_names(self): return self.get_output_value("names")
+
+    def get_urls(self): return self.get_output_value("urls")
\ No newline at end of file
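`BaseLoader.get_output_value` exists because Scrapy's `ItemLoader` always hands field output back as a list; the override collapses one-element lists to scalars, with fields in `skip_field` joined into a single string instead. A standalone sketch of the same rule, assuming string-valued fields:

```python
def unwrap(value, field_name, skip_field=("chapter",)):
    # One-element lists collapse to their only element; skip_field entries
    # are joined into one string instead of being unwrapped.
    if isinstance(value, list) and len(value) == 1:
        return "".join(value) if field_name in skip_field else value[0]
    return value

print(unwrap(["Some Comic"], "name"))      # -> 'Some Comic'
print(unwrap(["u1", "u2"], "image_urls"))  # -> ['u1', 'u2'] (left as a list)
```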
self.get_name()+".jpg") + images_item.append(self.set_image_item(image_url= self.get_icon() , image_path = icon_path , image_name=self.get_name()+".jpg", image_scramble="False", image_type="Icon")) + for url, name in zip(self.get_image_urls(), self.get_images()): + image_path = os.path.join(self.get_project_name(), "images", self.get_name(), self.get_chapter(), name) + images_item.append(self.set_image_item(image_url= url , image_path= image_path, image_name=name)) + return images_item + +class BooksLoader(BaseLoader): + + def get_names(self): return self.get_output_value("names") + + def get_urls(self): return self.get_output_value("urls") \ No newline at end of file diff --git a/Comics/pipelines.py b/Comics/pipelines.py index 3b7725b..1502fd5 100644 --- a/Comics/pipelines.py +++ b/Comics/pipelines.py @@ -7,11 +7,11 @@ # useful for handling different item types with a single interface import os,scrapy,logging,shutil from Comics import settings -from Comics.items import ComicItem +from Comics.items import ComicItem,ImageItem from Comics.loader import ComicLoader from Comics.utils import CBZUtils,fileUtils as fu from Comics.utils import ComicPath -from Comics.utils import checkUtils,oldUtils +from Comics.utils import oldUtils from Comics.exporters import JsonExport,ItemExporter from scrapy.pipelines.images import ImagesPipeline from Comics._utils.ComicInfo import ComicInfoXml @@ -21,13 +21,11 @@ class ComicsPipeline(): # item就是yield后面的对象 def process_item(self, item: ComicItem, spider): if isinstance(item, ComicItem): - # 'output/rm_comic/json/壞X/第1話 壞X' # 已存在漫画CBZ文件 调用转换 result_item = None if fu.exists(ComicPath(item).PATH_CBZ()): result_item = ItemExporter().export_obj(item) # 不存在漫画CBZ文件 else: result_item = JsonExport(file=ComicPath(item).getDirJosnComicChapter()).export_json(ComicLoader(item).load_item(), if_return=True) - #oldUtils().clean_old_files(files=result_item["chapters"], folder=ComicPath(item).file_path(result_type=ComicPath.MAPPING_CBZ_DIR), move_folder=ComicPath(item).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR)) return result_item class BaseImagesPipeline(ImagesPipeline): @@ -59,47 +57,26 @@ class BaseImagesPipeline(ImagesPipeline): if not result[0]: fail_data.append(result[1]) if len(fail_data) == 0 and len(results) != 0: is_success = True return is_success - -# 封面下载操作类 -class IconDownloadPipeline(BaseImagesPipeline): - - # 数据处理 - def get_media_requests(self, item, info): - comic = ComicLoader(item=item) - # 获取封面链接和封面保存路径 - icon_url, icon_cache_path = [ comic.get_icon(), super().get_file_path(item, result_type="icon_cache") ] - # 封面已存在 - if fu.exists(icon_cache_path): return False - else: yield scrapy.Request(url=icon_url, meta={'path': icon_cache_path }) - - def item_completed(self, results, item, info): - if super().success_completed(item, results): - print(" icon download success") - # 更新封面到Icon文件夹内 - super().update_icon(item) - return item - class ImgDownloadPipeline(BaseImagesPipeline): - def get_media_requests(self, item, info): comic = ComicLoader(item=item) - self.image_urls, self.images = [ comic.get_image_urls(), comic.get_images() ] - # 添加封面下载信息至下载列表中 - # self.add_download_icon(item) - for image_url,image in zip(self.image_urls,self.images): - if_down, image_path = [ True, super().get_file_path(item, image)] + images_item = comic.parse_images() + for image_item in images_item: + if_down = True + image_url = image_item["image_url"] + image_path = image_item["image_path"] + if image_item["image_type"] == "Icon": + image_path = super().get_file_path(item, 
result_type="icon_cache") + if fu.exists(image_path): return False # 图像(含加密图像)已存在 if super().image_scramble_exits(item, image_path): - #if image_path == self.get_file_path(item, result_type="icon_cache"): - # logging.info(f"icon file exists: IMAGE_STORE {image_path}") - #else: if_down = False logging.info(f"file exists: IMAGE_STORE {image_path}") if if_down: logging.info(f"downloading {image_url} --> IMAGE_STORE {image_path}") - yield scrapy.Request(url=image_url, meta={'path': image_path}) + yield scrapy.Request(url=image_url, meta={'path': image_path}) # 打包cbz封面 def pack_icon(self, item): @@ -122,10 +99,12 @@ class ImgDownloadPipeline(BaseImagesPipeline): item (_type_): Comic item数据 info (_type_): 信息 """ + if super().success_completed(item, results): super().update_icon(item) + cbz_path = super().get_file_path(item, result_type="cbz") chapter_dir = ComicPath(item=item).file_path(result_type=ComicPath().MAPPING_IMAGES_DIR) - images_file = oldUtils().old_images(folder=chapter_dir) - if len(images_file) != len(ComicLoader(item=item).get_image_urls()): return + images_file = oldUtils().old_images(folder=chapter_dir) + if images_file == None or len(images_file) != len(ComicLoader(item=item).get_image_urls()): return if fu.exists(cbz_path): #self.update_icon(item) chapter = os.path.basename(cbz_path).split(".")[0] @@ -135,19 +114,9 @@ class ImgDownloadPipeline(BaseImagesPipeline): self.pack_icon(item) else: # ComicInfoXml 生成 - #comic_info = ComicInfoXmlItemExporter(dir=super().get_file_path(item=item, result_type="comic_info")).export_xml(item) comic_pages = ComicInfoXml().scrapy_xml_by_json(item, save_dir=super().get_file_path(item=item, result_type="images_dir")) - #if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"), - # dts_path= cbz_path, - # comic_info_images= comic_info['Pages'], remove=True): if CBZUtils.packComicChapterCBZ(src_dir= super().get_file_path(item, result_type="images_dir"), dts_path= cbz_path, comic_info_images= comic_pages, remove=True): super().update_icon(item) - self.pack_icon(item) - # CBZ校验失败 - #else: - # checkUtils().export_error(item) - #sleep_time = random.randint(3,15) - #print(f'等待{sleep_time}秒后进行下一章节') - #time.sleep(int(sleep_time)) \ No newline at end of file + self.pack_icon(item) \ No newline at end of file diff --git a/Comics/settings.py b/Comics/settings.py index 6d49eab..c789234 100644 --- a/Comics/settings.py +++ b/Comics/settings.py @@ -97,7 +97,7 @@ ITEM_PIPELINES = { # 'scrapy.pipelines.images.ImagesPipeline' : 1, 'Comics.pipelines.ComicsPipeline': 300, # 'Comics.pipelines.ImageParsePipeline': 400, - 'Comics.pipelines.IconDownloadPipeline': 400, +# 'Comics.pipelines.IconDownloadPipeline': 400, 'Comics.pipelines.ImgDownloadPipeline': 500, } @@ -130,4 +130,5 @@ LOG_STDOUT = True # 标准化输出 CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"CBZ") OLD_CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT,"Old_CBZ") #数据导出类 排序 -COMIC_INFO_XML_FILE = "ComicInfo.xml" \ No newline at end of file +COMIC_INFO_XML_FILE = "ComicInfo.xml" +COMIC_INFO_XSD_FILE = "Comics/assets/ComicInfo_2.1.xsd" \ No newline at end of file diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py index bae755e..bb4431a 100644 --- a/Comics/spiders/rm_comic.py +++ b/Comics/spiders/rm_comic.py @@ -1,6 +1,8 @@ -import scrapy,logging,os,skip -from Comics.items import ComicItem +import scrapy,logging,os,skip,json,re +from Comics.items import ComicItem,Image +from Comics.items import BooksItem from Comics.loader import ComicLoader +from Comics.loader import 
diff --git a/Comics/spiders/rm_comic.py b/Comics/spiders/rm_comic.py
index bae755e..bb4431a 100644
--- a/Comics/spiders/rm_comic.py
+++ b/Comics/spiders/rm_comic.py
@@ -1,6 +1,8 @@
-import scrapy,logging,os,skip
-from Comics.items import ComicItem
+import scrapy,logging,os,skip,json,re
+from Comics.items import ComicItem,Image
+from Comics.items import BooksItem
 from Comics.loader import ComicLoader
+from Comics.loader import BooksLoader
 from Comics.utils import ComicPath
 from Comics.utils import Conf
 from Comics.utils import oldUtils
@@ -9,7 +11,7 @@ class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
     allowed_domains = ['rouman5.com']
     main_url = 'https://'+allowed_domains[0]
-    start_urls = main_url+'/books'
+    start_urls = main_url+"/books"
 
     # Iterate over the site's page listings
     def start_requests(self):
@@ -18,18 +20,18 @@ class RmComicSpider(scrapy.Spider):
 
     # Fetch info for multiple comics
     def books_comic(self, response):
-        comics = ComicLoader(item=ComicItem(), response=response)
+        books_item = Conf().books(self.name, BooksLoader(BooksItem(), response))
         # Grab the JSON inside //script[@id] on the site, read props.pageProps.books, and loop over it
-        for book in comics.get_exec(comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0], str_exec="props.pageProps.books"):
+        for book,url in zip(books_item.get_names(), books_item.get_urls()):
             # Skip excluded comic names
-            if book['name'] not in skip.skip_comic:
-                yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
+            if book not in skip.skip_comic: yield scrapy.Request(url=self.main_url+"/"+url, callback=self.parse_comic)
 
     # Fetch the data for a single comic
    # After collecting the chapter links, move on to the next stage
     def parse_comic(self, response):
         # Initialize the comic data; read the project's config file by name and parse it automatically
         comic_item = Conf().comic(self.name, ComicLoader(ComicItem(), response))
+        comic_item.set_domain(self.main_url)
         path_comic = comic_item.load_item()
         cbz_dir = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_CBZ_DIR)
         move_folder = ComicPath(path_comic).file_path(result_type=ComicPath.MAPPING_OLD_CBZ_DIR)
@@ -44,7 +46,6 @@ class RmComicSpider(scrapy.Spider):
             # Final CBZ storage path
             cbz_path = ComicPath(item=item).PATH_CBZ()
             # Check whether the Traditional/Simplified Chinese CBZ path exists
-            # if not checkUtils().is_error(item) and os.path.exists(cbz_path):
             if cbz_path !=None and os.path.exists(cbz_path):
                 logging.info(f"Comic {cbz_path} already exists, skipping...")
                 yield item
@@ -56,22 +57,25 @@ class RmComicSpider(scrapy.Spider):
     def parse_chapter(self, response):
         # Comic item passed in via meta
         ci = ComicLoader(item=response.meta['item'], response=response)
+        result_json = None
+        for data_json in ci.get_xpath('//script/text()'):
+            if data_json.startswith('self.__next_f.push([1,"5'): result_json = data_json
         # Parse the fetched XPath data again and store it on ci (ComicItem)
-        item: ComicLoader = Conf().parse_chapter(item=ci, value=ci.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0])
-        comic, chapter_api_url = [ item.load_item() ,item.get_chapter_api() ]
-        if chapter_api_url is not None and len(chapter_api_url) != 0 :
-            try:
-                yield scrapy.Request(self.main_url + chapter_api_url, meta={'item': comic}, callback=self.parse_chapter_api)
-            except:
-                logging.warning(f"yield scrapy.Request({self.main_url} + {chapter_api_url}, meta={comic}, callback=self.parse_chapter_api)")
-        else:
-            yield comic
-
-    # Handle the encrypted-data API
-    def parse_chapter_api(self, response):
-        comic_item = ComicLoader(item=response.meta['item'], response=response)
-        return Conf().parse_chapter_api(item=comic_item, value=response.text).load_item()
-
+        # Regex-match the .jpg links
+        jpg_links = re.findall(r'(https?://\S+\.jpg)', result_json)
+        images_urls = []
+        # Walk the extracted .jpg links
+        for link in jpg_links:
+            sr = "0"  # default: unscrambled, in case the link carries no sr: marker
+            sr_value = re.search(r'sr:(\d+)', link)
+            # Extract the sr: value (group(1) is the first capture group, i.e. the digits)
+            if sr_value:
+                sr = sr_value.group(1)
+            else:
+                print("No match found")
+            images_urls.append(Image().setImage(url=link, scramble=sr.replace("0", "False").replace("1", "True")))
+        ci.image_urls(value=images_urls)
+        yield ci.load_item()
 
     def parse(self, response):
         raise NotImplementedError
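The rewritten `parse_chapter` leans on two regexes: one to pull `.jpg` URLs out of the `self.__next_f.push` payload, one to read the `sr:` scramble marker. A self-contained check with a made-up payload (where the `sr:` segment sits inside the URL is an assumption for illustration):

```python
import re

# Hypothetical payload excerpt; real pages embed this in self.__next_f.push([1,"5...
payload = (
    'https://img.example.com/ch1/sr:1/001.jpg\n'
    'https://img.example.com/ch1/sr:0/002.jpg\n'
)

# Note: \S+ is greedy, so the links must be whitespace-separated in the
# payload; within one unbroken run, a single match would swallow every ".jpg".
for link in re.findall(r'(https?://\S+\.jpg)', payload):
    sr_value = re.search(r'sr:(\d+)', link)
    sr = sr_value.group(1) if sr_value else "0"
    print(link, "scrambled" if sr == "1" else "plain")
```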
diff --git a/Comics/spiders/rm_comic.yml b/Comics/spiders/rm_comic.yml
index 6368b57..087ed24 100644
--- a/Comics/spiders/rm_comic.yml
+++ b/Comics/spiders/rm_comic.yml
@@ -1,22 +1,29 @@
+books:
+  names: '//div[@class="truncate"]/text()'
+  urls: '//div[@class="grid grid-cols-1 sm:grid-cols-4 md:grid-cols-6 gap-2 sm:gap-4"]//a/@href'
+
+
 data:
-  name: '//div[@class="col"]/h5/text()'
-  icon: '//img[@class="img-thumbnail"]/@src'
+  name: '//div[@class="basis-3/5 text-sm sm:text-base"]//div[@class="text-xl text-gray-900"]/text()'
+  icon: '//div[@class="flex flex-row gap-3 sm:gap-4"]//div[@class="basis-2/5"]/img[@class="rounded"]/@src'
   author:
-    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()'
-    index: 1
-  tags: '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()'
+    xpath: '//div[@class="flex flex-row gap-3 sm:gap-4"]//span[@class="text-gray-800"]/text()'
+    index: 0
+  tags:
+    xpath: '//div[@class="flex flex-row gap-3 sm:gap-4"]//span[@class="text-gray-800"]/text()'
+    index: 3
   dep:
-    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()'
+    xpath: '//div[@class="my-2 text-gray-800 text-sm sm:text-base"]/p/text()'
     index: 1
   date:
-    xpath: '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()'
+    xpath: '//div[@class="text-gray-500 text-sm mt-2"]/div/text()'
     index: 1
   genre:
     value: "韩漫"
   age_rating:
     value: "R18+"
-  chapter_href: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href'
-  chapters: '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()'
+  chapter_href: '//div[@class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-2 px-2 py-4"]//a/@href'
+  chapters: '//div[@class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-2 px-2 py-4"]//div[@class="text truncate bg-slate-300 p-2 hover:bg-rose-100"]/text()'
 
 parse_chapter:
   name:
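Each top-level key in this file is resolved by `Conf().base_data`, which forwards every entry to the loader's `set_properties` as either a bare XPath string, an `xpath`/`index` pair, or a fixed `value`. A standalone sketch of that dispatch over the shapes used above:

```python
import yaml

config = yaml.safe_load("""
data:
  name: '//h1/text()'
  author:
    xpath: '//span[@class="author"]/text()'
    index: 0
  genre:
    value: "韩漫"
""")

for key, spec in config["data"].items():
    if isinstance(spec, str):   # bare string -> plain XPath
        print(key, "xpath:", spec)
    else:                       # mapping -> xpath/index pair or fixed value
        print(key, spec.get("xpath"), spec.get("index"), spec.get("value"))
```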
diff --git a/Comics/spiders/yh_comic.py b/Comics/spiders/yh_comic.py
deleted file mode 100644
index 20a182a..0000000
--- a/Comics/spiders/yh_comic.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import scrapy,logging,time,os
-from Comics.items import ComicItem
-from Comics.loader import ComicLoader
-from Comics.utils import ComicPath
-from Comics.settings import PROJECT_KEY
-import skip
-
-class RmComicSpider(scrapy.Spider):
-    name = 'yh_comic'
-    allowed_domains = ['www.shuanglilock.com.cn']
-    main_url = 'https://'+allowed_domains[0]
-    start_urls = main_url+'/info'
-
-    def start_requests(self):
-#        for x in range(0,60):
-        yield scrapy.Request("https://www.shuanglilock.com.cn/info/27145/", callback=self.parse_comic)
-
-    # Fetch info for multiple comics
-#    def books_comic(self, response):
-#        comics = ComicLoader(item=ComicItem(), response=response)
-#        data = comics.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
-#        for book in comics.get_exec(data, str_exec="props.pageProps.books"):
-#            comics.add_value('link', self.start_urls+"/"+book['id'])
-#            if book['name'] not in skip.skip_comic:
-#                yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
-
-    # Fetch the data for a single comic
-    # After collecting the chapter links, move on to the next stage
-    def parse_comic(self, response):
-        comic_item = ComicLoader(item=ComicItem(), response=response)
-        comic_item.project_name(self.name)
-        comic_item.name(xpath='//div[@class="comics-detail__info"]/h1[@class="comics-detail__title"]/text()')
-        comic_item.icon(xpath='//div[@class="pure-u-1-1 pure-u-sm-1-3 pure-u-md-1-6"]/img/@src')
-        comic_item.author(xpath='//div[@class="comics-detail__info"]/h2[@class="comics-detail__author"]/text()')
-        comic_item.tags(xpath='//div[@class="tag-list"]/a[@class="tag"]/text()')
-        comic_item.dep(xpath='//p[contains(@class,"comics-detail__desc")]/text()')
-        #comic_item.date(xpath='//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', index=1)
-        comic_item.genre(value="樱花漫画")
-        #comic_item.age_rating(value="R18+")
-        chapter_href = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
-                                            '//a[@class="comics-chapters__item"]/@href')
-        chapters = comic_item.get_xpath('//div[contains(@id,"chapter-items")]'
-                                        '//a[@class="comics-chapters__item"]//span/text()')
-        for chapter, link in zip(chapters, chapter_href):
-            comic_item.chapters(value=chapters)
-            comic_item.chapter(value=chapter)
-            item = comic_item.load_item()
-            cbz_path = ComicPath(item).get_file_path(result_type="cbz", convert=True)
-            if os.path.exists(cbz_path):
-                logging.info(f"Comic {cbz_path} already exists, skipping...")
-                yield item
-            else:
-                yield scrapy.Request(self.main_url+link, meta={'item': item}, callback=self.parse_chapter)
-
-
-    # Read all images under a chapter
-    def parse_chapter(self, response):
-        comic_item = ComicLoader(item=response.meta['item'], response=response)
-        comic_item.image_urls(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
-        comic_item.images(xpath='//div[@class="comiclist"]/div[@class="comicpage"]/div/img/@data-original')
-        comic = comic_item.load_item()
-        yield comic
-
-    def parse(self, response):
-        raise NotImplementedError
-
-    def error_parse(self, response):
-        raise NotImplementedError
\ No newline at end of file
diff --git a/Comics/utils.py b/Comics/utils.py
index 5d03a43..21ba595 100644
--- a/Comics/utils.py
+++ b/Comics/utils.py
@@ -7,9 +7,11 @@ from opencc import OpenCC
 from PIL import Image
 from pathlib import Path
 from zipfile import ZipFile
-from Comics.settings import COMIC_INFO_XML_FILE,OUTPUT_DIR,PROJECT_KEY
+from Comics.settings import COMIC_INFO_XML_FILE,COMIC_INFO_XSD_FILE,OUTPUT_DIR,PROJECT_KEY
 import yaml
+from Comics.loader import BaseLoader
 from Comics.loader import ComicLoader
+from Comics.loader import BooksLoader
 from tinydb import TinyDB, Query
 
 # Configuration class
@@ -47,7 +49,7 @@ class Conf():
         return None
 
     # Feed the parsed config data into the loader
-    def comic(self, project, item: ComicLoader, child_data='data', val=None):
+    def base_data(self, project, item: BaseLoader, child_data='data', val=None):
         item.project_name(project)
         data = self.get_config_value(project, child_data)
         for key, xpath_data in data.items():
@@ -59,6 +61,12 @@ class Conf():
             item.set_properties(name=key, value=value, xpath=xpath, index=index, sexec=sexec)
         return item
 
+    def books(self, project, item: BooksLoader, child_data='books', val=None):
+        return self.base_data(project, item, child_data, val)
+
+    def comic(self, project, item: ComicLoader, child_data='data', val=None):
+        return self.base_data(project, item, child_data, val)
+
     def parse_chapter(self,item: ComicLoader, value):
         return self.comic(item.get_project_name(), item, "parse_chapter", value)
 
@@ -245,7 +253,7 @@ class CommonUtils:
 
     @classmethod
     def validate_comicinfo_xml(cls, xml_file):
-        cls._validate_xml(xml_file, "ComicInfo.xsd")
+        cls._validate_xml(xml_file, COMIC_INFO_XSD_FILE)
 
 
 # Image processing class
@@ -772,7 +780,6 @@ class ntfy:
             print("Notification sent successfully!")
         else:
             print(f"Failed to send notification. Status code: {response.status_code}")
-            print(response.json())
 
 class logger:
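With the schema now shipped at `Comics/assets/ComicInfo_2.1.xsd` and wired through `COMIC_INFO_XSD_FILE`, `_validate_xml` (not shown in this diff) presumably boils down to standard lxml schema validation, roughly:

```python
from lxml import etree

def validate_xml(xml_file: str, xsd_file: str) -> bool:
    # Compile the schema, then check the document against it.
    schema = etree.XMLSchema(etree.parse(xsd_file))
    document = etree.parse(xml_file)
    if not schema.validate(document):
        print(schema.error_log)  # one entry per violation, with line numbers
        return False
    return True

# e.g. validate_xml("ComicInfo.xml", "Comics/assets/ComicInfo_2.1.xsd")
```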