commit 499cb29fa3
caiwx86 2022-12-04 22:29:04 +08:00
11 changed files with 628 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
COMICOUT/
**/__pycache__/**

72
main.py Normal file

@@ -0,0 +1,72 @@
from utils.HtmlUtils import htmlUtils
from utils.comic.ComicStr import comicStr
from utils.FileUtils import fileUtils
import json, os
from utils.NetUtils import netUtils
from utils.ImageUtils import imageUtils
from utils.ComicUtils import comicUtils

comic_chapter_path = "COMICOUT"

def oneComic(c_url):
    global comic_chapter_path
    # chapter metadata is embedded in the page's Next.js __NEXT_DATA__ script tag
    data = htmlUtils.xpathData(comicStr.result,
                               '//script[@id="__NEXT_DATA__"]/text()', url=c_url)
    data = json.loads(data[0])
    data = data.get("props")
    data = data.get("pageProps")
    print(data)
    fileUtils.saveConfComicChapterInfo("1話 親子餐廳的媽媽們", data, "親子餐廳的媽媽們")
    x = fileUtils.getInfoConfComicChapter("1話 親子餐廳的媽媽們", "親子餐廳的媽媽們")
    bookName = x.get("bookName")
    alias = x.get("alias")
    chapterName = x.get("chapterName")
    description = x.get("description")
    images = x.get("images")
    chapterAPIPath = x.get("chapterAPIPath")
    print(chapterAPIPath)
    if chapterAPIPath is not None:
        # some chapters publish their image list through a separate JSON API
        base_url = comicUtils.getBaseUrl(c_url)
        chapterAPIUrl = base_url + chapterAPIPath
        data = htmlUtils.getJSON(chapterAPIUrl)
        data = data.get("chapter")
        chapterName = data.get("name")
        images = data.get("images")
    if images is None:
        print("no images found for this chapter")
    totalChapter = x.get("totalChapter")
    tags = x.get("tags")
    print(tags)
    count_image = 1
    list_img = []
    list_scramble = []
    for image in images:
        image_src = image.get("src")
        scramble = image.get("scramble")
        print("count=", count_image)
        list_img.append(image_src)
        list_scramble.append(scramble)
        print(image_src)
        print(scramble)
        count_image += 1
    print(count_image)
    print(list_img)
    print(totalChapter)
    netUtils.downloadComicChapterImages(bookName, chapterName, list_img, scrambles=list_scramble)
    comic_chapter_path = os.path.join("COMICOUT", bookName, chapterName)

if __name__ == '__main__':
    oneComic("https://rm01.xyz/books/f08668a4-0cbc-488e-95a7-3c71de0c7a31/23")
    # path = "COMICOUT\好友的私生活\第1話 好友的私生活"
    path = comic_chapter_path
    dirs = os.listdir(path)
    for dir in dirs:
        # scrambled images were saved as "scramble=<blocks>_<name>" by netUtils
        isScramble = str(dir).startswith("scramble=")
        if isScramble:
            c_path = os.path.join(path, dir)
            imageUtils.getScrambleImage(c_path)
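
For reference, the pageProps payload that oneComic pulls out of __NEXT_DATA__ is assumed to look roughly like this; the key names come from the .get() lookups above, but the values are illustrative only:

    # hypothetical shape, inferred from oneComic's lookups
    pageProps = {
        "bookName": "親子餐廳的媽媽們",
        "alias": "...",
        "chapterName": "1話",
        "description": "...",
        "totalChapter": 10,
        "tags": ["..."],
        # optional; when present, the image list comes from a separate JSON API
        "chapterAPIPath": "/api/books/<id>/chapters/23",
        "images": [{"src": "https://rm01.xyz/imgs/<name>.jpg", "scramble": True}],
    }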

88
utils/ComicUtils.py Normal file

@@ -0,0 +1,88 @@
import os
from utils.comic.ComicStr import comicStr
from utils.FileUtils import fileUtils
from utils.NetUtils import netUtils

class comicUtils:
    comic_title = None
    temp_url = ""
    comics = {}

    # store one value for all comics (bool_list) or one value per comic under `name`
    @classmethod
    def setKeyValue(cls, name, c_value, bool_list=None):
        count = 0
        if bool_list:
            count += 1
            if cls.comics.get(count) is None:
                cls.comics[count] = {name: c_value}
            else:
                cls.comics[count].update({name: c_value})
        else:
            for value in c_value:
                count += 1
                if cls.comics.get(count) is None:
                    cls.comics[count] = {name: value}
                else:
                    cls.comics[count].update({name: value})

    @classmethod
    def listImgDownload(cls, save_dir_name, links):
        print("save_dir_name:", save_dir_name)
        print("list_link:", links)
        count = 1
        for link in links:
            print("link:", count, ":", link)
            file_count = "{:0>3d}".format(count)
            file_path = os.path.join("Comic", save_dir_name, file_count + ".jpg")
            netUtils.download(link, file_path)
            count += 1

    @classmethod
    def downloadComic(cls, comic_name):
        comic = fileUtils.read_comic(comic_name)
        list_img = comic.get(comicStr.list_img)
        icon = comic.get(comicStr.icon)
        for chapter_img in list_img:
            for key in chapter_img.keys():
                if icon is not None:
                    icon = netUtils.downloadComicIcon(comic_name, key, icon)
                else:
                    print("icon skipped")
                netUtils.downloadComicChapterImages(comic_name, key, chapter_img[key])

    '''
    Read the saved config for comic_name, write a "ComicInfo.xml"
    per chapter, and pack the images into CBZ archives.
    '''
    @classmethod
    def packComicCBZ(cls, comic_name):
        data = fileUtils.read_comic(comic_name)
        title = data.get(comicStr.title)
        author = data.get(comicStr.author)
        author = str(author).replace("/", ",").replace(" ", "")
        dep = data.get(comicStr.dep)
        chapters = data.get(comicStr.chapters)
        tags = "韩漫"          # "Korean manhwa"
        c_publisher = "韩漫"
        print("title:", title, "author=", author, "dep=", dep, "chapters:", chapters)
        # note: CBZUtils is referenced here but is not part of this commit
        for chapter in chapters:
            CBZUtils.writeComicInfoXML(title, chapter, dep, author,
                                       tags, c_publisher)
            CBZUtils.packComicCBZ(title, chapter)

    '''
    Return the site root (scheme + host) of a URL by cutting at the third "/".
    '''
    @classmethod
    def getBaseUrl(cls, url):
        num = 3
        index = 0
        for x in range(0, num):
            index = str(url).find("/", index) + 1
        return url[0:index - 1]
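
Since getBaseUrl only scans for the third "/" and truncates, it reduces a URL to scheme plus host. A quick sketch of the expected behavior, plus the stdlib equivalent via urllib.parse (not used by this commit):

    comicUtils.getBaseUrl("https://rm01.xyz/books/f08668a4-0cbc-488e-95a7-3c71de0c7a31/23")
    # -> "https://rm01.xyz"

    from urllib.parse import urlsplit
    parts = urlsplit("https://rm01.xyz/books/abc/23")
    base = f"{parts.scheme}://{parts.netloc}"   # "https://rm01.xyz"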

77
utils/FileUtils.py Normal file

@@ -0,0 +1,77 @@
import json
import os

class fileUtils:
    base_path = "COMIC_OUT"
    conf_path = os.path.join(base_path, ".conf")
    comic_path = os.path.join(base_path, ".conf", "comic")
    comic_name = ""

    # serialize data to JSON and write it to path, creating parent dirs as needed
    @classmethod
    def file_save(cls, path, data, mode=None):
        dir_name = os.path.dirname(path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        data = json.dumps(data)
        if mode is None:
            mode = "w+"
        try:
            with open(path, mode, encoding="utf-8") as f:
                f.write(data)
            print("data=", data)
            result = path + " written successfully"
        except OSError:
            result = path + " write failed"
        return result

    @classmethod
    def save_conf(cls, file, data, mode=None):
        file = os.path.join(cls.conf_path, file)
        cls.file_save(file, data, mode)

    @classmethod
    def save_comic(cls, name, data, mode=None):
        print("comic_save=", name)
        cls.file_save(cls.get_url_save_comic(name), data, mode)

    @classmethod
    def saveConfComicChapterInfo(cls, chapter, data, comic_name=None, mode=None):
        cls.file_save(cls.getPathConfComicChapterInfo(chapter, comic_name), data, mode)

    @classmethod
    def getPathConfComicChapterInfo(cls, chapter, comic_name=None):
        if comic_name is None:
            comic_name = cls.comic_name
        return os.path.join(cls.comic_path, comic_name, "info_" + chapter)

    @classmethod
    def getInfoConfComicChapter(cls, chapter, comic_name=None):
        path = cls.getPathConfComicChapterInfo(chapter, comic_name)
        with open(path, "r", encoding="utf-8") as fs:
            data = json.loads(fs.read())
        return data

    @classmethod
    def read_comic(cls, name):
        file = os.path.join(cls.comic_path, name)
        data = None
        try:
            with open(file, "r", encoding="utf-8") as fs:
                data = json.loads(fs.read())
        except (OSError, json.JSONDecodeError):
            print("failed to read file=", file)
        return data

    '''
    Return the save path for a comic config file.
    '''
    @classmethod
    def get_url_save_comic(cls, name):
        return os.path.join(cls.comic_path, name)
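
The chapter-config round trip that main.py relies on boils down to the sketch below; the comic and chapter names are placeholders:

    # writes JSON to COMIC_OUT/.conf/comic/demo-comic/info_1話
    fileUtils.saveConfComicChapterInfo("1話", {"bookName": "demo"}, "demo-comic")
    # reads the same file back into a dict
    info = fileUtils.getInfoConfComicChapter("1話", "demo-comic")
    print(info["bookName"])   # "demo"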

54
utils/HtmlUtils.py Normal file

@@ -0,0 +1,54 @@
from fake_useragent import UserAgent
import requests
from lxml import html
from utils.comic.ComicStr import comicStr

class htmlUtils:
    headers = {'User-Agent': UserAgent().random}
    url_data = {}
    temp_url = None

    # fetch a URL and return its parsed lxml tree, caching one tree per URL
    @classmethod
    def getHTML(cls, curl):
        url_text = cls.url_data.get(curl)
        # cache miss: fetch and parse the page
        if url_text is None:
            print("requesting:", curl)
            res = requests.get(curl, headers=cls.headers)
            url_text = html.fromstring(res.text)
            cls.url_data.update({curl: url_text})
        return url_text

    @classmethod
    def getJSON(cls, curl):
        res = requests.get(curl, headers=cls.headers)
        return res.json()

    @classmethod
    def xpathData(cls, c_title, c_xpath, url=None, num=None, type=None, not_eq=None):
        # reuse the last requested URL when none is given
        if url is None:
            url = cls.temp_url
        else:
            cls.temp_url = url
        result = []
        et = cls.getHTML(url)
        # collect matches, skipping any value equal to not_eq
        for x in et.xpath(c_xpath):
            if x != not_eq:
                result.append(x)
        data = {c_title: result}
        if num is not None:
            data = {c_title: result[num]}
        if type is not None:
            data = {type: result}
        if c_title == comicStr.result:
            data = result
        return data
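
xpathData's return shape depends on which keyword arguments are set; a few hedged examples against an assumed page:

    # default: the full match list wrapped in a dict keyed by c_title
    htmlUtils.xpathData("title", "//h1/text()", url="https://example.com")   # {"title": [...]}
    # num picks one element out of the match list
    htmlUtils.xpathData("title", "//h1/text()", num=0)                       # {"title": "..."}
    # type swaps the dict key; not_eq drops a placeholder value from the matches
    htmlUtils.xpathData("imgs", "//img/@src", type="ch1", not_eq="/loading.jpg")
    # and when c_title == comicStr.result, the bare list is returned (as main.py uses)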

115
utils/ImageUtils.py Normal file

@@ -0,0 +1,115 @@
import base64, hashlib, os, shutil
from PIL import Image

class imageUtils:
    # derive the strip count from the image name: base64-decode it,
    # MD5 the bytes, and map the last digest byte into 5..14
    @classmethod
    def encodeImage(cls, enStr):
        print("en", enStr)
        enc = base64.b64decode(enStr)
        print("decoded:", enc)
        m = hashlib.md5()
        m.update(enc)
        md5 = m.digest()
        d = md5[-1]
        print(md5)
        blocks = d % 10 + 5
        print("blocks=", blocks)
        return blocks

    @classmethod
    def scrambleImage(cls, file_path):
        # file name convention: <dir>/scramble=<blocks>_<name>.<ext>, e.g. 10_29.jpg
        file_str = str(file_path).split("=")
        baseDir = file_str[0].replace("scramble", "")
        baseName = file_str[-1]
        baseFN = baseName.split("_")
        save_name = baseFN[1]
        save_name_delesu = baseName.split(".")[0]
        blocks = int(baseFN[0])
        save_file_path = os.path.join(baseDir, save_name)
        print("save to", save_file_path)
        if os.path.exists(save_file_path):
            print("already descrambled, skipping:", save_file_path)
            return None
        img = Image.open(file_path)
        width = img.width
        height = img.height
        print("blocks=", blocks)
        blockHeight = int(height / blocks)
        print("blockHeight=", blockHeight)
        # split into horizontal strips, then re-compose them in reverse order
        split_path = os.path.join(baseDir, save_name_delesu + "split")
        cls.splitimage(file_path, blocks, 1, split_path)
        cls.image_compose(split_path + "/", blocks, 1, save_file_path, blockHeight, width)
        # return the scrambled source so the caller can delete it
        return file_path

    @classmethod
    def splitimage(cls, src, rownum, colnum, dstpath):
        img = Image.open(src)
        w, h = img.size
        if rownum <= h and colnum <= w:
            s = os.path.split(src)
            if dstpath == '':
                dstpath = s[0]
            if not os.path.exists(dstpath):
                os.makedirs(dstpath)
            fn = s[1].split('.')
            basename = fn[0]
            ext = fn[-1]
            num = 0
            rowheight = h // rownum
            colwidth = w // colnum
            for r in range(rownum):
                for c in range(colnum):
                    box = (c * colwidth, r * rowheight, (c + 1) * colwidth, (r + 1) * rowheight)
                    file_path = os.path.join(dstpath, basename + '_' + str(num) + '.' + ext)
                    print("file_path=", file_path)
                    img.crop(box).save(file_path)
                    num = num + 1
        else:
            print('Invalid split: more rows/columns than pixels!')
        img.close()

    @classmethod
    def image_compose(cls, src, row, column, save_path, image_height, image_width):
        images_format = ['.png', '.jpg']
        image_names = [name for name in os.listdir(src)
                       if os.path.splitext(name)[1] in images_format]
        # paste the strips back in reverse numeric order to undo the scramble
        image_names.sort(key=lambda n: int(os.path.splitext(n)[0].split('_')[-1]), reverse=True)
        # sanity-check that enough strips exist for the requested grid
        if len(image_names) < row * column:
            raise ValueError("number of split images does not match rows x columns!")
        to_image = Image.new('RGB', (column * image_width, row * image_height))
        # paste each strip onto the output canvas in order
        for y in range(1, row + 1):
            for x in range(1, column + 1):
                image_path = src + image_names[column * (y - 1) + x - 1]
                print("split_image=", image_path)
                from_image = Image.open(image_path)
                to_image.paste(from_image, ((x - 1) * image_width, (y - 1) * image_height))
        to_image.save(save_path)
        print("image composed:", save_path)
        # remove the temporary split strips
        shutil.rmtree(src)

    @classmethod
    def getScrambleImage(cls, path):
        scrambleFileCache = cls.scrambleImage(path)
        if scrambleFileCache is not None:
            os.remove(str(scrambleFileCache))
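
The descramble pass depends on the filename convention set in netUtils: a scrambled download is saved as "scramble=<blocks>_<NNN>.<ext>". scrambleImage splits that file into <blocks> horizontal strips, re-composes them in reverse order, and writes the clean <NNN>.<ext> beside it. A sketch of one pass over a chapter directory, mirroring the loop in main.py (paths are examples):

    import os
    chapter_dir = os.path.join("COMICOUT", "demo-comic", "ch1")
    for name in os.listdir(chapter_dir):
        if name.startswith("scramble="):
            # e.g. "scramble=10_001.jpg": 10 strips, target file 001.jpg
            imageUtils.getScrambleImage(os.path.join(chapter_dir, name))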

86
utils/NetUtils.py Normal file

@@ -0,0 +1,86 @@
import os.path, shutil
import requests
from concurrent.futures import ThreadPoolExecutor
import time
import imghdr
from utils.comic.PathStr import pathStr
from utils.ImageUtils import imageUtils

class netUtils:
    # shared pool so downloads are limited to three concurrent workers
    executor = ThreadPoolExecutor(max_workers=3)

    # download url to path, retrying up to 5 times on non-200 responses
    @classmethod
    def download(cls, url, path, fileType=None):
        if os.path.exists(path):
            if imghdr.what(path):
                msg = "file already exists, skipping: " + path
                print(msg)
                return msg
            else:
                print("file corrupted, re-downloading: " + path)
        tmp_file = path + ".downloads"
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
            print("removed stale temp file:", tmp_file)
        repair_count = 1
        res = requests.get(url, stream=True)
        while res.status_code != 200 and repair_count <= 5:
            res = requests.get(url, stream=True)
            print(f'retry {repair_count}: {url}')
            repair_count += 1
        # verify the response really is an image when requested
        if fileType == "image":
            if 'image' not in res.headers.get("content-type", ""):
                print(f"url= {url} Error: URL does not appear to be an image")
        basedir = os.path.dirname(path)
        if not os.path.exists(basedir):
            os.makedirs(basedir)
        # write to a temp file first, then move it into place when complete
        with open(tmp_file, 'wb') as f:
            for ch in res:
                f.write(ch)
        shutil.move(tmp_file, path)
        print(f"url={url} saved to: {path}")
        return path

    @classmethod
    def threadDownload(cls, url, path, fileType=None):
        cls.executor.submit(cls.download, url, path, fileType)

    @classmethod
    def downloadComicChapterImages(cls, comic_name, chapter_name, imgs, scrambles=None):
        file_path = os.path.join(pathStr.base_comic_out, comic_name, chapter_name)
        print("files=", file_path)
        count_img = 1
        for img in imgs:
            count = "{:0>3d}".format(count_img)
            file_name = count + os.path.splitext(img)[-1]
            save_file_path = os.path.join(file_path, file_name)
            if scrambles and scrambles[count_img - 1]:
                # the scramble block count is derived from the image's base name;
                # replacing the extension with "==" restores base64 padding
                su = "." + str(img).split(".")[-1]
                de_str = str(img).split("/")[-1].replace(su, "==")
                blockInt = imageUtils.encodeImage(de_str)
                save_file_path = os.path.join(file_path, "scramble=" + str(blockInt) + "_" + file_name)
            cls.threadDownload(img, save_file_path, fileType="image")
            time.sleep(0.1)
            count_img += 1
        return os.path.dirname(save_file_path)

    @classmethod
    def downloadComicIcon(cls, comic_name, chapter, img):
        file_su = os.path.splitext(img)[-1]
        icon_name = "cover" + file_su
        # pathStr paths replace the previously undefined class attributes
        save_file_path = os.path.join(pathStr.base_comic_img, comic_name, icon_name)
        if os.path.exists(save_file_path):
            print("cover already exists, skipping download")
            return None
        else:
            cls.download(img, save_file_path, fileType="image")
            target_dir = os.path.join(pathStr.base_CBZ, comic_name)
            target_file = os.path.join(target_dir, chapter + file_su)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            shutil.copy(save_file_path, target_file)
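
The scramble block count is derived from the image filename itself: swapping the extension for "==" restores base64 padding, and encodeImage maps the last MD5 byte of the decoded bytes into 5..14. A standalone sketch of that derivation (the URL is made up; real names must decode as valid base64):

    import base64, hashlib
    img = "https://rm01.xyz/imgs/aGVsbG.jpg"             # hypothetical image URL
    de_str = img.split("/")[-1].replace(".jpg", "==")    # "aGVsbG=="
    digest = hashlib.md5(base64.b64decode(de_str)).digest()
    blocks = digest[-1] % 10 + 5                         # always in 5..14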

37
utils/comic/ComicInfo.py Normal file

@@ -0,0 +1,37 @@
from xml.dom.minidom import Document

document = Document()

class comicInfoXmlNode():
    # ComicInfo.xml tag names
    chapter = "Title"
    comic_name = "Series"
    dep = "Summary"
    author = "Writer"
    tags = "Genre"
    cbs = "Publisher"
    lang = "LanguageISO"

    # create a <node>value</node> element and return it
    @classmethod
    def setNodeAndValue(cls, node, value):
        node = document.createElement(node)
        node_text = document.createTextNode(value)
        node.appendChild(node_text)
        return node

    @classmethod
    def setChapter(cls, value):
        return cls.setNodeAndValue(cls.chapter, value)

    @classmethod
    def setComicName(cls, value):
        return cls.setNodeAndValue(cls.comic_name, value)

    @classmethod
    def getComicInfoXML(cls, value):
        # incomplete: only the Title node is built so far
        return cls.setChapter(value)

class comicInfo():
    # write ComicInfo.xml (stub, not yet implemented)
    @classmethod
    def writeComicInfoXML(cls, c_title, chapter, dep, author, tags="韩漫", c_publisher="韩漫", language="zh"):
        # expected output path: CBZ/<comic_name>/<chapter>
        print()
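
writeComicInfoXML is still a stub; a minimal sketch of what a complete generator could look like with xml.dom.minidom, assuming the CBZ/<series>/<chapter> layout referenced above (the function name and out_dir are hypothetical):

    import os
    from xml.dom.minidom import Document

    def write_comic_info_xml(title, chapter, summary, writer, genre="韩漫",
                             publisher="韩漫", lang="zh", out_dir="COMICOUT/CBZ"):
        doc = Document()
        root = doc.createElement("ComicInfo")
        doc.appendChild(root)
        # map values onto the tag names defined in comicInfoXmlNode
        fields = {"Title": chapter, "Series": title, "Summary": summary,
                  "Writer": writer, "Genre": genre, "Publisher": publisher,
                  "LanguageISO": lang}
        for tag, value in fields.items():
            node = doc.createElement(tag)
            node.appendChild(doc.createTextNode(value))
            root.appendChild(node)
        target = os.path.join(out_dir, title, chapter)
        os.makedirs(target, exist_ok=True)
        with open(os.path.join(target, "ComicInfo.xml"), "w", encoding="utf-8") as f:
            f.write(doc.toprettyxml(indent="  "))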

28
utils/comic/ComicStr.py Normal file

@@ -0,0 +1,28 @@
class comicStr:
url = "url"
xpath = "xpath"
title = "title"
icon = "icon"
author = "author"
dep = "dep"
chapters = "chapters"
homepage = "homepage"
chapter_href = "chapter_href"
list_img = "list_img"
img = "img"
last_update = "last_update"
alias = "alias"
tags = "tags"
action = "action"
chapters_href = "chapters_href"
base_url = ""
data = "data"
result = "result"
@classmethod
def setBaseUrl(cls,url):
cls.base_url = url
@classmethod
def getBaseUrl(cls):
return cls.base_url

6
utils/comic/PathStr.py Normal file

@@ -0,0 +1,6 @@
import os
class pathStr:
base_comic_out = "COMICOUT"
base_CBZ = os.path.join(base_comic_out,"CBZ")
base_comic_img = os.path.join(base_comic_out,"outputComic")

63
utils/entity/RouMan.py Normal file

@@ -0,0 +1,63 @@
from utils.comic.ComicStr import comicStr
from utils.ComicUtils import comicUtils
from utils.HtmlUtils import htmlUtils

class comicEntityRM:
    @classmethod
    def oneComic(cls, c_url):
        # comic title
        title = htmlUtils.xpathData(comicStr.title,
            '//div[@class="col"]/h5/text()', url=c_url, num=0)
        # alias
        alias = htmlUtils.xpathData(comicStr.alias,
            '//span[contains(@class,"bookid_alias")]/text()', num=1)
        icon = htmlUtils.xpathData(comicStr.icon,
            '//img[@class="img-thumbnail"]/@src')
        author = htmlUtils.xpathData(comicStr.author,
            '//div[contains(@class,"bookid_bookInfo")]/p[1]/text()', num=1)
        tags = htmlUtils.xpathData(comicStr.tags,
            '//div[contains(@class,"bookid_bookInfo")]/p[3]/b/text()')
        action = htmlUtils.xpathData(comicStr.action,
            '//div[contains(@class,"bookid_bookInfo")]/p[2]/text()', num=1)
        dep = htmlUtils.xpathData(comicStr.dep,
            '//div[contains(@class,"bookid_bookInfo")]/p[4]/text()', num=1)
        update_date = htmlUtils.xpathData(comicStr.last_update,
            '//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()', num=1)
        chapters = htmlUtils.xpathData(comicStr.chapters,
            '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()')
        chapter_href = htmlUtils.xpathData(comicStr.chapter_href,
            '//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href')
        # note: setComic/getComic are expected on comicUtils but are not part of this commit
        comicUtils.setComic(title, alias, icon, author, tags, action, dep, update_date, chapters, chapter_href)
        # e.g. {'title': ['社區重建協會']}
        # homepage
        # homepage = {comicStr.homepage: [c_url]}
        # images
        # comicUtils.setComic(titles, homepage, icons, authors, deps, chapters, chapter_hrefs, last_update)
        comicData = comicUtils.getComic()
        print(comicData)
        wait = input("pause to inspect the data, continue? y/n")
        if wait != "y":
            exit()
        return comicData

    '''
    Fetch all image URLs under a chapter.
    '''
    @classmethod
    def comicChapter(cls, c_url, chapter):
        xpath_str = '//img[contains(@class,"id_comicImage")]/@src'
        not_eq = "/loading.jpg"
        # all image links in the chapter, skipping the lazy-load placeholder
        list_img = htmlUtils.xpathData(comicStr.list_img,
            xpath_str, url=c_url, type=chapter, not_eq=not_eq)
        print(list_img)
        wait = input("pause to inspect the data, continue? y/n")
        if wait != "y":
            exit()
        return list_img
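
A hedged usage sketch for this entity class (the URLs are placeholders, and both methods pause at an interactive y/n prompt):

    from utils.entity.RouMan import comicEntityRM
    # scrape book metadata and the chapter list
    comicEntityRM.oneComic("https://rm01.xyz/books/<book-id>")
    # fetch one chapter's image list, keyed by the chapter name
    imgs = comicEntityRM.comicChapter("https://rm01.xyz/books/<book-id>/1", "第1話")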