import io
import math
import os
import re
import threading
import time
from multiprocessing import Queue
from threading import Lock

import execjs
import requests
from lxml import etree
from PIL import Image

from myran import Myran
import get_jm_url
from utils.ComicInfo import comicInfo
from utils.PathStr import pathStr
from utils.CBZUtils import CBZUtils
from utils.HtmlUtils import htmlUtils

os.environ['EXECJS_RUNTIME'] = "JScript"
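# Note: the "JScript" runtime executes js/md5.js through the Windows Script
# Host, so this setting is Windows-only. PyExecJS reads EXECJS_RUNTIME from
# the environment and also accepts other runtimes such as "Node" when
# Node.js is installed.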


class Data:

    @classmethod
    def oneChapter(cls, *args):
        book_name = comicInfo.getComicName()
        chapter_name = comicInfo.getChapter()
        chapter_href = comicInfo.getWeb()
        try:
            path_album = os.path.join(pathStr.base_comic_img, book_name)
            path_photo = os.path.join(path_album, chapter_name)
            with lock:  # existence check + makedirs must hold the lock across threads
                if not os.path.exists(path_album): os.makedirs(path_album)
                if not os.path.exists(path_photo): os.makedirs(path_photo)
            cls.parse(chapter_href, path_photo, args[0])
        except Exception as e:
            print(e.__traceback__.tb_lineno, e)
            print("retrying chapter:")
            cls.oneChapter(*args)  # retry; unpack so args[0] stays the photo id

    @classmethod
    def parse(cls, rsp, path_photo, photoid):
        img_list = htmlUtils.xpathData("//div[@class='panel-body']/div/div[contains(@class,'center')]/img", url=rsp)
        pages_imgs = htmlUtils.xpathData("//div[@class='center scramble-page']/@id", url=rsp)
        comicInfo.setPages(pages_imgs)
        comicInfo.writeComicInfoXML(comicInfo.str_chapter, path=path_photo)
        for i in img_list:
            img_url = i.attrib['data-original']
            img_name = os.path.basename(img_url).split('.')[0]
            path_img = os.path.join(path_photo, "%s.jpg" % img_name)
            down_queue.put([img_url, photoid, path_img])
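

# Each down_queue item is a three-element list produced by Data.parse:
#   [image URL, album/photo id, destination file path]
# Download worker threads consume these entries until app() sets down_empty.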
class Download(threading.Thread):
    def __init__(self, thread_name):
        super().__init__()
        self.thread_name = thread_name

    def run(self):
        print("%s started!" % self.thread_name)
        while not down_empty:
            try:
                print("%s images left to download" % down_queue.qsize())
                if not down_queue.empty():
                    down = down_queue.get(False)
                else:
                    time.sleep(3)
                    down = down_queue.get(False)
                try:
                    print("down", down)
                    if not os.path.exists(down[2]):
                        # scramble_id = 220980 is a fixed value taken from the site's pages
                        if int(down[1]) > 220980:  # album id above scramble_id: image is scrambled, reassemble it
                            print("reassembling scrambled image")
                            self.pjdown(down[0], down[1], down[2])
                        else:
                            print("downloading image directly")
                            self.down_img(down[0], down[2])
                except Exception as e:
                    print(e.__traceback__.tb_lineno, e)
                    print("re-queueing:", down)
                    down_queue.put(down)
            except Exception:
                # queue.get(False) raises when the queue is still empty after
                # the sleep; loop around and re-check down_empty
                pass

    def down_img(self, url, path_img):
        headers["User-Agent"] = myran.agents()  # rotate the User-Agent on every request
        response = requests.get(url, headers=headers, proxies=proxy)
        if response.status_code == 200:
            with open(path_img, "wb") as f:
                f.write(response.content)
        else:
            print("image request failed")

    def pjdown(self, *args):
        # args: (image URL, album id, destination path)
        imgurl = args[0]
        imgpath = args[-1]
        headers["User-Agent"] = myran.agents()
        response = requests.get(imgurl, headers=headers, proxies=proxy)
        if response.status_code == 200:
            im2 = Image.open(io.BytesIO(response.content))
            self.splitimage(imgurl, args[1], imgpath, im2)

    def get_md5(self, num):
        # run the site's own md5.js via execjs so the hash matches the page's
        with open('js/md5.js', 'r') as file:
            result = file.read()
        context1 = execjs.compile(result)
        return context1.call('md5', num)

    def get_num(self, e, t):
        # Port of the site's JS that derives the strip count from the album
        # id (e) and the image name (t): 10 by default, else an even 2..20.
        a = 10
        try:
            num_dict = {}
            for i in range(10):
                num_dict[i] = i * 2 + 2
            if int(e) >= 268850:
                # JS original: switch(n=(n = (n = md5(n)).substr(-1)), n %= 10) { ... }
                n = str(e) + t
                tmp = ord(self.get_md5(n)[-1])
                a = num_dict[tmp % 10]
            return a
        except Exception as err:
            print(err.__traceback__.tb_lineno, err)
            return False
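
    # Worked example (hypothetical values): for e = "300000" and t = "00001",
    # n = "30000000001"; if md5(n) ended in "a", then ord("a") = 97,
    # 97 % 10 = 7, and num_dict[7] = 16, so the page is cut into 16 strips.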

    def splitimage(self, src, aid, imgpath, imageob=''):
        # Reassemble a scrambled page: the site cuts each page into s
        # horizontal strips and stacks them bottom-up, so pasting the strips
        # back top-down restores the original image.
        if imageob == '':
            image = Image.open(src)
        else:
            image = imageob
        w, h = image.size
        img_name = os.path.basename(src).split('.')[0]
        if self.get_num(aid, img_name):
            s = self.get_num(aid, img_name)  # number of strips
            l = h % s  # leftover rows that do not divide evenly
            box_list = []
            hz = 0
            for i in range(s):
                c = math.floor(h / s)
                g = i * c
                hz += c
                h2 = h - c * (i + 1) - l
                if i == 0:
                    c += l  # the leftover rows go with the first (bottom) strip
                    hz += l
                else:
                    g += l
                box_list.append((0, h2, w, h - g))
            # box_list.reverse()  # reversing the list would re-scramble instead
            newh = 0
            image_list = [image.crop(box) for box in box_list]
            newimage = Image.new("RGB", (w, h))
            for strip in image_list:
                b_w, b_h = strip.size
                newimage.paste(strip, (0, newh))
                newh += b_h
            newimage.save(imgpath)
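
    # splitimage geometry, worked through with hypothetical numbers:
    # h = 1003 and s = 4 give c = 250 and l = 3. The crop boxes come out as
    # (0, 750, w, 1003), (0, 500, w, 750), (0, 250, w, 500), (0, 0, w, 250),
    # i.e. the scrambled strips read bottom to top, and they are pasted at
    # y = 0, 253, 503, 753, rebuilding the full 1003-pixel-high page.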


down_queue = Queue()
data_empty = False
down_empty = False
lock = Lock()
myran = Myran()  # random User-Agent provider
headers = {
    # 'referer': 'https://18comic.org/',
    "User-Agent": myran.agents()
}
proxy = {
    # "http": "127.0.0.1:7890",
    # "https": "127.0.0.1:7890"
}
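
# Note: headers and proxy are plain module-level dicts shared by every worker
# thread; each request first overwrites headers["User-Agent"], so concurrent
# workers may occasionally send each other's freshly rotated agent string.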


def app(url):
    try:
        global data_empty, down_empty

        # The site rotates mirror domains; swap one in if the current host is stale.
        newurl_list = get_jm_url.app()
        response = ''
        if newurl_list:
            if re.findall(r'https://(.*?)/\w+/\d+/', url)[0] not in newurl_list:
                for newurl in newurl_list:
                    url = re.sub(re.findall(r'https://(.*?)/\w+/\d+/', url)[0], newurl, url)
                    response = requests.get(url=url, headers=headers, proxies=proxy)
                    break
            else:
                response = requests.get(url=url, headers=headers, proxies=proxy)
        else:
            response = requests.get(url=url, headers=headers, proxies=proxy)
        if response:
            albumid = re.search(r'/album/(\d+)', url).group(1)
            referer = re.search(r'(https://\w+\.\w+)/', url).group(1)
            print("albumid", albumid, referer, url)
            print(response.url)
            if response.status_code == 200:
                print(response.status_code)
                eth = etree.HTML(response.text)
                # collect every chapter link
                nums = eth.xpath("//div[@class='row']/div[6]/div[1]/div[1]/ul[contains(@class,'btn-toolbar')]/a")
                book_name = eth.xpath("//div[@itemprop='name']/h1[@id='book-name']/text()")[0]
                book_name = re.sub(r'[\\\/\|\(\)\~\?\.\:\:\-\*\<\>]', '', book_name)  # drop characters invalid in file names
                tags = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='tag-block']/span[@data-type='tags']/a[@class='btn btn-sm btn-primary']/text()")
                author = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='tag-block']/span[@data-type='author']/a[@class='btn btn-sm btn-primary']/text()")
                book_msg = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='p-t-5 p-b-5']/text()")
                jmid = book_msg[0]
                dep = str(book_msg[1]).replace("叙述:", "")  # strip the site's "description:" label

                comicInfo.setComicName(book_name)
                comicInfo.setAuthor(author)
                comicInfo.setDep(dep)
                comicInfo.setTags(tags)
                comicInfo.setTag(tags)
                comicInfo.setCBS("韩漫")  # publisher metadata field
                comicInfo.setLang("zh")

                if nums:
                    for i in nums:
                        photo_name_list = i.xpath("li/text()")[0].split()
                        photo_date = i.xpath("li/span/text()")[0].split()
                        try:
                            # keep a trailing CJK title fragment when the entry carries one
                            if re.findall(r'[\u4E00-\u9FA5]', photo_name_list[2]):
                                photo_name = re.sub(r'\s', '', photo_name_list[0]) + ' ' + photo_name_list[2]
                            else:
                                photo_name = re.sub(r'\s', '', photo_name_list[0])
                        except Exception:
                            photo_name = re.sub(r'\s', '', photo_name_list[0])
                        photo_name = re.sub(r'[\\\/\|\(\)\~\?\.\:\:\-\*\<\>\-]', '', photo_name)
                        photoid = i.attrib['data-album']
                        comicInfo.setChapterName(photo_name)
                        comicInfo.setDate(photo_date[0], split='-')
                        comicInfo.setWeb(referer + i.attrib['href'])
                        Data.oneChapter(photoid)
    except Exception as e:
        print(e.__traceback__.tb_lineno, e)

    # Give Data.parse time to queue images (up to ~10 s, or 100 queued)
    # before spawning the download workers.
    startime = time.perf_counter()
    while True:
        if down_queue.qsize() > 100 or time.perf_counter() - startime > 10:
            break
    print('down_queue.qsize():%s' % down_queue.qsize())
    n_threads = min(40, down_queue.qsize())
    down_list = ['download thread %s' % s for s in range(1, n_threads)]
    down_thread_list = []
    for i in down_list:
        down = Download(i)
        down.start()
        time.sleep(0.7)  # stagger thread start-up to spread out the requests
        down_thread_list.append(down)
    while not down_queue.empty():
        pass  # busy-wait until every queued image has been claimed
    down_empty = True  # signal the workers to leave their run() loops
    for down_thread in down_thread_list:
        down_thread.join()
        print("%s finished!" % down_thread.thread_name)


if __name__ == '__main__':
    # os.environ["http_proxy"] = "http://127.0.0.1:7890"
    # os.environ["https_proxy"] = "http://127.0.0.1:7890"
    app("https://18comic.vip/album/407792/")