PyComicPackRouMan/jmdowning.py
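"""Album downloader for JM comic mirrors.

Scrapes an album page for chapter metadata, writes a ComicInfo.xml per
chapter, and fetches every page image through a pool of Download threads.
Newer albums serve pages scrambled into horizontal strips; those are
reassembled by Download.splitimage() before saving.
"""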

import io
import math
import os
import re
import threading
import time
from multiprocessing import Queue
from threading import Lock

import execjs
import requests
from lxml import etree
from PIL import Image

import get_jm_url
from myran import Myran
from utils.CBZUtils import CBZUtils
from utils.ComicInfo import comicInfo
from utils.HtmlUtils import htmlUtils
from utils.PathStr import pathStr

# PyExecJS needs a JS runtime; JScript is the Windows Script Host engine.
os.environ['EXECJS_RUNTIME'] = "JScript"

class Data:
    @classmethod
    def oneChapter(cls, *args):
        book_name = comicInfo.getComicName()
        chapter_name = comicInfo.getChapter()
        chapter_href = comicInfo.getWeb()
        try:
            path_album = os.path.join(pathStr.base_comic_img, book_name)
            path_photo = os.path.join(path_album, chapter_name)
            with lock:  # existence checks and mkdir must happen under the lock
                if not os.path.exists(path_album): os.makedirs(path_album)
                if not os.path.exists(path_photo): os.makedirs(path_photo)
            cls.parse(chapter_href, path_photo, args[0])
        except Exception as e:
            print(e.__traceback__.tb_lineno, e)
            print("retrying chapter")
            cls.oneChapter(*args)

    @classmethod
    def parse(cls, rsp, path_photo, photoid):
        img_list = htmlUtils.xpathData("//div[@class='panel-body']/div/div[contains(@class,'center')]/img", url=rsp)
        pages_imgs = htmlUtils.xpathData("//div[@class='center scramble-page']/@id", url=rsp)
        comicInfo.setPages(pages_imgs)
        comicInfo.writeComicInfoXML(comicInfo.str_chapter, path=path_photo)
        for i in img_list:
            img_url = i.attrib['data-original']
            img_name = os.path.basename(img_url).split('.')[0]
            path_img = os.path.join(path_photo, "%s.jpg" % img_name)
            down_queue.put([img_url, photoid, path_img])
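
# Producer/consumer handoff: Data.parse() enqueues one job per page as
# [img_url, photo_id, local_jpg_path]; Download worker threads drain
# down_queue and either save the file directly or descramble it first.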
class Download(threading.Thread):
    def __init__(self, thread_name):
        super().__init__()
        self.thread_name = thread_name

    def run(self):
        print("%s started!" % self.thread_name)
        while not down_empty:
            try:
                print("%s images left in the queue" % down_queue.qsize())
                if not down_queue.empty():
                    down = down_queue.get(False)
                else:
                    time.sleep(3)
                    down = down_queue.get(False)
                try:
                    print("down", down)
                    if not os.path.exists(down[2]):
                        # scramble_id=220980 is fixed on the site: albums above
                        # it are scrambled, older ones can be saved directly.
                        if int(down[1]) > 220980:
                            print("reassembling scrambled image")
                            self.pjdown(down[0], down[1], down[2])
                        else:
                            print("downloading image directly")
                            self.down_img(down[0], down[2])
                except Exception as e:
                    print(e.__traceback__.tb_lineno, e)
                    print("re-queueing", down)
                    down_queue.put(down)
            except Exception:
                pass  # queue.Empty from get(False); re-check down_empty and loop

    def down_img(self, url, path_img):
        headers["User-Agent"] = myran.agents()  # rotate the UA per request
        response = requests.get(url, headers=headers, proxies=proxy)
        if response.status_code == 200:
            with open(path_img, "wb") as f:
                f.write(response.content)
        else:
            print("image request failed")

    def pjdown(self, *args):
        # Fetch a scrambled page into memory, then split and reassemble it.
        imgurl = args[0]
        imgpath = args[-1]
        headers["User-Agent"] = myran.agents()
        response = requests.get(imgurl, headers=headers, proxies=proxy)
        if response.status_code == 200:
            im2 = Image.open(io.BytesIO(response.content))
            self.splitimage(imgurl, args[1], imgpath, im2)

    def get_md5(self, num):
        # Run the site's own md5() from js/md5.js through execjs.
        with open('js/md5.js', 'r') as file:
            result = file.read()
        context1 = execjs.compile(result)
        result1 = context1.call('md5', num)
        return result1
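
    # Assuming js/md5.js implements standard MD5 (hex digest of the input
    # string), hashlib gives the same result without the execjs/JScript
    # dependency. A minimal sketch; get_md5_py is a hypothetical name:
    def get_md5_py(self, num):
        import hashlib
        return hashlib.md5(str(num).encode('utf-8')).hexdigest()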

    def get_num(self, e, t):
        # e: album id, t: image basename. Returns the strip count used to
        # scramble the page (10 by default), or False on error.
        a = 10
        try:
            num_dict = {}
            for i in range(10):
                num_dict[i] = i * 2 + 2
            if int(e) >= 268850:
                n = str(e) + t
                # JS original: switch(n=(n = (n = md5(n)).substr(-1)), n %= 10) {
                tmp = ord(self.get_md5(n)[-1])
                a = num_dict[tmp % 10]
            return a
        except Exception as err:
            print(err.__traceback__.tb_lineno, err)
            return False
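
    # Descrambling scheme as implemented above: the last hex character of
    # md5(str(album_id) + image_name), taken as a char code mod 10 and mapped
    # through {d: 2*d + 2}, yields an even strip count between 2 and 20
    # (albums below id 268850 always use 10). splitimage() below cuts those
    # strips back out and restacks them to restore the page.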
    def splitimage(self, src, aid, imgpath, imageob=''):
        if imageob == '':
            image = Image.open(src)
        else:
            image = imageob
        w, h = image.size
        img_name = os.path.basename(src).split('.')[0]
        if self.get_num(aid, img_name):
            s = self.get_num(aid, img_name)  # strip count for this page
            l = h % s  # leftover rows after an even split
            box_list = []
            hz = 0
            for i in range(s):
                c = math.floor(h / s)
                g = i * c
                hz += c
                h2 = h - c * (i + 1) - l
                if i == 0:
                    c += l
                    hz += l
                else:
                    g += l
                box_list.append((0, h2, w, h - g))
            # box_list.reverse()  # reversing the list restores the cut order
            newh = 0
            image_list = [image.crop(box) for box in box_list]
            newimage = Image.new("RGB", (w, h))
            for strip in image_list:
                b_w, b_h = strip.size
                newimage.paste(strip, (0, newh))
                newh += b_h
            newimage.save(imgpath)
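
    # Usage sketch for descrambling a locally saved page (the file names and
    # album id here are hypothetical, for illustration only):
    #
    #   d = Download("manual")
    #   d.splitimage("scrambled_page.jpg", 268851, "restored_page.jpg")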

down_queue = Queue()
data_empty = False
down_empty = False
lock = Lock()
myran = Myran()
headers = {
    #'cookie':'ipcountry=US; AVS=4eb0s4o5ho9hfmp704ge7jtium; ipm5=bb7f6ac39cebfa37e89bd07544c549fd; cover=1; guide=1; __atuvc=12|39,31|40,5|41,0|42,4|43; __atuvs=635cabf67eff0d49003; yuo1={"objName":"hT3l8Pyn15Uf","request_id":0,"zones":[{"idzone":"2967008","here":{}},{"idzone":"2967010","here":{}},{"idzone":"2967010","here":{}},{"idzone":"3597795","sub":"70","here":{}}]}',
    #'referer': 'https://18comic.org/',
    "User-Agent": myran.agents()
}
proxy = {
    # "http": "127.0.0.1:7890",
    # "https": "127.0.0.1:7890"
}

def app(url):
    try:
        global data_empty, down_empty
        newurl_list = get_jm_url.app()
        response = ''
        if newurl_list:
            # If the current domain is stale, rewrite the URL onto a live mirror.
            if re.findall(r'https://(.*?)/\w+/\d+/', url)[0] not in newurl_list:
                for newurl in newurl_list:
                    url = re.sub(re.findall(r'https://(.*?)/\w+/\d+/', url)[0], newurl, url)
                    response = requests.get(url=url, headers=headers, proxies=proxy)
                    break
            else:
                response = requests.get(url=url, headers=headers, proxies=proxy)
        else:
            response = requests.get(url=url, headers=headers, proxies=proxy)
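        # e.g. with url = "https://18comic.vip/album/407792/" and a mirror list
        # containing "18comic.org" (a hypothetical entry), the re.sub above
        # rewrites url to "https://18comic.org/album/407792/".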
        if response:
            albumid = re.search(r'/album/(\d+)', url).group(1)
            referer = re.search(r'(https://\w+\.\w+)/', url).group(1)
            print("albumid", albumid, referer, url)
            print(response.url)
            if response.status_code == 200:
                print(response.status_code)
                eth = etree.HTML(response.text)
                # Collect every chapter link on the album page.
                nums = eth.xpath("//div[@class='row']/div[6]/div[1]/div[1]/ul[contains(@class,'btn-toolbar')]/a")
                book_name = eth.xpath("//div[@itemprop='name']/h1[@id='book-name']/text()")[0]
                book_name = re.sub(r'[\\\/\|\(\)\~\?\.\:\\-\*\<\>]', '', book_name)
                tags = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='tag-block']/span[@data-type='tags']/a[@class='btn btn-sm btn-primary']/text()")
                author = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='tag-block']/span[@data-type='author']/a[@class='btn btn-sm btn-primary']/text()")
                book_msg = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='p-t-5 p-b-5']/text()")
                jmid = book_msg[0]
                dep = str(book_msg[1]).replace("叙述:", "")  # strip the "description:" label
                comicInfo.setComicName(book_name)
                comicInfo.setAuthor(author)
                comicInfo.setDep(dep)
                comicInfo.setTags(tags)
                comicInfo.setTag(tags)
                comicInfo.setCBS("韩漫")  # publisher field: "Korean manhwa"
                comicInfo.setLang("zh")
                if nums:
                    for i in nums:
                        photo_name_list = i.xpath("li/text()")[0].split()
                        photo_date = i.xpath("li/span/text()")[0].split()
                        try:
                            # Keep a trailing CJK title fragment when one exists.
                            if re.findall(r'[\u4E00-\u9FA5]', photo_name_list[2]):
                                photo_name = re.sub(r'\s', '', photo_name_list[0]) + ' ' + photo_name_list[2]
                            else:
                                photo_name = re.sub(r'\s', '', photo_name_list[0])
                        except Exception:
                            photo_name = re.sub(r'\s', '', photo_name_list[0])
                        photo_name = re.sub(r'[\\\/\|\(\)\~\?\.\:\\-\*\<\>\-]', '', photo_name)
                        photoid = i.attrib['data-album']
                        comicInfo.setChapterName(photo_name)
                        comicInfo.setDate(photo_date[0], split='-')
                        comicInfo.setWeb(referer + i.attrib['href'])
                        Data.oneChapter(photoid)
    except Exception as e:
        print(e.__traceback__.tb_lineno, e)
    startime = time.perf_counter()
    while True:
        # Give the scraper up to 10s (or 100 queued jobs) before sizing the pool.
        if down_queue.qsize() > 100 or time.perf_counter() - startime > 10:
            break
    print('down_queue.qsize():%s' % down_queue.qsize())
    # Cap the pool at 40 workers; smaller queues get one worker per image.
    down_list = ['download thread %s' % s for s in range(1, (40 if down_queue.qsize() > 40 else down_queue.qsize()) + 1)]
    down_thread_list = []
    for i in down_list:
        down = Download(i)
        down.start()
        time.sleep(0.7)
        down_thread_list.append(down)
    while not down_queue.empty():
        pass  # busy-wait until every queued job has been picked up
    down_empty = True
    for down_thread in down_thread_list:
        down_thread.join()
        print("%s finished!" % down_thread.thread_name)

if __name__ == '__main__':
    # os.environ["http_proxy"] = "http://127.0.0.1:7890"
    # os.environ["https_proxy"] = "http://127.0.0.1:7890"
    app("https://18comic.vip/album/407792/")