import io
import math
import os
import re
import threading
import time
from multiprocessing import Queue
from threading import Lock

import execjs
import requests
from lxml import etree
from PIL import Image

from myran import Myran
import get_jm_url
from utils.ComicInfo import comicInfo
from utils.PathStr import pathStr
from utils.CBZUtils import CBZUtils
from utils.HtmlUtils import htmlUtils

os.environ['EXECJS_RUNTIME'] = "JScript"
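# Note: the "JScript" runtime executes js/md5.js through the Windows Script
# Host, so this setting is Windows-only. PyExecJS reads EXECJS_RUNTIME from
# the environment and also accepts other runtimes such as "Node" when
# Node.js is installed.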


class Data:

    @classmethod
    def oneChapter(cls, *args):
        book_name = comicInfo.getComicName()
        chapter_name = comicInfo.getChapter()
        chapter_href = comicInfo.getWeb()
        try:
            path_album = os.path.join(pathStr.base_comic_img, book_name)
            path_photo = os.path.join(path_album, chapter_name)
            with lock:  # existence check + makedirs must hold the lock across threads
                if not os.path.exists(path_album): os.makedirs(path_album)
                if not os.path.exists(path_photo): os.makedirs(path_photo)
            cls.parse(chapter_href, path_photo, args[0])
        except Exception as e:
            print(e.__traceback__.tb_lineno, e)
            print("retrying chapter:")
            cls.oneChapter(*args)  # retry; unpack so args[0] stays the photo id

    @classmethod
    def parse(cls, rsp, path_photo, photoid):
        img_list = htmlUtils.xpathData("//div[@class='panel-body']/div/div[contains(@class,'center')]/img", url=rsp)
        pages_imgs = htmlUtils.xpathData("//div[@class='center scramble-page']/@id", url=rsp)
        comicInfo.setPages(pages_imgs)
        comicInfo.writeComicInfoXML(comicInfo.str_chapter, path=path_photo)
        for i in img_list:
            img_url = i.attrib['data-original']
            img_name = os.path.basename(img_url).split('.')[0]
            path_img = os.path.join(path_photo, "%s.jpg" % img_name)
            down_queue.put([img_url, photoid, path_img])
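

# Each down_queue item is a three-element list produced by Data.parse:
#   [image URL, album/photo id, destination file path]
# Download worker threads consume these entries until app() sets down_empty.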
class Download(threading.Thread):
    def __init__(self, thread_name):
        super().__init__()
        self.thread_name = thread_name

    def run(self):
        print("%s started!" % self.thread_name)
        while not down_empty:
            try:
                print("%s images left to download" % down_queue.qsize())
                if not down_queue.empty():
                    down = down_queue.get(False)
                else:
                    time.sleep(3)
                    down = down_queue.get(False)
                try:
                    print("down", down)
                    if not os.path.exists(down[2]):
                        # scramble_id = 220980 is a fixed value taken from the site's pages
                        if int(down[1]) > 220980:  # album id above scramble_id: image is scrambled, reassemble it
                            print("reassembling scrambled image")
                            self.pjdown(down[0], down[1], down[2])
                        else:
                            print("downloading image directly")
                            self.down_img(down[0], down[2])
                except Exception as e:
                    print(e.__traceback__.tb_lineno, e)
                    print("re-queueing:", down)
                    down_queue.put(down)
            except Exception:
                # queue.get(False) raises when the queue is still empty after
                # the sleep; loop around and re-check down_empty
                pass

    def down_img(self, url, path_img):
        headers["User-Agent"] = myran.agents()  # rotate the User-Agent on every request
        response = requests.get(url, headers=headers, proxies=proxy)
        if response.status_code == 200:
            with open(path_img, "wb") as f:
                f.write(response.content)
        else:
            print("image request failed")

    def pjdown(self, *args):
        # args: (image URL, album id, destination path)
        imgurl = args[0]
        imgpath = args[-1]
        headers["User-Agent"] = myran.agents()
        response = requests.get(imgurl, headers=headers, proxies=proxy)
        if response.status_code == 200:
            im2 = Image.open(io.BytesIO(response.content))
            self.splitimage(imgurl, args[1], imgpath, im2)

    def get_md5(self, num):
        # run the site's own md5.js via execjs so the hash matches the page's
        with open('js/md5.js', 'r') as file:
            result = file.read()
        context1 = execjs.compile(result)
        return context1.call('md5', num)

    def get_num(self, e, t):
        # Port of the site's JS that derives the strip count from the album
        # id (e) and the image name (t): 10 by default, else an even 2..20.
        a = 10
        try:
            num_dict = {}
            for i in range(10):
                num_dict[i] = i * 2 + 2
            if int(e) >= 268850:
                # JS original: switch(n=(n = (n = md5(n)).substr(-1)), n %= 10) { ... }
                n = str(e) + t
                tmp = ord(self.get_md5(n)[-1])
                a = num_dict[tmp % 10]
            return a
        except Exception as err:
            print(err.__traceback__.tb_lineno, err)
            return False
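
    # Worked example (hypothetical values): for e = "300000" and t = "00001",
    # n = "30000000001"; if md5(n) ended in "a", then ord("a") = 97,
    # 97 % 10 = 7, and num_dict[7] = 16, so the page is cut into 16 strips.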

    def splitimage(self, src, aid, imgpath, imageob=''):
        # Reassemble a scrambled page: the site cuts each page into s
        # horizontal strips and stacks them bottom-up, so pasting the strips
        # back top-down restores the original image.
        if imageob == '':
            image = Image.open(src)
        else:
            image = imageob
        w, h = image.size
        img_name = os.path.basename(src).split('.')[0]
        if self.get_num(aid, img_name):
            s = self.get_num(aid, img_name)  # number of strips
            l = h % s  # leftover rows that do not divide evenly
            box_list = []
            hz = 0
            for i in range(s):
                c = math.floor(h / s)
                g = i * c
                hz += c
                h2 = h - c * (i + 1) - l
                if i == 0:
                    c += l  # the leftover rows go with the first (bottom) strip
                    hz += l
                else:
                    g += l
                box_list.append((0, h2, w, h - g))
            # box_list.reverse()  # reversing the list would re-scramble instead
            newh = 0
            image_list = [image.crop(box) for box in box_list]
            newimage = Image.new("RGB", (w, h))
            for strip in image_list:
                b_w, b_h = strip.size
                newimage.paste(strip, (0, newh))
                newh += b_h
            newimage.save(imgpath)
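
    # splitimage geometry, worked through with hypothetical numbers:
    # h = 1003 and s = 4 give c = 250 and l = 3. The crop boxes come out as
    # (0, 750, w, 1003), (0, 500, w, 750), (0, 250, w, 500), (0, 0, w, 250),
    # i.e. the scrambled strips read bottom to top, and they are pasted at
    # y = 0, 253, 503, 753, rebuilding the full 1003-pixel-high page.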


down_queue = Queue()
data_empty = False
down_empty = False
lock = Lock()
myran = Myran()  # random User-Agent provider
headers = {
    # 'referer': 'https://18comic.org/',
    "User-Agent": myran.agents()
}
proxy = {
    # "http": "127.0.0.1:7890",
    # "https": "127.0.0.1:7890"
}
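
# Note: headers and proxy are plain module-level dicts shared by every worker
# thread; each request first overwrites headers["User-Agent"], so concurrent
# workers may occasionally send each other's freshly rotated agent string.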


def app(url):
    try:
        global data_empty, down_empty

        # The site rotates mirror domains; swap one in if the current host is stale.
        newurl_list = get_jm_url.app()
        response = ''
        if newurl_list:
            if re.findall(r'https://(.*?)/\w+/\d+/', url)[0] not in newurl_list:
                for newurl in newurl_list:
                    url = re.sub(re.findall(r'https://(.*?)/\w+/\d+/', url)[0], newurl, url)
                    response = requests.get(url=url, headers=headers, proxies=proxy)
                    break
            else:
                response = requests.get(url=url, headers=headers, proxies=proxy)
        else:
            response = requests.get(url=url, headers=headers, proxies=proxy)
        if response:
            albumid = re.search(r'/album/(\d+)', url).group(1)
            referer = re.search(r'(https://\w+\.\w+)/', url).group(1)
            print("albumid", albumid, referer, url)
            print(response.url)
            if response.status_code == 200:
                print(response.status_code)
                eth = etree.HTML(response.text)
                # collect every chapter link
                nums = eth.xpath("//div[@class='row']/div[6]/div[1]/div[1]/ul[contains(@class,'btn-toolbar')]/a")
                book_name = eth.xpath("//div[@itemprop='name']/h1[@id='book-name']/text()")[0]
                book_name = re.sub(r'[\\\/\|\(\)\~\?\.\:\:\-\*\<\>]', '', book_name)  # drop characters invalid in file names
                tags = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='tag-block']/span[@data-type='tags']/a[@class='btn btn-sm btn-primary']/text()")
                author = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='tag-block']/span[@data-type='author']/a[@class='btn btn-sm btn-primary']/text()")
                book_msg = eth.xpath("//div[@class='row']/div[@class='col-lg-7']/div[1]/div[@class='p-t-5 p-b-5']/text()")
                jmid = book_msg[0]
                dep = str(book_msg[1]).replace("叙述:", "")  # strip the site's "description:" label

                comicInfo.setComicName(book_name)
                comicInfo.setAuthor(author)
                comicInfo.setDep(dep)
                comicInfo.setTags(tags)
                comicInfo.setTag(tags)
                comicInfo.setCBS("韩漫")  # publisher metadata field
                comicInfo.setLang("zh")

                if nums:
                    for i in nums:
                        photo_name_list = i.xpath("li/text()")[0].split()
                        photo_date = i.xpath("li/span/text()")[0].split()
                        try:
                            # keep a trailing CJK title fragment when the entry carries one
                            if re.findall(r'[\u4E00-\u9FA5]', photo_name_list[2]):
                                photo_name = re.sub(r'\s', '', photo_name_list[0]) + ' ' + photo_name_list[2]
                            else:
                                photo_name = re.sub(r'\s', '', photo_name_list[0])
                        except Exception:
                            photo_name = re.sub(r'\s', '', photo_name_list[0])
                        photo_name = re.sub(r'[\\\/\|\(\)\~\?\.\:\:\-\*\<\>\-]', '', photo_name)
                        photoid = i.attrib['data-album']
                        comicInfo.setChapterName(photo_name)
                        comicInfo.setDate(photo_date[0], split='-')
                        comicInfo.setWeb(referer + i.attrib['href'])
                        Data.oneChapter(photoid)
    except Exception as e:
        print(e.__traceback__.tb_lineno, e)

    # Give Data.parse time to queue images (up to ~10 s, or 100 queued)
    # before spawning the download workers.
    startime = time.perf_counter()
    while True:
        if down_queue.qsize() > 100 or time.perf_counter() - startime > 10:
            break
    print('down_queue.qsize():%s' % down_queue.qsize())
    n_threads = min(40, down_queue.qsize())
    down_list = ['download thread %s' % s for s in range(1, n_threads)]
    down_thread_list = []
    for i in down_list:
        down = Download(i)
        down.start()
        time.sleep(0.7)  # stagger thread start-up to spread out the requests
        down_thread_list.append(down)
    while not down_queue.empty():
        pass  # busy-wait until every queued image has been claimed
    down_empty = True  # signal the workers to leave their run() loops
    for down_thread in down_thread_list:
        down_thread.join()
        print("%s finished!" % down_thread.thread_name)


if __name__ == '__main__':
    # os.environ["http_proxy"] = "http://127.0.0.1:7890"
    # os.environ["https_proxy"] = "http://127.0.0.1:7890"
    app("https://18comic.vip/album/407792/")