commit b59e241fa1 (parent 43569ed246)
caiwx86 committed 2023-05-16 06:46:10 +08:00
19 changed files with 380 additions and 41 deletions

.gitignore (vendored, new file, +3 lines)
View File

@@ -0,0 +1,3 @@
+.scrapy/*
+images/*
+/**/__pycache__

View File

@@ -26,7 +26,7 @@ class ComicItem(scrapy.Item):
     genre = scrapy.Field()
     age_rating = scrapy.Field()
 
-class DownImagesItem(scrapy.Item):
+class ImageItem(scrapy.Item):
     image_name = scrapy.Field()
     image_url = scrapy.Field()
     image_path = scrapy.Field()
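
For reference, ImageItem instances are consumed dict-style by the pipelines later in this commit. A minimal usage sketch (all values are placeholders, not taken from the crawler):

    from Comics.items import ImageItem

    item = ImageItem(
        image_name="001.jpg",                         # placeholder values
        image_url="https://example.invalid/001.jpg",
        image_path="comic-name/chapter-01/001.jpg",
    )
    print(item["image_url"])  # scrapy.Item supports dict-style access to declared fields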

View File

@@ -4,10 +4,14 @@
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from scrapy import signals
+import random
+from Comics.settings import PROXY_LIST
 
 # useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
 
+class ProxyMiddleware(object):
+    def process_request(self, request, spider):
+        request.meta["proxy"] = random.choice(PROXY_LIST)
+
 class ComicsSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
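
A minimal, self-contained sketch of what the new ProxyMiddleware does (not part of the commit; PROXY_LIST here is a stand-in for Comics.settings.PROXY_LIST). Scrapy's built-in HttpProxyMiddleware, enabled at priority 400 in settings.py, applies the proxy placed in request.meta:

    import random

    PROXY_LIST = ["http://127.0.0.1:7890"]  # stand-in for Comics.settings.PROXY_LIST

    class ProxyMiddleware:
        def process_request(self, request, spider):
            # Pick a random proxy; HttpProxyMiddleware (priority 400) will use it.
            request.meta["proxy"] = random.choice(PROXY_LIST)

    # To rotate proxies per request it would be listed in DOWNLOADER_MIDDLEWARES at a
    # priority below 400; this commit keeps that entry commented out and enables only
    # HttpProxyMiddleware.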

View File

@@ -5,6 +5,14 @@
 # useful for handling different item types with a single interface
+import os,requests,re,scrapy,logging
+from Comics import settings
+from Comics.spiders.utils.FileUtils import imageUtils
+from Comics.spiders.utils.Constant import ComicPath
+from Comics.items import ComicItem
+from Comics.items import ImageItem
+from scrapy.pipelines.files import FilesPipeline
+from scrapy.pipelines.images import ImagesPipeline
 from itemadapter import ItemAdapter
 from scrapy.pipelines.images import ImagesPipeline
@@ -15,7 +23,8 @@ class ComicsPipeline:
     def comic_process(self,images):
         count = 1
         scramble_count = 0
-        (files_name,images_url) = [[],[]]
+        #(files_name,images_url) = [[],[]]
+        list_image_item = []
         for image in images:
             (image_src,scramble) = [image.get("src"),image.get("scramble")]
             count_image = "{:0>3d}".format(count)
@@ -23,23 +32,71 @@ class ComicsPipeline:
             image_file_name = count_image+image_src_suffix
             if scramble:
                 de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
-                #blocks_num = imageUtils.encodeImage(de_str)
-                #image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
+                blocks_num = imageUtils.encodeImage(de_str)
+                image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
                 scramble_count += 1
-            files_name.append(image_file_name)
-            images_url.append(image_src)
+            #files_name.append(image_file_name)
+            #images_url.append(image_src)
             #downUtils.putDownImageUrlDirFile(image_src,ComicPath.getDirComicChapter(),image_file_name)
+            list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_file_name))
+            yield ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_file_name)
             count+=1
+        #yield list_image_item
 
     # item is the object yielded by the spider
     def process_item(self, item, spider):
         self.fp.write(str(item))
+        self.comic_process(item["list_img"])
         return item
 
+    # image parsing
     def close_spider(self,spider):
         self.fp.close()
 
-class ImgDownloadPipeline(ImagesPipeline):
-    def get_
+class ImageParsePipeline:
+    def process_item(self, item, spider):
+        if isinstance(item, ComicItem):
+            list_img = item['list_img']
+            count = 1
+            scramble_count = 0
+            #(files_name,images_url) = [[],[]]
+            list_image_item = []
+            for image in list_img:
+                (image_src,scramble) = [image.get("src"),image.get("scramble")]
+                count_image = "{:0>3d}".format(count)
+                image_src_suffix = "."+str(image_src).split(".")[-1]
+                image_file_name = count_image+image_src_suffix
+                if scramble:
+                    de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
+                    blocks_num = imageUtils.encodeImage(de_str)
+                    scramble_image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
+                    scramble_count += 1
+                #files_name.append(image_file_name)
+                #images_url.append(image_src)
+                #downUtils.putDownImageUrlDirFile(image_src,ComicPath.getDirComicChapter(),image_file_name)
+                image_path = os.path.join(item['name'],item['chapter'],scramble_image_file_name)
+                image_path = ComicPath.ChineseConvert(image_path)
+                list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_path))
+                #ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_file_name)
+                count+=1
+            return list_image_item
+
+class ImgDownloadPipeline(ImagesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        image = request.meta['item']
+        return image['image_path']
+        #return '%s/%s' % (name,chapter)
+
+    def get_media_requests(self, item, info):
+        for image in item:
+            host = re.sub(r'(http://|https://)', '', image['image_url']).split('/')[0]
+            yield scrapy.Request(url= image['image_url'], meta= {'item' : image})
+
+    def item_completed(self, results, item, info):
+        if len(results) == len(item):
+            for image in results:
+                success = image[0]
+                img = image[1]
+                img_path = os.path.join(settings.IMAGES_STORE,img['path'])
+                # descramble the downloaded image
+                imageUtils.deScrambleImagesByPath(img_path)
+        return item
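
For readers unfamiliar with the ImagesPipeline hooks customised above, here is a compact, self-contained sketch of the same pattern (illustrative only; it assumes a single item with image_url/image_path fields rather than the list that ImageParsePipeline returns):

    import os
    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline

    class SketchImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # One download request per item; carry the item so file_path() can use it.
            yield Request(item["image_url"], meta={"item": item})

        def file_path(self, request, response=None, info=None, *, item=None):
            # Store the file under the path chosen upstream, relative to IMAGES_STORE.
            return request.meta["item"]["image_path"]

        def item_completed(self, results, item, info):
            # results is a list of (success, file_info_or_failure) tuples; the stored
            # file's path (relative to IMAGES_STORE) is where descrambling hooks in.
            for success, file_info in results:
                if success:
                    full_path = os.path.join("images", file_info["path"])  # IMAGES_STORE = 'images'
            return item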

View File

@@ -6,6 +6,7 @@
 # https://docs.scrapy.org/en/latest/topics/settings.html
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+from fake_useragent import UserAgent
 
 BOT_NAME = 'Comics'
@@ -15,7 +16,7 @@ NEWSPIDER_MODULE = 'Comics.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Comics (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'
+USER_AGENT = UserAgent().random
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -28,11 +29,18 @@ ROBOTSTXT_OBEY = False
 IMAGES_URLS_FIELD = "image_url"
 IMAGES_RESULT_FIELD = "image_path"
 IMAGES_STORE = 'images'
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 20
+# retry settings
+RETRY_ENABLED = True
+RETRY_TIMES = 10  # set this to however many retries you want
+# the next line is optional
+#RETRY_HTTP_CODES = [500, 502, 503, 504, 408]
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
+PROXY_LIST = [
+    "http://127.0.0.1:7890",
+]
 
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = False
@@ -40,21 +48,26 @@ COOKIES_ENABLED = False
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-DEFAULT_REQUEST_HEADERS = {
-  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-  'Accept-Language': 'en',
-}
+#DEFAULT_REQUEST_HEADERS = {
+#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#  'Accept-Language': 'en',
+#}
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
 #    'Comics.middlewares.ComicsSpiderMiddleware': 543,
+#    'Comics.middlewares.ProxyMiddleware' : 100,
+#    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
 #}
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'Comics.middlewares.ComicsDownloaderMiddleware': 543,
+    # 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
+    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
+    # 'Comics.middlewares.ProxyMiddleware' : 100,
+    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
 }
 
 # Enable or disable extensions
@@ -67,20 +80,22 @@ DOWNLOADER_MIDDLEWARES = {
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
     'Comics.pipelines.ComicsPipeline': 300,
+    'Comics.pipelines.ImageParsePipeline': 400,
+    'Comics.pipelines.ImgDownloadPipeline': 500,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
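
One note on the user-agent change: USER_AGENT = UserAgent().random selects a single random user agent once, when settings.py is imported, so every request in a crawl shares it. A minimal sketch (not part of the commit; the class name is hypothetical) of rotating the UA per request with the same fake_useragent library:

    from fake_useragent import UserAgent

    class RandomUserAgentMiddleware:
        def __init__(self):
            self.ua = UserAgent()

        def process_request(self, request, spider):
            # Overwrite the User-Agent header on every outgoing request.
            request.headers["User-Agent"] = self.ua.random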

View File

@@ -1,6 +1,26 @@
-import scrapy,json
+import scrapy,json,requests
 from Comics.items import ComicItem
-from Comics.spiders.utils.CommonUtils import CommonUtils
+from Comics.spiders.utils.FileUtils import CommonUtils
+import threading
+import toml
+
+class ErrorLog:
+    def __init__(self) -> None:
+        self.lock = threading.Lock()
+
+    def err_ls(self, dic):
+        self.lock.acquire()
+        with open('error.toml', 'r+t') as f:
+            data = toml.load('error.toml')
+            f.seek(0, 0)
+            f.truncate()
+            dic_name = f'err_{len(data)}'
+            data[dic_name] = dic
+            _ = toml.dump(data, f)
+        self.lock.release()
+
+error_logger = ErrorLog()
+
 class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
@@ -9,7 +29,7 @@ class RmComicSpider(scrapy.Spider):
     #start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
 
     def start_requests(self):
-        yield scrapy.Request(self.main_url + '/books/63b65185-f798-4c8f-a0b0-8811615908fd', callback=self.parse_comic)
+        yield scrapy.Request(self.main_url + '/books/0a7e8bd1-4cfa-481a-b067-1df663fb2017', callback=self.parse_comic)
 
     def parse_comic(self, response):
         comic = ComicItem()
@@ -21,9 +41,8 @@ class RmComicSpider(scrapy.Spider):
         comic['date'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()').extract()[1]
         comic['chapters'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()').extract()
         comic['chapter_href'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href').extract()
-        list_img = []
         for link in comic['chapter_href']:
-            yield list_img.append(scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err))
+            yield scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err)
 
     def err(self):
         print("Error=====")
@@ -37,8 +56,18 @@ class RmComicSpider(scrapy.Spider):
         description = CommonUtils.parseExec(data,str_exec+"description")
         images = CommonUtils.parseExec(data,str_exec+"images")
         chapter_api_url = CommonUtils.parseExec(data,str_exec+"chapterAPIPath")
+        item['chapter'] = chapterName
         item['list_img'] = images
-        yield item
+        if chapter_api_url != None:
+            yield scrapy.Request(url=self.main_url+chapter_api_url,meta={'item' : item}, callback=self.parse_chapter_api, errback=self.err)
+        else:
+            item['list_img'] = images
+            yield item
+
+    def parse_chapter_api(self,response,item):
+        data = response.meta['item']
+        print(item)
+        return response
 
     def parse(self, response):
         raise NotImplementedError
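
A usage sketch (not in the commit) of the ErrorLog helper added above. It assumes error.toml already exists (the open(..., 'r+t') call fails otherwise) and appends one err_<n> table per call; the module path is assumed from the spider name:

    # Seed the file once before crawling, e.g.:  open('error.toml', 'a').close()
    from Comics.spiders.rm_comic import error_logger  # assumed module path

    error_logger.err_ls({"url": "https://example.invalid/books/xxx",  # hypothetical failure
                         "reason": "timeout"})

    # error.toml then gains a table like:
    # [err_0]
    # url = "https://example.invalid/books/xxx"
    # reason = "timeout"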

View File

@@ -1,11 +0,0 @@
-import json
-
-class CommonUtils:
-    @classmethod
-    def parseExec(cls,data,exec):
-        if data !=None and exec != None:
-            dots = str(exec).split(".")
-            if not isinstance(data,dict): data = json.loads(data)
-            for dot in dots:
-                data = data.get(dot)
-            return data

View File

@@ -0,0 +1,16 @@
+from opencc import OpenCC
+
+class ComicPath:
+    @classmethod
+    def getDirComicChapter(cls):
+        return None
+
+    @classmethod
+    def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return "scramble="+str(block)+"_"+str(count)+suffix
+
+    @classmethod
+    def getFileScrambleImageSave(cls,file): return str(file).split("_")[-1]
+
+    # convert Traditional Chinese to Simplified Chinese
+    @classmethod
+    def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
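
A quick usage sketch (not in the commit) of the ComicPath helpers above: getFileScrambleImageName encodes the scramble block count into a temporary filename, getFileScrambleImageSave recovers the final name from it, and ChineseConvert applies OpenCC's t2s conversion:

    from Comics.spiders.utils.Constant import ComicPath

    name = ComicPath.getFileScrambleImageName(count="001", block=7, suffix=".jpg")
    print(name)                                       # scramble=7_001.jpg
    print(ComicPath.getFileScrambleImageSave(name))   # 001.jpg
    print(ComicPath.ChineseConvert("漫畫名稱/第一話"))  # 漫画名称/第一话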

View File

@@ -0,0 +1,226 @@
+import base64,hashlib,os,shutil
+import math,time,json,datetime,logging
+from PIL import Image
+from tinydb import TinyDB, Query
+from Comics.spiders.utils.Constant import ComicPath
+
+class CommonUtils:
+    @classmethod
+    def parseExec(cls,data,exec):
+        if data !=None and exec != None:
+            dots = str(exec).split(".")
+            if not isinstance(data,dict): data = json.loads(data)
+            for dot in dots:
+                data = data.get(dot)
+            return data
+
+class imageUtils:
+    @classmethod
+    def deScrambleImagesByDir(cls,chapter_dir):
+        scramble_count = 0
+        if os.path.exists(chapter_dir): # collect the chapter's image paths
+            dirs = os.listdir(chapter_dir)
+            for img in dirs:
+                if img.startswith("scramble="):
+                    imageUtils.encode_scramble_image(os.path.join(chapter_dir,img))
+                    scramble_count += 1
+        logging.debug(f"scramble= {scramble_count}")
+        return scramble_count
+
+    @classmethod
+    def deScrambleImagesByPath(cls,img_path,img_save=None):
+        if os.path.basename(img_path).startswith("scramble="):
+            imageUtils.encode_scramble_image(img_path,img_save)
+            return True
+        else:
+            return False
+
+    @classmethod
+    def encodeImage(cls,str_en):
+        #print("en",str_en)
+        enc = base64.b64decode(str_en)
+        #print("decoded:",enc)
+        m = hashlib.md5()
+        m.update(enc)
+        md5 = m.digest()
+        d = md5[-1]
+        #print(md5)
+        try:
+            blocks = d % 10 + 5
+        except:
+            blocks = 0 % 10 + 5
+        #print("blocks=",blocks)
+        return blocks
+    @classmethod
+    def scrambleImage(cls,file_path):
+        # a partially downloaded image was detected: delete it and return None
+        if str(file_path).endswith(".downloads"):
+            os.remove(file_path)
+            return None
+        file_str = str(file_path).split("=")
+        #10_29.jpg
+        base_dir = file_str[0].replace("scramble","")
+        base_name = file_str[-1]
+        base_fn = base_name.split("_")
+        save_name = base_fn[1]
+        save_name_delesu = save_name.split(".")[0]
+        blocks = int(base_fn[0])
+        save_file_path = os.path.join(base_dir,save_name)
+        print("sva",save_file_path)
+        if os.path.exists(save_file_path):
+            print("image already descrambled, skipping:", save_file_path)
+            return None
+        image_su = str(file_path).split(".")[-1]
+        try:
+            img = Image.open(file_path)
+        except:
+            print(f"error Image: {file_path}")
+        width = img.width
+        height = img.height
+        #blocks = cls.encodeImage(enStr)
+        print("blocks=",blocks)
+        block_height = int(height / blocks)
+        block_width = int(width / blocks)
+        print("blockHeight=",block_height)
+        suffix = str(file_path).split(".")[-1]
+        split_path = os.path.join(base_dir,save_name_delesu+"split")
+        if image_su == "downloads":
+            return None
+        is_split = cls.splitimage(file_path,blocks,1,split_path)
+        if is_split != None:
+            cls.image_compose(split_path,blocks,1,save_file_path,block_height,width)
+        else:
+            if os.path.exists(split_path):
+                shutil.rmtree(split_path)
+            if os.path.exists(file_path):
+                shutil.move(file_path, save_file_path)
+        # clean up when done
+        return file_path
+    @classmethod
+    def splitimage(cls,src,rownum,colnum,dstpath):
+        img=Image.open(src)
+        w,h=img.size
+        if rownum<= h and colnum<=w:
+            s=os.path.split(src)
+            if dstpath=='':
+                dstpath = s[0]
+            if not os.path.exists(dstpath):
+                os.makedirs(dstpath)
+            fn=s[1].split('.')
+            basename=fn[0]
+            ext=fn[-1]
+            num=0
+            rowheight=h//rownum
+            colwidth=w//colnum
+            for r in range(rownum):
+                for c in range(colnum):
+                    box=(c*colwidth,r*rowheight,(c+1)*colwidth,(r+1)*rowheight)
+                    count_image = "{:0>3d}".format(num)
+                    file_path = os.path.join(dstpath,str(count_image)+'.'+ext)
+                    print("file_path=",file_path)
+                    img.crop(box).save(file_path)
+                    num=num+1
+            return "success"
+        else:
+            print('invalid split parameters!')
+            return None
+    @classmethod
+    def image_compose(cls,src,row,column,save_path,image_height,image_width):
+        image_size = image_height
+        #image_height = 376
+        #image_width = 720
+        images_format = ['.png','.jpg']
+        #image_names = [name for name in os.listdir(src) for item in images_format if
+        #               os.path.splitext(name)[1] == item][::-1]
+        img_list=os.listdir(src)
+        img_list.sort()
+        img_list.sort(key=lambda x: int(x[:-4]))
+        ## sort file names numerically
+        img_nums=len(img_list)
+        image_names = []
+        for i in range(img_nums):
+            img_name=os.path.join(src,img_list[i])
+            image_names.append(img_name)
+        # use reverse order
+        image_names = image_names[::-1]
+        # simple sanity check: the parameters must match the number of images found
+        if len(image_names) < row * column:
+            raise ValueError("the compose parameters do not match the number of images!")
+        to_image = Image.new('RGB', (column * image_width, row * image_height)) # create the output image
+        # loop over the grid and paste each tile into its position
+        for y in range(1, row + 1):
+            for x in range(1, column + 1):
+                #1 * (row=1 -1) col=1 -1
+                image_path = image_names[column * (y - 1) + x - 1]
+                print("split_image=",image_path)
+                from_image = Image.open(image_path)
+                # keep the original tile size
+                #.resize(
+                #    (image_size, image_size),Image.ANTIALIAS)
+                to_image.paste(from_image, ((x - 1) * image_size, (y - 1) * image_size))
+                from_image.close()
+        to_image.save(save_path)
+        print("image composed:", save_path)
+        shutil.rmtree(src)
+        # save the new image
+    @classmethod
+    def getScrambleImage(cls,path):
+        scramble_file_cache = cls.scrambleImage(path)
+        if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
+
+    @classmethod
+    def encode_scramble_image(cls,imgpath,img_save=None):
+        image = Image.open(imgpath)
+        w, h = image.size
+        #image.show()
+        file_str = str(imgpath).split("=")
+        #10_29.jpg
+        base_fn = file_str[-1].split("_")
+        blocks = int(base_fn[0])
+        if img_save == None:
+            save_path = os.path.join(os.path.dirname(imgpath),ComicPath.getFileScrambleImageSave(imgpath))
+        else: save_path = img_save
+        # print(type(aid),type(img_name))
+        if blocks:
+            s = blocks # random value (block count)
+            # print(s)
+            l = h % s # leftover pixels after the final slice
+            box_list = []
+            hz = 0
+            for i in range(s):
+                c = math.floor(h / s)
+                g = i * c
+                hz += c
+                h2 = h - c * (i + 1) - l
+                if i == 0:
+                    c += l;hz += l
+                else:
+                    g += l
+                box_list.append((0, h2, w, h - g))
+            # print(box_list,len(box_list))
+            item_width = w
+            # box_list.reverse() # reversing the list would restore the original slice order
+            # print(box_list, len(box_list))
+            newh = 0
+            image_list = [image.crop(box) for box in box_list]
+            # print(box_list)
+            newimage = Image.new("RGB", (w, h))
+            for image in image_list:
+                # image.show()
+                b_w, b_h = image.size
+                newimage.paste(image, (0, newh))
+                newh += b_h
+            newimage.save(save_path)
+            print("descrambled=",save_path)
+            if os.path.exists(imgpath):
+                os.remove(imgpath)
+                print("remove=",imgpath)

View File