commit b59e241fa1 (parent 43569ed246)
caiwx86 committed 2023-05-16 06:46:10 +08:00
19 changed files with 380 additions and 41 deletions

.gitignore (vendored, new file, +3 lines)
View File

@@ -0,0 +1,3 @@
+.scrapy/*
+images/*
+/**/__pycache__

View File

@@ -26,7 +26,7 @@ class ComicItem(scrapy.Item):
     genre = scrapy.Field()
     age_rating = scrapy.Field()
 
-class DownImagesItem(scrapy.Item):
+class ImageItem(scrapy.Item):
     image_name = scrapy.Field()
     image_url = scrapy.Field()
     image_path = scrapy.Field()
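
For reference, ImageItem instances are consumed dict-style by the pipelines later in this commit. A minimal usage sketch (all values are placeholders, not taken from the crawler):

    from Comics.items import ImageItem

    item = ImageItem(
        image_name="001.jpg",                         # placeholder values
        image_url="https://example.invalid/001.jpg",
        image_path="comic-name/chapter-01/001.jpg",
    )
    print(item["image_url"])  # scrapy.Item supports dict-style access to declared fields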

View File

@@ -4,10 +4,14 @@
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from scrapy import signals
+import random
+from Comics.settings import PROXY_LIST
 
 # useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
 
+class ProxyMiddleware(object):
+    def process_request(self, request, spider):
+        request.meta["proxy"] = random.choice(PROXY_LIST)
+
 class ComicsSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
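
A minimal, self-contained sketch of what the new ProxyMiddleware does (not part of the commit; PROXY_LIST here is a stand-in for Comics.settings.PROXY_LIST). Scrapy's built-in HttpProxyMiddleware, enabled at priority 400 in settings.py, applies the proxy placed in request.meta:

    import random

    PROXY_LIST = ["http://127.0.0.1:7890"]  # stand-in for Comics.settings.PROXY_LIST

    class ProxyMiddleware:
        def process_request(self, request, spider):
            # Pick a random proxy; HttpProxyMiddleware (priority 400) will use it.
            request.meta["proxy"] = random.choice(PROXY_LIST)

    # To rotate proxies per request it would be listed in DOWNLOADER_MIDDLEWARES at a
    # priority below 400; this commit keeps that entry commented out and enables only
    # HttpProxyMiddleware.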

View File

@@ -5,6 +5,14 @@
 # useful for handling different item types with a single interface
+import os,requests,re,scrapy,logging
+from Comics import settings
+from Comics.spiders.utils.FileUtils import imageUtils
+from Comics.spiders.utils.Constant import ComicPath
+from Comics.items import ComicItem
+from Comics.items import ImageItem
+from scrapy.pipelines.files import FilesPipeline
+from scrapy.pipelines.images import ImagesPipeline
 from itemadapter import ItemAdapter
 from scrapy.pipelines.images import ImagesPipeline
@@ -15,7 +23,8 @@ class ComicsPipeline:
     def comic_process(self,images):
         count = 1
         scramble_count = 0
-        (files_name,images_url) = [[],[]]
+        #(files_name,images_url) = [[],[]]
+        list_image_item = []
         for image in images:
             (image_src,scramble) = [image.get("src"),image.get("scramble")]
             count_image = "{:0>3d}".format(count)
@@ -23,23 +32,71 @@ class ComicsPipeline:
             image_file_name = count_image+image_src_suffix
             if scramble:
                 de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
-                #blocks_num = imageUtils.encodeImage(de_str)
-                #image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
+                blocks_num = imageUtils.encodeImage(de_str)
+                image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
                 scramble_count += 1
-            files_name.append(image_file_name)
-            images_url.append(image_src)
+            #files_name.append(image_file_name)
+            #images_url.append(image_src)
             #downUtils.putDownImageUrlDirFile(image_src,ComicPath.getDirComicChapter(),image_file_name)
+            list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_file_name))
+            yield ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_file_name)
             count+=1
+        #yield list_image_item
 
     # item is the object yielded by the spider
     def process_item(self, item, spider):
         self.fp.write(str(item))
+        self.comic_process(item["list_img"])
         return item
 
+    # image parsing
     def close_spider(self,spider):
         self.fp.close()
 
-class ImgDownloadPipeline(ImagesPipeline):
-    def get_
+class ImageParsePipeline:
+    def process_item(self, item, spider):
+        if isinstance(item, ComicItem):
+            list_img = item['list_img']
+            count = 1
+            scramble_count = 0
+            #(files_name,images_url) = [[],[]]
+            list_image_item = []
+            for image in list_img:
+                (image_src,scramble) = [image.get("src"),image.get("scramble")]
+                count_image = "{:0>3d}".format(count)
+                image_src_suffix = "."+str(image_src).split(".")[-1]
+                image_file_name = count_image+image_src_suffix
+                if scramble:
+                    de_str = str(image_src).split("/")[-1].replace(image_src_suffix,"==")
+                    blocks_num = imageUtils.encodeImage(de_str)
+                    scramble_image_file_name = ComicPath.getFileScrambleImageName(count=count_image,block=blocks_num,suffix=image_src_suffix)
+                    scramble_count += 1
+                #files_name.append(image_file_name)
+                #images_url.append(image_src)
+                #downUtils.putDownImageUrlDirFile(image_src,ComicPath.getDirComicChapter(),image_file_name)
+                image_path = os.path.join(item['name'],item['chapter'],scramble_image_file_name)
+                image_path = ComicPath.ChineseConvert(image_path)
+                list_image_item.append(ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_path))
+                #ImageItem(image_name=image_file_name,image_url=image_src,image_path=image_file_name)
+                count+=1
+            return list_image_item
+
+class ImgDownloadPipeline(ImagesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        image = request.meta['item']
+        return image['image_path']
+        #return '%s/%s' % (name,chapter)
+
+    def get_media_requests(self, item, info):
+        for image in item:
+            host = re.sub(r'(http://|https://)', '', image['image_url']).split('/')[0]
+            yield scrapy.Request(url= image['image_url'], meta= {'item' : image})
+
+    def item_completed(self, results, item, info):
+        if len(results) == len(item):
+            for image in results:
+                success = image[0]
+                img = image[1]
+                img_path = os.path.join(settings.IMAGES_STORE,img['path'])
+                # descramble the downloaded image
+                imageUtils.deScrambleImagesByPath(img_path)
+        return item
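
For readers unfamiliar with the ImagesPipeline hooks customised above, here is a compact, self-contained sketch of the same pattern (illustrative only; it assumes a single item with image_url/image_path fields rather than the list that ImageParsePipeline returns):

    import os
    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline

    class SketchImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # One download request per item; carry the item so file_path() can use it.
            yield Request(item["image_url"], meta={"item": item})

        def file_path(self, request, response=None, info=None, *, item=None):
            # Store the file under the path chosen upstream, relative to IMAGES_STORE.
            return request.meta["item"]["image_path"]

        def item_completed(self, results, item, info):
            # results is a list of (success, file_info_or_failure) tuples; the stored
            # file's path (relative to IMAGES_STORE) is where descrambling hooks in.
            for success, file_info in results:
                if success:
                    full_path = os.path.join("images", file_info["path"])  # IMAGES_STORE = 'images'
            return item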

View File

@@ -6,6 +6,7 @@
 # https://docs.scrapy.org/en/latest/topics/settings.html
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+from fake_useragent import UserAgent
 
 BOT_NAME = 'Comics'
@@ -15,7 +16,7 @@ NEWSPIDER_MODULE = 'Comics.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Comics (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'
+USER_AGENT = UserAgent().random
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -28,11 +29,18 @@ ROBOTSTXT_OBEY = False
 IMAGES_URLS_FIELD = "image_url"
 IMAGES_RESULT_FIELD = "image_path"
 IMAGES_STORE = 'images'
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 20
+# retry settings
+RETRY_ENABLED = True
+RETRY_TIMES = 10  # set this to however many retries you want
+# the next line is optional
+#RETRY_HTTP_CODES = [500, 502, 503, 504, 408]
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
+PROXY_LIST = [
+    "http://127.0.0.1:7890",
+]
 
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = False
@@ -40,21 +48,26 @@ COOKIES_ENABLED = False
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-DEFAULT_REQUEST_HEADERS = {
-  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-  'Accept-Language': 'en',
-}
+#DEFAULT_REQUEST_HEADERS = {
+#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#  'Accept-Language': 'en',
+#}
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
 #    'Comics.middlewares.ComicsSpiderMiddleware': 543,
+#    'Comics.middlewares.ProxyMiddleware' : 100,
+#    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
 #}
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'Comics.middlewares.ComicsDownloaderMiddleware': 543,
+    # 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
+    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
+    # 'Comics.middlewares.ProxyMiddleware' : 100,
+    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
 }
 
 # Enable or disable extensions
@@ -67,20 +80,22 @@ DOWNLOADER_MIDDLEWARES = {
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
     'Comics.pipelines.ComicsPipeline': 300,
+    'Comics.pipelines.ImageParsePipeline': 400,
+    'Comics.pipelines.ImgDownloadPipeline': 500,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
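
One note on the user-agent change: USER_AGENT = UserAgent().random selects a single random user agent once, when settings.py is imported, so every request in a crawl shares it. A minimal sketch (not part of the commit; the class name is hypothetical) of rotating the UA per request with the same fake_useragent library:

    from fake_useragent import UserAgent

    class RandomUserAgentMiddleware:
        def __init__(self):
            self.ua = UserAgent()

        def process_request(self, request, spider):
            # Overwrite the User-Agent header on every outgoing request.
            request.headers["User-Agent"] = self.ua.random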

View File

@@ -1,6 +1,26 @@
-import scrapy,json
+import scrapy,json,requests
 from Comics.items import ComicItem
-from Comics.spiders.utils.CommonUtils import CommonUtils
+from Comics.spiders.utils.FileUtils import CommonUtils
+import threading
+import toml
+
+class ErrorLog:
+    def __init__(self) -> None:
+        self.lock = threading.Lock()
+
+    def err_ls(self, dic):
+        self.lock.acquire()
+        with open('error.toml', 'r+t') as f:
+            data = toml.load('error.toml')
+            f.seek(0, 0)
+            f.truncate()
+            dic_name = f'err_{len(data)}'
+            data[dic_name] = dic
+            _ = toml.dump(data, f)
+        self.lock.release()
+
+error_logger = ErrorLog()
+
 class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
@@ -9,7 +29,7 @@ class RmComicSpider(scrapy.Spider):
     #start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
 
     def start_requests(self):
-        yield scrapy.Request(self.main_url + '/books/63b65185-f798-4c8f-a0b0-8811615908fd', callback=self.parse_comic)
+        yield scrapy.Request(self.main_url + '/books/0a7e8bd1-4cfa-481a-b067-1df663fb2017', callback=self.parse_comic)
 
     def parse_comic(self, response):
         comic = ComicItem()
@@ -21,9 +41,8 @@ class RmComicSpider(scrapy.Spider):
         comic['date'] = response.xpath('//div[contains(@class,"bookid_bookInfo")]/p[5]/small/text()').extract()[1]
         comic['chapters'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/text()').extract()
         comic['chapter_href'] = response.xpath('//div[contains(@class,"bookid_chapterBox")]//div[contains(@class,"bookid_chapter")]/a/@href').extract()
-        list_img = []
         for link in comic['chapter_href']:
-            yield list_img.append(scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err))
+            yield scrapy.Request(self.main_url+link,meta={'item' : comic} , callback=self.parse_chapter,errback=self.err)
 
     def err(self):
         print("Error=====")
@@ -37,8 +56,18 @@ class RmComicSpider(scrapy.Spider):
         description = CommonUtils.parseExec(data,str_exec+"description")
         images = CommonUtils.parseExec(data,str_exec+"images")
         chapter_api_url = CommonUtils.parseExec(data,str_exec+"chapterAPIPath")
+        item['chapter'] = chapterName
         item['list_img'] = images
-        yield item
+        if chapter_api_url != None:
+            yield scrapy.Request(url=self.main_url+chapter_api_url,meta={'item' : item}, callback=self.parse_chapter_api, errback=self.err)
+        else:
+            item['list_img'] = images
+            yield item
+
+    def parse_chapter_api(self,response,item):
+        data = response.meta['item']
+        print(item)
+        return response
 
     def parse(self, response):
         raise NotImplementedError
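
A usage sketch (not in the commit) of the ErrorLog helper added above. It assumes error.toml already exists (the open(..., 'r+t') call fails otherwise) and appends one err_<n> table per call; the module path is assumed from the spider name:

    # Seed the file once before crawling, e.g.:  open('error.toml', 'a').close()
    from Comics.spiders.rm_comic import error_logger  # assumed module path

    error_logger.err_ls({"url": "https://example.invalid/books/xxx",  # hypothetical failure
                         "reason": "timeout"})

    # error.toml then gains a table like:
    # [err_0]
    # url = "https://example.invalid/books/xxx"
    # reason = "timeout"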

View File

@@ -1,11 +0,0 @@
-import json
-
-class CommonUtils:
-    @classmethod
-    def parseExec(cls,data,exec):
-        if data !=None and exec != None:
-            dots = str(exec).split(".")
-            if not isinstance(data,dict): data = json.loads(data)
-            for dot in dots:
-                data = data.get(dot)
-            return data

View File

@@ -0,0 +1,16 @@
+from opencc import OpenCC
+
+class ComicPath:
+    @classmethod
+    def getDirComicChapter(cls):
+        return None
+
+    @classmethod
+    def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return "scramble="+str(block)+"_"+str(count)+suffix
+
+    @classmethod
+    def getFileScrambleImageSave(cls,file): return str(file).split("_")[-1]
+
+    # convert Traditional Chinese to Simplified Chinese
+    @classmethod
+    def ChineseConvert(cls, text,convert='t2s'): return OpenCC(convert).convert(str(text))
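
A quick usage sketch (not in the commit) of the ComicPath helpers above: getFileScrambleImageName encodes the scramble block count into a temporary filename, getFileScrambleImageSave recovers the final name from it, and ChineseConvert applies OpenCC's t2s conversion:

    from Comics.spiders.utils.Constant import ComicPath

    name = ComicPath.getFileScrambleImageName(count="001", block=7, suffix=".jpg")
    print(name)                                       # scramble=7_001.jpg
    print(ComicPath.getFileScrambleImageSave(name))   # 001.jpg
    print(ComicPath.ChineseConvert("漫畫名稱/第一話"))  # 漫画名称/第一话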

View File

@@ -0,0 +1,226 @@
+import base64,hashlib,os,shutil
+import math,time,json,datetime,logging
+from PIL import Image
+from tinydb import TinyDB, Query
+from Comics.spiders.utils.Constant import ComicPath
+
+class CommonUtils:
+    @classmethod
+    def parseExec(cls,data,exec):
+        if data !=None and exec != None:
+            dots = str(exec).split(".")
+            if not isinstance(data,dict): data = json.loads(data)
+            for dot in dots:
+                data = data.get(dot)
+            return data
+
+class imageUtils:
+    @classmethod
+    def deScrambleImagesByDir(cls,chapter_dir):
+        scramble_count = 0
+        if os.path.exists(chapter_dir): # collect the chapter's image paths
+            dirs = os.listdir(chapter_dir)
+            for img in dirs:
+                if img.startswith("scramble="):
+                    imageUtils.encode_scramble_image(os.path.join(chapter_dir,img))
+                    scramble_count += 1
+        logging.debug(f"scramble= {scramble_count}")
+        return scramble_count
+
+    @classmethod
+    def deScrambleImagesByPath(cls,img_path,img_save=None):
+        if os.path.basename(img_path).startswith("scramble="):
+            imageUtils.encode_scramble_image(img_path,img_save)
+            return True
+        else:
+            return False
+
+    @classmethod
+    def encodeImage(cls,str_en):
+        #print("en",str_en)
+        enc = base64.b64decode(str_en)
+        #print("decoded:",enc)
+        m = hashlib.md5()
+        m.update(enc)
+        md5 = m.digest()
+        d = md5[-1]
+        #print(md5)
+        try:
+            blocks = d % 10 + 5
+        except:
+            blocks = 0 % 10 + 5
+        #print("blocks=",blocks)
+        return blocks
+    @classmethod
+    def scrambleImage(cls,file_path):
+        # a partially downloaded image was detected: delete it and return None
+        if str(file_path).endswith(".downloads"):
+            os.remove(file_path)
+            return None
+        file_str = str(file_path).split("=")
+        #10_29.jpg
+        base_dir = file_str[0].replace("scramble","")
+        base_name = file_str[-1]
+        base_fn = base_name.split("_")
+        save_name = base_fn[1]
+        save_name_delesu = save_name.split(".")[0]
+        blocks = int(base_fn[0])
+        save_file_path = os.path.join(base_dir,save_name)
+        print("sva",save_file_path)
+        if os.path.exists(save_file_path):
+            print("image already descrambled, skipping:", save_file_path)
+            return None
+        image_su = str(file_path).split(".")[-1]
+        try:
+            img = Image.open(file_path)
+        except:
+            print(f"error Image: {file_path}")
+        width = img.width
+        height = img.height
+        #blocks = cls.encodeImage(enStr)
+        print("blocks=",blocks)
+        block_height = int(height / blocks)
+        block_width = int(width / blocks)
+        print("blockHeight=",block_height)
+        suffix = str(file_path).split(".")[-1]
+        split_path = os.path.join(base_dir,save_name_delesu+"split")
+        if image_su == "downloads":
+            return None
+        is_split = cls.splitimage(file_path,blocks,1,split_path)
+        if is_split != None:
+            cls.image_compose(split_path,blocks,1,save_file_path,block_height,width)
+        else:
+            if os.path.exists(split_path):
+                shutil.rmtree(split_path)
+            if os.path.exists(file_path):
+                shutil.move(file_path, save_file_path)
+        # clean up when done
+        return file_path
+    @classmethod
+    def splitimage(cls,src,rownum,colnum,dstpath):
+        img=Image.open(src)
+        w,h=img.size
+        if rownum<= h and colnum<=w:
+            s=os.path.split(src)
+            if dstpath=='':
+                dstpath = s[0]
+            if not os.path.exists(dstpath):
+                os.makedirs(dstpath)
+            fn=s[1].split('.')
+            basename=fn[0]
+            ext=fn[-1]
+            num=0
+            rowheight=h//rownum
+            colwidth=w//colnum
+            for r in range(rownum):
+                for c in range(colnum):
+                    box=(c*colwidth,r*rowheight,(c+1)*colwidth,(r+1)*rowheight)
+                    count_image = "{:0>3d}".format(num)
+                    file_path = os.path.join(dstpath,str(count_image)+'.'+ext)
+                    print("file_path=",file_path)
+                    img.crop(box).save(file_path)
+                    num=num+1
+            return "success"
+        else:
+            print('invalid split parameters!')
+            return None
+    @classmethod
+    def image_compose(cls,src,row,column,save_path,image_height,image_width):
+        image_size = image_height
+        #image_height = 376
+        #image_width = 720
+        images_format = ['.png','.jpg']
+        #image_names = [name for name in os.listdir(src) for item in images_format if
+        #               os.path.splitext(name)[1] == item][::-1]
+        img_list=os.listdir(src)
+        img_list.sort()
+        img_list.sort(key=lambda x: int(x[:-4]))
+        ## sort file names numerically
+        img_nums=len(img_list)
+        image_names = []
+        for i in range(img_nums):
+            img_name=os.path.join(src,img_list[i])
+            image_names.append(img_name)
+        # use reverse order
+        image_names = image_names[::-1]
+        # simple sanity check: the parameters must match the number of images found
+        if len(image_names) < row * column:
+            raise ValueError("the compose parameters do not match the number of images!")
+        to_image = Image.new('RGB', (column * image_width, row * image_height)) # create the output image
+        # loop over the grid and paste each tile into its position
+        for y in range(1, row + 1):
+            for x in range(1, column + 1):
+                #1 * (row=1 -1) col=1 -1
+                image_path = image_names[column * (y - 1) + x - 1]
+                print("split_image=",image_path)
+                from_image = Image.open(image_path)
+                # keep the original tile size
+                #.resize(
+                #    (image_size, image_size),Image.ANTIALIAS)
+                to_image.paste(from_image, ((x - 1) * image_size, (y - 1) * image_size))
+                from_image.close()
+        to_image.save(save_path)
+        print("image composed:", save_path)
+        shutil.rmtree(src)
+        # save the new image
+    @classmethod
+    def getScrambleImage(cls,path):
+        scramble_file_cache = cls.scrambleImage(path)
+        if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
+
+    @classmethod
+    def encode_scramble_image(cls,imgpath,img_save=None):
+        image = Image.open(imgpath)
+        w, h = image.size
+        #image.show()
+        file_str = str(imgpath).split("=")
+        #10_29.jpg
+        base_fn = file_str[-1].split("_")
+        blocks = int(base_fn[0])
+        if img_save == None:
+            save_path = os.path.join(os.path.dirname(imgpath),ComicPath.getFileScrambleImageSave(imgpath))
+        else: save_path = img_save
+        # print(type(aid),type(img_name))
+        if blocks:
+            s = blocks # random value (block count)
+            # print(s)
+            l = h % s # leftover pixels after the final slice
+            box_list = []
+            hz = 0
+            for i in range(s):
+                c = math.floor(h / s)
+                g = i * c
+                hz += c
+                h2 = h - c * (i + 1) - l
+                if i == 0:
+                    c += l;hz += l
+                else:
+                    g += l
+                box_list.append((0, h2, w, h - g))
+            # print(box_list,len(box_list))
+            item_width = w
+            # box_list.reverse() # reversing the list would restore the original slice order
+            # print(box_list, len(box_list))
+            newh = 0
+            image_list = [image.crop(box) for box in box_list]
+            # print(box_list)
+            newimage = Image.new("RGB", (w, h))
+            for image in image_list:
+                # image.show()
+                b_w, b_h = image.size
+                newimage.paste(image, (0, newh))
+                newh += b_h
+            newimage.save(save_path)
+            print("descrambled=",save_path)
+            if os.path.exists(imgpath):
+                os.remove(imgpath)
+                print("remove=",imgpath)

View File