caiwx86 2023-06-20 02:52:51 +08:00
parent af7812794f
commit ac30f59a33
20 changed files with 1167 additions and 1060 deletions

.gitignore

@ -1,5 +1,5 @@
.scrapy/*
images/*
json/*
.vscode/*
CBZ/*
output/*
/**/__pycache__

.idea/.gitignore

@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml

.idea/ComicScrapy.iml

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="stable_vscode" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

.idea/inspectionProfiles/profiles_settings.xml

@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

.idea/misc.xml

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="stable_vscode" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ComicScrapy.iml" filepath="$PROJECT_DIR$/.idea/ComicScrapy.iml" />
</modules>
</component>
</project>

.idea/vcs.xml

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

Comics/exporters.py

@ -3,6 +3,7 @@ import os.path,json,ast
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
from scrapy.exporters import JsonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
@ -10,6 +11,15 @@ from Comics.utils.Constant import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter
class CommonExporter():
    def getPath(self, file, sufix=None):
        # create the parent directory if needed, then append the suffix once
        dirname = os.path.dirname(file)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        if sufix is not None:
            sufix = "." + sufix
            if sufix not in file:
                file = file + sufix
        return file
class ItemExporter(PythonItemExporter):
def convert(self, data):
@ -25,6 +35,21 @@ class ItemExporter(PythonItemExporter):
self.finish_exporting()
return obj_item
class JsonExport(JsonItemExporter):
def __init__(self, file, **kwargs):
file = CommonExporter().getPath(file=file, sufix= "json")
self.file = open(file, "wb")
super(JsonExport, self).__init__(self.file, **kwargs)
def export_json(self, json_object, if_return=False):
self.start_exporting()
self.export_item(json_object)
self.finish_exporting()
self.file.close()
if if_return:
return ItemExporter().export_obj(json_object)
class ComicInfoXmlItemExporter(XmlItemExporter):
custom_root_element = "ComicInfo"
def __init__(self, comic, chapter):
@ -66,8 +91,8 @@ class ComicInfoXmlItemExporter(XmlItemExporter):
self._beautify_indent(depth=1)
self._beautify_newline()
for name, value in self._get_serialized_fields(comic_info, default_value=""):
if name is "Pages":
value = str(value).split(',')
if name == "Pages":
value = ast.literal_eval(value)
if value is not None or value != "":
self._export_xml_field(name, value, depth=2, child_element=child_element)
#self._beautify_indent(depth=1)
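
For context, the single-item JSON export that the new JsonExport class wraps can be sketched with Scrapy's stock JsonItemExporter alone; the item values and the output path below are placeholders, not paths the project necessarily writes.

```python
import os
from scrapy.exporters import JsonItemExporter

def export_single_item(item, file_path):
    # ensure the parent directory exists, as CommonExporter.getPath does
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as fp:          # exporters expect a binary file object
        exporter = JsonItemExporter(fp)
        exporter.start_exporting()
        exporter.export_item(item)             # exactly one item per file
        exporter.finish_exporting()

export_single_item({"name": "demo", "chapter": "001"}, "output/json/demo/001.json")
```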

Comics/items.py

@ -2,8 +2,10 @@
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import os,Comics.settings as settings,logging
from scrapy.item import Item, Field
from Comics.utils.Constant import ComicPath
from Comics.utils.FileUtils import imageUtils
from scrapy.loader.processors import TakeFirst, MapCompose, Join
def serialize_to_chinese(value):
@ -13,9 +15,37 @@ def serialize_to_fix_file(value):
file = ComicPath.chinese_convert(value)
return ComicPath.fix_file_name(file)
class ComicOItem(Item):
def _serialize_to_images(value, result_type=None):
count = 1
images_item = []
image_urls = []
for image in value:
(image_src, scramble) = [image.get("src"), image.get("scramble")]
count_image = settings.IMAGES_NAME_FORMAT.format(count)
suffix = "."+str(image_src).split(".")[-1]
image_name = count_image + suffix
if scramble:
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
blocks_num = imageUtils.encodeImage(de_str)
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
#images_item.append(ImagesItem(image_name=count_image + suffix, image_url=image_src, image_path=image_name))
images_item.append(image_name)
image_urls.append(image_src)
count += 1
logging.info(f"images_len: {len(images_item)}")
if result_type == "image_urls": return image_urls
else: return images_item
def serialize_to_images(value): return _serialize_to_images(value)
def serialize_to_image_urls(value): return _serialize_to_images(value, result_type="image_urls")
class ListComicItem(Item):
name = Field()
chapterItem = Field()
link = Field()
class ComicItem(Item):
# index number
@ -25,7 +55,7 @@ class ComicItem(Item):
# chapter name
chapter = Field(serializer=serialize_to_fix_file, output_processor=TakeFirst())
# image links
list_img = Field()
list_img = Field(serializer=serialize_to_images)
# author
author = Field(serialize_to_chinese=serialize_to_chinese, output_processor=TakeFirst())
# cover link
@ -41,13 +71,18 @@ class ComicItem(Item):
# age rating
age_rating = Field(output_processor=TakeFirst())
images = Field()
images_old = Field(serializer=serialize_to_images)
images = Field(serializer=serialize_to_images)
image_urls = Field(serializer=serialize_to_image_urls)
images_name = Field()
class ImageItem(Item):
class ImagesItem(Item):
image_name = Field()
image_url = Field()
image_path = Field()
images = Field()
image_urls = Field()
comic = Field()
def serializer_info_writer(value):
list_value = []
@ -56,6 +91,42 @@ def serializer_info_writer(value):
list_value.append(v)
return ",".join(list_value)
# Result_type name
def _serializer_info_imagesa(value, result_type=None):
info = []
for success, img in value:
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
if result_type == 'name':
info.append(ComicPath().getFileScrambleImageSave(img_path,True,False))
else:
info.append(img_path)
if result_type == "len":
value = len(info)
else:
value = info
return value
def _serialize_info_images(value, result_type=None):
images = []
for image in value:
images.append(ComicPath().getFileScrambleImageSave(image,True,False))
if result_type == "count":
return len(images)
else:
return images
def serializer_info_images(value): return _serialize_info_images(value)
def serializer_info_images_count(value): return _serialize_info_images(value, "count")
def serializer_info_images_completed(value):
return _serialize_info_images(value, result_type='name')
def serializer_info_images_count(value):
return _serialize_info_images(value, result_type='len')
class ComicInfoItem(Item):
Title = Field(info='chapter')  # "chapter name", True]
Series = Field(info='name')  # "comic title", True]
@ -70,9 +141,11 @@ class ComicInfoItem(Item):
Genre = Field(info='genre')  # "genre", True]
Tags = Field(info='tags')  # "tags", True]
Web = Field()  # "home page", False]
PageCount = Field()  # "total pages", True]
#PageCount = Field()  # "total pages", True]
PageCount = Field(info='images',serializer=serializer_info_images_count)  # "total pages", True]
LanguageISO = Field()  # "language", True]
AgeRating = Field(info='age_rating')  # "age rating", False]
Pages = Field(info='images_name')  # "page list", True]
#Pages = Field(info='images_name', serializer=serializer_info_images_completed)  # "page list", True]
Pages = Field(info='images', serializer=serializer_info_images)  # "page list", True]
# ComicInfo.xml and ComicChapter.json end
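
A note on the serializer= arguments added above: Scrapy's item exporters look up the "serializer" key in each Field's metadata and run the stored value through it at export time. A minimal self-contained sketch of that mechanism (DemoItem and to_page_count are illustrative names, not part of this project):

```python
from scrapy.item import Item, Field
from scrapy.exporters import PythonItemExporter

def to_page_count(value):
    return len(value)

class DemoItem(Item):
    images = Field(serializer=to_page_count)

exporter = PythonItemExporter()
print(exporter.export_item(DemoItem(images=["001.jpg", "002.jpg"])))
# {'images': 2}
```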

Comics/loader.py

@ -1,5 +1,6 @@
import json
from scrapy.loader import ItemLoader
class ComicLoader(ItemLoader):
def parseExec(cls,data,exec):
if data !=None and exec != None:
@ -42,3 +43,14 @@ class ComicLoader(ItemLoader):
def get_exec(self, value, str_exec):
return self.parseExec(value, str_exec)
def add_value(self, field_name, value, *processors, re=None, **kw):
    if self.auto_replace_value(field_name, value):
        return super().add_value(field_name, value, *processors, re=re, **kw)

def auto_replace_value(self, field_name, value):
    # returns True when the field is still empty (add), otherwise replaces in place
    if self.get_output_value(field_name) is not None:
        self._replace_value(field_name, value)
        return False
    else:
        return True
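
The overridden add_value above effectively turns repeated calls for the same field into "add once, then replace". The same behaviour can be sketched with the public ItemLoader API; everything here (DemoItem, the sample values) is illustrative, and TakeFirst is imported from itemloaders, where it lives in current Scrapy releases:

```python
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

class DemoItem(Item):
    chapter = Field(output_processor=TakeFirst())

loader = ItemLoader(item=DemoItem())
loader.add_value("chapter", "old title")

# a later value for an already-populated field replaces instead of appending
if loader.get_output_value("chapter") is not None:
    loader.replace_value("chapter", "new title")
else:
    loader.add_value("chapter", "new title")

print(loader.load_item())  # {'chapter': 'new title'}
```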

Comics/pipelines.py

@ -5,16 +5,17 @@
# useful for handling different item types with a single interface
import os, scrapy
import os, scrapy,logging,time,random
from Comics import settings
from Comics.utils.FileUtils import imageUtils
from Comics.utils.FileUtils import fileUtils
from Comics.utils.Constant import ComicPath
from Comics.items import ComicItem
from Comics.items import ImageItem
from Comics.items import ImagesItem
from scrapy.pipelines.images import ImagesPipeline
from Comics.exporters import ComicInfoXmlItemExporter
from Comics.exporters import ItemExporter
from Comics.exporters import JsonExport
from Comics.utils.FileUtils import CBZUtils
class ComicsPipeline:
@ -23,59 +24,40 @@ class ComicsPipeline:
# item is the object yielded by the spider
def process_item(self, item, spider):
if isinstance(item, ComicItem):
item = ComicItem(ItemExporter().export_obj(item))
file = os.path.join("json", item['name'], item['chapter'])
fileUtils.save_file(f"{file}.json", item)
return item
file = os.path.join(settings.OUTPUT_DIR,"json", item['name'], item['chapter'])
data = JsonExport(file=file).export_json(item, if_return=True)
#item['images'] = data['images']
return data
# image parsing
def close_spider(self,spider):
pass
class ImageParsePipeline:
def process_item(self, item, spider):
if isinstance(item, ComicItem):
count = 1
images_item = []
for image in item['list_img']:
(image_src, scramble) = [image.get("src"), image.get("scramble")]
count_image = "{:0>3d}".format(count)
suffix = "."+str(image_src).split(".")[-1]
image_name = count_image + suffix
if scramble:
de_str = str(image_src).split("/")[-1].replace(suffix, "==")
blocks_num = imageUtils.encodeImage(de_str)
image_name = ComicPath.getFileScrambleImageName(count=count_image, block=blocks_num, suffix=suffix)
image_path = os.path.join(item['name'], item['chapter'], image_name)
images_item.append(ImageItem(image_name=count_image + suffix, image_url=image_src, image_path=image_path))
count += 1
item['images'] = images_item
return item
class ImgDownloadPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
image = request.meta['item']
image_path = image['image_path']
en_image_path = os.path.join(os.path.dirname(image_path), image['image_name'])
if os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path)):
return en_image_path
else:
return image_path
def file_exists(self, image_path):
en_image_path = ComicPath().getFileScrambleImageSave(image_path, relative="fullpath")
return os.path.exists(os.path.join(settings.IMAGES_STORE, en_image_path))
def file_full_path(self, item, image): return os.path.join(item['name'], item['chapter'], image)
def file_path(self, request, response=None, info=None, *, item=None): return request.meta['path']
def get_media_requests(self, item, info):
for image in item['images']:
yield scrapy.Request(url=image['image_url'], meta={'item': image})
for image_url,image_path in zip(item['image_urls'],item['images']):
image_path = self.file_full_path(item, image_path)
if self.file_exists(image_path):
logging.info(f"file exists: {image_path}")
else:
logging.info(f"downloading {image_url} --> {image_path}")
yield scrapy.Request(url=image_url, meta={'path': image_path})
def item_completed(self, results, item, info):
info_img = []
for success, img in results:
img_path = os.path.join(settings.IMAGES_STORE, img['path'])
# descramble the image
img_path = imageUtils.deScrambleImagesByPath(img_path)
info_img.append(os.path.basename(img_path).split('.')[0])
item['images_name'] = ",".join(info_img)
item['images_name'] = results
# return item
# generate ComicInfo.xml
ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
comic_info = ComicInfoXmlItemExporter(comic=item['name'], chapter=item['chapter']).export_xml(item)
# package into a CBZ
CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'], remove=False)
CBZUtils.packComicChapterCBZ(comic=item['name'], chapter=item['chapter'],
comic_info_images= comic_info["Pages"], remove=False)
time.sleep(random.randint(5,10))
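
The reworked ImgDownloadPipeline decides the on-disk path up front and skips URLs whose files already exist. A condensed sketch of that pattern against Scrapy's ImagesPipeline (class and helper names are illustrative; it assumes the default filesystem image store and parallel image_urls/images lists on the item):

```python
import logging
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class SkipExistingImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # the relative target path was decided in get_media_requests
        return request.meta["path"]

    def get_media_requests(self, item, info):
        store_root = self.store.basedir  # IMAGES_STORE for the filesystem store
        for url, rel_path in zip(item["image_urls"], item["images"]):
            if os.path.exists(os.path.join(store_root, rel_path)):
                logging.info("file exists, skipping: %s", rel_path)
            else:
                logging.info("downloading %s --> %s", url, rel_path)
                yield scrapy.Request(url, meta={"path": rel_path})
```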

Comics/settings.py

@ -7,13 +7,14 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from fake_useragent import UserAgent
import os
BOT_NAME = 'Comics'
SPIDER_MODULES = ['Comics.spiders']
NEWSPIDER_MODULE = 'Comics.spiders'
OUTPUT_DIR = "output"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
USER_AGENT = UserAgent().random
@ -22,22 +23,23 @@ ROBOTSTXT_OBEY = False
HTTPERROR_ALLOWED_CODES = [ 200 , 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS = 16
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
IMAGES_STORE = 'images'
COMIC_INFO_XML_STORE = 'images'
DOWNLOAD_DELAY = 20
IMAGES_STORE = os.path.join(OUTPUT_DIR, 'images')
IMAGES_NAME_FORMAT = "{:0>3d}"
COMIC_INFO_XML_STORE = IMAGES_STORE
DOWNLOAD_DELAY = 0
# retry settings
RETRY_ENABLED = True
RETRY_TIMES = 10 # set this to however many retries you want
# the following line is optional
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
PROXY_LIST = [
"http://127.0.0.1:7890",
]
@ -79,8 +81,9 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'scrapy.pipelines.images.ImagesPipeline' : 1,
'Comics.pipelines.ComicsPipeline': 300,
'Comics.pipelines.ImageParsePipeline': 400,
# 'Comics.pipelines.ImageParsePipeline': 400,
'Comics.pipelines.ImgDownloadPipeline': 500,
}
@ -102,10 +105,14 @@ AUTOTHROTTLE_DEBUG = False
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404, 403, 401]
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 404]
#HTTPCACHE_STORAGE = 'Comics.middlewares.MyFilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Logging configuration
LOG_LEVEL = "INFO" # log level
LOG_STDOUT = True # redirect standard output into the log
CBZ_EXPORT_PATH = "CBZ"
# exporter classes / field ordering
COMIC_INFO_XML_FILE = "ComicInfo.xml"
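
The new path and naming settings compose as below; this is only a sketch of the values they produce, not project code:

```python
import os

OUTPUT_DIR = "output"
IMAGES_NAME_FORMAT = "{:0>3d}"

print(os.path.join(OUTPUT_DIR, "images"))   # output/images -> IMAGES_STORE and COMIC_INFO_XML_STORE
print(IMAGES_NAME_FORMAT.format(7))         # 007 -> zero-padded page file names
```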

Comics/spiders/rm_comic.py

@ -1,18 +1,28 @@
import scrapy
import scrapy,logging,time
from Comics.items import ComicItem
from Comics.loader import ComicLoader
from itemadapter import ItemAdapter
from Comics.items import ComicInfoItem
from Comics.items import ListComicItem
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
allowed_domains = ['rm01.xyz']
main_url = 'https://rm01.xyz'
#start_urls = ['https://rm01.xyz/books/63b65185-f798-4c8f-a0b0-8811615908fd/0']
start_urls = 'https://rm01.xyz/books'
def start_requests(self):
yield scrapy.Request('https://rm01.xyz'
'/books/306ec1e2-f701-4fda-bb78-041ad6ec4020', callback=self.parse_comic)
yield scrapy.Request(self.start_urls, callback=self.books_comic)
def books_comic(self, response):
books_comic = ComicLoader(item=ListComicItem(), response=response)
data = books_comic.get_xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
str_exec = "props.pageProps.books"
books = books_comic.get_exec(data, str_exec=str_exec)
for book in books:
books_comic.add_value('link', book['id'])
logging.info(f"downloading books %s" % book['name'])
time.sleep(3)
yield scrapy.Request(url=self.start_urls+"/"+book['id'], callback=self.parse_comic)
# fetch the data for a single comic
# once the chapter links are collected, move on to the next stage
@ -43,7 +53,8 @@ class RmComicSpider(scrapy.Spider):
#comic_item.add_exec('dep', data, str_exec=str_exec+"description")
comic_item.add_value('index', response.meta['num'])
comic_item.add_exec('chapter', data, str_exec=str_exec + "chapterName")
comic_item.add_exec('list_img', data, str_exec+"images")
comic_item.add_exec('image_urls', data, str_exec+"images")
comic_item.add_exec('images', data, str_exec+"images")
comic = comic_item.load_item()
chapter_api_url = comic_item.get_exec(data, str_exec+"chapterAPIPath")
if chapter_api_url is not None:
@ -55,8 +66,10 @@ class RmComicSpider(scrapy.Spider):
def parse_chapter_api(self, response):
comic_item = ComicLoader(item=response.meta['item'], response=response)
comic_item.add_exec('chapter', response.text, str_exec='chapter.name')
comic_item.add_exec('list_img', response.text, str_exec='chapter.images')
comic_item.add_exec('image_urls', response.text, str_exec='chapter.images')
comic_item.add_exec('images', response.text, str_exec='chapter.images')
yield comic_item.load_item()
def parse(self, response):
raise NotImplementedError
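
The books_comic callback above reads the Next.js __NEXT_DATA__ blob and walks a dotted path such as props.pageProps.books through it. A standalone sketch of that idea (the JSON snippet and the walk helper are illustrative, not the project's parseExec implementation):

```python
import json

def walk(data, dotted_path):
    # follow "a.b.c" style keys into a nested dict
    for key in dotted_path.split("."):
        data = data[key]
    return data

next_data = json.loads('{"props": {"pageProps": {"books": [{"id": "abc", "name": "demo"}]}}}')
for book in walk(next_data, "props.pageProps.books"):
    print(book["id"], book["name"])   # abc demo
```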

Comics/utils/Constant.py

@ -12,7 +12,16 @@ class ComicPath:
def getFileScrambleImageName(cls,count,block,suffix=".jpg"): return cls.PREFIX_SCRAMBLE+str(block)+"_"+str(count)+suffix
@classmethod
def getFileScrambleImageSave(cls,file): return str(file).split("_")[-1]
def getFileScrambleImageSave(cls,file,relative=False, is_prefix=True):
file_name = str(file).split("_")[-1]
if relative:
file_name = os.path.basename(file_name)
if relative == "fullpath":
file_name = os.path.join(os.path.dirname(file), file_name)
if not is_prefix:
return file_name.split(".")[0]
else:
return file_name
# convert Traditional Chinese to Simplified Chinese
@classmethod
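
To make the new keyword arguments of getFileScrambleImageSave concrete, here is the same logic restated as a standalone function together with the values it returns for a scrambled file name (the example path is made up):

```python
import os

def scramble_image_save(file, relative=False, is_prefix=True):
    file_name = str(file).split("_")[-1]
    if relative:
        file_name = os.path.basename(file_name)
    if relative == "fullpath":
        file_name = os.path.join(os.path.dirname(file), file_name)
    if not is_prefix:
        return file_name.split(".")[0]
    return file_name

path = "output/images/comic/ch1/de_10_001.jpg"
print(scramble_image_save(path))                                  # 001.jpg
print(scramble_image_save(path, relative="fullpath"))             # output/images/comic/ch1/001.jpg
print(scramble_image_save(path, relative=True, is_prefix=False))  # 001
```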

Comics/utils/FileUtils.py

@ -16,6 +16,12 @@ class fileUtils:
fs.write(str(data))
fs.close()
@classmethod
def path(cls, file):
base_dir = os.path.dirname(file)
if not os.path.exists(base_dir): os.makedirs(base_dir)
return file
class CommonUtils:
@classmethod
def parseExec(cls,data,exec):
@ -29,20 +35,23 @@ class CommonUtils:
class imageUtils:
@classmethod
def deScrambleImagesByDir(cls,chapter_dir):
def descramble_images_by_dir(cls, chapter_dir):
if os.path.isfile(chapter_dir):
chapter_dir = os.path.dirname(chapter_dir)
scramble_count = 0
if os.path.exists(chapter_dir): # get the chapter image path
dirs = os.listdir(chapter_dir)
for img in dirs:
while any(img.startswith(ComicPath.PREFIX_SCRAMBLE) for img in os.listdir(chapter_dir)):  # loop until no scrambled images remain
for img in os.listdir(chapter_dir):
if img.startswith(ComicPath.PREFIX_SCRAMBLE):
imageUtils.encode_scramble_image(os.path.join(chapter_dir,img))
imageUtils.encode_scramble_image(os.path.join(chapter_dir, img))
scramble_count += 1
logging.debug(f"{ComicPath.PREFIX_SCRAMBLE} {scramble_count}")
return scramble_count
@classmethod
def deScrambleImagesByPath(cls, img_path, img_save=None):
if os.path.basename(img_path).startswith(ComicPath.PREFIX_SCRAMBLE):
if os.path.basename(img_path).\
startswith(ComicPath.PREFIX_SCRAMBLE) and os.path.exists(img_path):
img_path = imageUtils.encode_scramble_image(img_path, img_save)
return img_path
@ -186,16 +195,18 @@ class imageUtils:
if scramble_file_cache != None and os.path.exists(scramble_file_cache): os.remove(scramble_file_cache)
@classmethod
def encode_scramble_image(cls,imgpath,img_save=None):
image = Image.open(imgpath)
def encode_scramble_image(cls, img_path, img_save=None):
if not os.path.exists(img_path):
return
image = Image.open(img_path)
w, h = image.size
#image.show()
file_str = str(imgpath).split("=")
file_str = str(img_path).split("=")
#10_29.jpg
base_fn = file_str[-1].split("_")
blocks = int(base_fn[0])
if img_save == None:
save_path = os.path.join(os.path.dirname(imgpath),ComicPath.getFileScrambleImageSave(imgpath))
save_path = os.path.join(os.path.dirname(img_path),ComicPath.getFileScrambleImageSave(img_path))
else: save_path = img_save
# print(type(aid),type(img_name))
if blocks:
@ -230,10 +241,10 @@ class imageUtils:
newh += b_h
newimage.save(save_path)
print("解密成功=",save_path)
if os.path.exists(imgpath):
os.remove(imgpath)
print("remove=",imgpath)
logging.info(f"解密成功 {save_path}")
if os.path.exists(img_path):
os.remove(img_path)
logging.debug(f"remove {img_path}")
return save_path
@ -270,7 +281,7 @@ class CBZUtils:
logging.info(f"打包完成:{target_file}")
@classmethod
def packComicChapterCBZ(cls, comic, chapter, remove=True):
def packComicChapterCBZ(cls, comic, chapter, comic_info_images, remove=True):
images_chapter_path = os.path.join(IMAGES_STORE, comic, chapter)
cbz_chapter_path = os.path.join(CBZ_EXPORT_PATH, comic, chapter) + ".CBZ"
if os.path.exists(images_chapter_path):
@ -278,13 +289,15 @@ class CBZUtils:
for file in dirs:
if file.startswith(ComicPath.PREFIX_SCRAMBLE):
try:
os.remove(file)
imageUtils.deScrambleImagesByPath(os.path.join(images_chapter_path,file))
except Exception as e:
print(f"删除 {file} 发生错误 {e},已跳过")
return False
cls.zip_compression(images_chapter_path, cbz_chapter_path)
time.sleep(0.1)
if remove: shutil.rmtree(images_chapter_path)
# validation
cls.cbz_validate(cbz_chapter_path, comic_info_images)
return True
@classmethod
@ -338,3 +351,11 @@ class CBZUtils:
except Exception as e:
print(e)
return result
@classmethod
def cbz_validate(cls, zip_path, comic_info_images):
    # compare the number of archive entries with the expected page list
    if len(cls.zip_info(zip_path)) == len(comic_info_images):
        logging.info(f"validation succeeded === {zip_path}")
    else:
        os.remove(zip_path)
        logging.error(f"validation failed === {zip_path}")
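
cbz_validate relies on a zip_info helper that is not part of this diff; the underlying check, comparing the number of archive entries against the expected page list and dropping the archive on a mismatch, can be sketched with the standard zipfile module (function and variable names are illustrative):

```python
import logging
import os
import zipfile

def validate_cbz(zip_path, expected_pages):
    with zipfile.ZipFile(zip_path) as zf:
        entries = [name for name in zf.namelist() if not name.endswith("/")]
    if len(entries) == len(expected_pages):
        logging.info("validation succeeded === %s", zip_path)
        return True
    os.remove(zip_path)
    logging.error("validation failed === %s", zip_path)
    return False
```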