This commit is contained in:
caiwx86 2024-07-15 04:14:02 +08:00
parent faee8328a4
commit ab8bb0412e
3 changed files with 14 additions and 10 deletions

View File

@ -26,27 +26,31 @@ ROBOTSTXT_OBEY = False
HTTPERROR_ALLOWED_CODES = [ 200 , 403]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 3
CONCURRENT_REQUESTS = 16
# Allow the media pipelines to follow HTTP redirects
MEDIA_ALLOW_REDIRECTS = True
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
IMAGES_STORE = OUTPUT_DIR
IMAGES_NAME_FORMAT = "{:0>3d}"
COMIC_INFO_XML_STORE = IMAGES_STORE
DOWNLOAD_DELAY = 0
# DOWNLOAD_DELAY = 0
# Randomize the download delay between requests
RANDOMIZE_DOWNLOAD_DELAY = True
# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 2 # 想重试几次就写几
RETRY_TIMES = 3 # 想重试几次就写几
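# The RETRY_HTTP_CODES override below is optional.
# NOTE(review): 110 is not a registered HTTP status code (it looks like errno ETIMEDOUT) - confirm it is intended.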
# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
RETRY_HTTP_CODES = [408, 401]
RETRY_HTTP_CODES = [408, 401, 504, 110]
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
PROXY_LIST = [
"http://127.0.0.1:7890",
# "http://10.0.10.117:8123",
# "http://127.0.0.1:7890",
# "http://proxy.local:20172",
]
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
@ -97,7 +101,7 @@ ITEM_PIPELINES = {
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_START_DELAY = 10
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to

View File

@ -6,12 +6,12 @@ from Comics.utils import Conf
class RmComicSpider(scrapy.Spider):
name = 'rm_comic'
allowed_domains = ['roum12.xyz']
allowed_domains = ['roum18.xyz']
main_url = 'https://'+allowed_domains[0]
start_urls = main_url+'/books'
# Iterate over the pages of the site's book listing
def start_requests(self):
def start_requests(self):
for x in range(0,60):
yield scrapy.Request(self.start_urls+"?&page="+str(x), callback=self.books_comic)

View File

@ -27,7 +27,7 @@ class Conf():
def get_config_value(self, project, key=None):
# Use the Path class to handle the config file path
config_path = Path(os.path.join("Comics","spiders", project)+".yml")
#Path("Comics") / "spiders" / project / (project + ".yml")
#Path("Comics") / "spiders" / (project + ".yml")
# Check that the project's config file exists
if not config_path.is_file():
return None