fix

parent faee8328a4
commit ab8bb0412e
@@ -26,27 +26,31 @@ ROBOTSTXT_OBEY = False
 HTTPERROR_ALLOWED_CODES = [200, 403]
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 3
+CONCURRENT_REQUESTS = 16
+# Allow redirects
+MEDIA_ALLOW_REDIRECTS = True
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
 IMAGES_STORE = OUTPUT_DIR
 IMAGES_NAME_FORMAT = "{:0>3d}"
 COMIC_INFO_XML_STORE = IMAGES_STORE
-DOWNLOAD_DELAY = 0
+# DOWNLOAD_DELAY = 0
+# Randomize the download delay
+RANDOMIZE_DOWNLOAD_DELAY = True
 # Retries
 RETRY_ENABLED = True
-RETRY_TIMES = 2  # set to however many retries you want
+RETRY_TIMES = 3  # set to however many retries you want
 # The following line is optional
 # RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
-RETRY_HTTP_CODES = [408, 401]
+RETRY_HTTP_CODES = [408, 401, 504, 110]
 # The download delay setting will honor only one of:
 CONCURRENT_REQUESTS_PER_DOMAIN = 16
 CONCURRENT_REQUESTS_PER_IP = 16
 PROXY_LIST = [
     "http://127.0.0.1:7890",
     # "http://10.0.10.117:8123",
     # "http://127.0.0.1:7890",
     # "http://proxy.local:20172",
 ]
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = False
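Note: PROXY_LIST is not a built-in Scrapy setting, so it is presumably read by a project middleware not shown in this diff. A minimal sketch of such a downloader middleware, under that assumption (the class name and the random selection are illustrative, not the project's actual code):

# Hypothetical sketch, not this repo's middleware: picks a random entry
# from the custom PROXY_LIST setting for each outgoing request.
import random

class RandomProxyMiddleware:
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # Read the custom PROXY_LIST setting defined in settings.py
        return cls(crawler.settings.getlist("PROXY_LIST"))

    def process_request(self, request, spider):
        if self.proxies:
            # Scrapy's HTTP downloader honors request.meta["proxy"]
            request.meta["proxy"] = random.choice(self.proxies)

Such a class would be enabled through the standard DOWNLOADER_MIDDLEWARES setting.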
@@ -97,7 +101,7 @@ ITEM_PIPELINES = {
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 10
 # The maximum download delay to be set in case of high latencies
 AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
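Note: when tuning these delays, Scrapy's built-in AUTOTHROTTLE_DEBUG flag logs the computed throttling delay for every response, e.g.:

# Optional while tuning: show throttling stats for every received response.
# AUTOTHROTTLE_DEBUG is a standard Scrapy setting (default: False).
AUTOTHROTTLE_DEBUG = True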
@@ -6,12 +6,12 @@ from Comics.utils import Conf


 class RmComicSpider(scrapy.Spider):
     name = 'rm_comic'
-    allowed_domains = ['roum12.xyz']
+    allowed_domains = ['roum18.xyz']
     main_url = 'https://' + allowed_domains[0]
     start_urls = main_url + '/books'

     # Iterate over the site's listing pages
     def start_requests(self):
         for x in range(0, 60):
             yield scrapy.Request(self.start_urls + "?&page=" + str(x), callback=self.books_comic)
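Note: the books_comic callback referenced above is defined outside this hunk. A hedged sketch of what such a listing-page callback might look like (the "a.book-link" selector and the parse_book follow-up callback are assumptions, not the project's actual code):

    # Hypothetical sketch only; the real books_comic lives elsewhere in
    # this spider. Selector and parse_book are illustrative assumptions.
    def books_comic(self, response):
        for href in response.css("a.book-link::attr(href)").getall():
            # Follow each book's detail page with a hypothetical callback.
            yield response.follow(href, callback=self.parse_book)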
@@ -27,7 +27,7 @@ class Conf():
     def get_config_value(self, project, key=None):
         # Use the Path class to handle the file path
         config_path = Path(os.path.join("Comics", "spiders", project) + ".yml")
-        # Path("Comics") / "spiders" / project / (project + ".yml")
+        # Path("Comics") / "spiders" / (project + ".yml")
         # Check whether the project's config file exists
         if not config_path.is_file():
             return None
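Note: the rest of get_config_value is outside this hunk. A hedged sketch of how the .yml file might then be read, assuming PyYAML (the function name and parsing choice are assumptions):

# Hypothetical continuation of the lookup; the real body is not in this diff.
from pathlib import Path
import yaml

def load_config_value(config_path: Path, key=None):
    with config_path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # Return the whole mapping, or a single value when a key is requested.
    return data if key is None else data.get(key)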