fix

2024-07-15 04:14:02 +08:00 · 2024-07-15 04:14:02 +08:00 · ab8bb0412e
commit ab8bb0412e
parent faee8328a4
3 changed files with 14 additions and 10 deletions
--- a/Comics/settings.py
+++ b/Comics/settings.py
@@ -26,27 +26,31 @@ ROBOTSTXT_OBEY = False

 HTTPERROR_ALLOWED_CODES = [ 200 , 403]
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-CONCURRENT_REQUESTS = 3 
+CONCURRENT_REQUESTS = 16 

+# 允许重定向
+MEDIA_ALLOW_REDIRECTS = True
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
 IMAGES_STORE = OUTPUT_DIR
 IMAGES_NAME_FORMAT = "{:0>3d}"
 COMIC_INFO_XML_STORE = IMAGES_STORE 
-DOWNLOAD_DELAY = 0 
+# DOWNLOAD_DELAY = 0
+# 随机化下载延迟
+RANDOMIZE_DOWNLOAD_DELAY = True 
 #重试
 RETRY_ENABLED = True
-RETRY_TIMES = 2 # 想重试几次就写几
+RETRY_TIMES = 3 # 想重试几次就写几
 # 下面这行可要可不要
 # RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
-RETRY_HTTP_CODES = [408, 401]
+RETRY_HTTP_CODES = [408, 401, 504, 110]
 # The download delay setting will honor only one of:
 CONCURRENT_REQUESTS_PER_DOMAIN = 16
 CONCURRENT_REQUESTS_PER_IP = 16
 PROXY_LIST = [
-    "http://127.0.0.1:7890",
-#    "http://10.0.10.117:8123",
+#    "http://127.0.0.1:7890",
+#    "http://proxy.local:20172",
 ]
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = False
@@ -97,7 +101,7 @@ ITEM_PIPELINES = {
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 10 
 # The maximum download delay to be set in case of high latencies
 AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
--- a/Comics/spiders/rm_comic.py
+++ b/Comics/spiders/rm_comic.py
@@ -6,12 +6,12 @@ from Comics.utils import Conf

 class RmComicSpider(scrapy.Spider):
    name = 'rm_comic'
-    allowed_domains = ['roum12.xyz']
+    allowed_domains = ['roum18.xyz']
    main_url = 'https://'+allowed_domains[0]
    start_urls = main_url+'/books'
    
    # 遍历网站页数数据 
-    def start_requests(self):
+    def start_requests(self):        
        for x in range(0,60):
            yield scrapy.Request(self.start_urls+"?&page="+str(x), callback=self.books_comic)

--- a/Comics/utils.py
+++ b/Comics/utils.py
@@ -27,7 +27,7 @@ class Conf():
    def get_config_value(self, project, key=None):
        # 使用Path类来处理文件路径
        config_path = Path(os.path.join("Comics","spiders", project)+".yml")
-        #Path("Comics") / "spiders" / project / (project + ".yml")
+        #Path("Comics") / "spiders" / (project + ".yml")
        # 检查项目是否存在
        if not config_path.is_file():
            return None