# Scrapy settings for Comics project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from fake_useragent import UserAgent
import os

PROJECT_KEY = "current_project"

BOT_NAME = 'Comics'

SPIDER_MODULES = ['Comics.spiders']
NEWSPIDER_MODULE = 'Comics.spiders'

# Root for every artifact this project writes; "" keeps paths CWD-relative.
BASE_OUTPUT = ""
#BASE_OUTPUT = "/mnt/Comics"
OUTPUT_DIR = os.path.join(BASE_OUTPUT, "output")

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Comics (+http://www.yourdomain.com)'
# One random user agent is drawn at startup and reused for the whole run.
USER_AGENT = UserAgent().random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Let spiders receive these status codes as normal responses.
HTTPERROR_ALLOWED_CODES = [200, 403, 301, 302]

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16

# Follow redirects when downloading media (images).
MEDIA_ALLOW_REDIRECTS = True

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# Image output location and per-page file naming (zero-padded, 3 digits).
IMAGES_STORE = OUTPUT_DIR
IMAGES_NAME_FORMAT = "{:0>3d}"
COMIC_INFO_XML_STORE = IMAGES_STORE

# Randomize the download delay
# RANDOMIZE_DOWNLOAD_DELAY = True

# Retry / timing configuration.
DOWNLOAD_DELAY = 1       # seconds between requests to the same site
DOWNLOAD_TIMEOUT = 30    # seconds before a request is considered failed
# Delay between retries, in seconds (NOTE(review): not a core Scrapy setting —
# presumably read by the project's own retry handling; confirm).
RETRY_DELAY = 5
RETRY_ENABLED = True
RETRY_TIMES = 5          # number of retry attempts per request
# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 401]
RETRY_HTTP_CODES = [408, 401, 504, 110, 500, 502, 503, 522, 524, 429]

# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Candidate proxies for the custom ProxyMiddleware; empty list = no proxy.
PROXY_LIST = [
    # "http://127.0.0.1:7890",
    # "http://proxy.local:20172",
]

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Comics.middlewares.ComicsSpiderMiddleware': 543,
#    'Comics.middlewares.ProxyMiddleware' : 100,
#    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'Comics.middlewares.ComicsDownloaderMiddleware': 543,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    # Project proxy picker runs (order 100) before Scrapy's built-in
    # HttpProxyMiddleware (order 400) applies the chosen proxy.
    'Comics._utils.middlewares.ProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline' : 1,
    'Comics.pipelines.ComicsPipeline': 300,
    # 'Comics.pipelines.ImageParsePipeline': 400,
    # 'Comics.pipelines.IconDownloadPipeline': 400,
    'Comics.pipelines.ImgDownloadPipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 10
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default). See
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0  # 0 = cached responses never expire
HTTPCACHE_DIR = os.path.join(BASE_OUTPUT, 'httpcache')
# Extensions handled by the custom cache storage below. NOTE(review): no
# lowercase 'jpeg' in this list — confirm whether that omission is intended.
HTTPCACHE_ALLOW_PREFIXS = ['jpg', 'png', 'gif', 'JPG', "PNG", "JPEG"]
HTTPCACHE_PROXY_DOMAINS = ['r5.rmcdn3.xyz']
HTTPCACHE_STORAGE = 'Comics._utils.middlewares.MyFilesystemCacheStorage'

# Scheduler queues: FIFO variants give breadth-first crawl order.
SCHEDULER = "scrapy.core.scheduler.Scheduler"
SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"
SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"

# Logging configuration
LOG_LEVEL = "INFO"  # log verbosity threshold
LOG_STDOUT = True   # capture stdout into the Scrapy log

# CBZ archive export destinations.
CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT, "CBZ")
OLD_CBZ_EXPORT_PATH = os.path.join(BASE_OUTPUT, "Old_CBZ")

# ComicInfo metadata export: output file name and its validation schema.
COMIC_INFO_XML_FILE = "ComicInfo.xml"
COMIC_INFO_XSD_FILE = "Comics/assets/ComicInfo_2.1.xsd"