"""Scrapy settings for ss-crawler project.""" BOT_NAME = 'ss-crawler' SPIDER_MODULES = ['scrapy_project.spiders'] NEWSPIDER_MODULE = 'scrapy_project.spiders' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Set to True if you want to respect robots.txt # Configure delays for requests DOWNLOAD_DELAY = 1.0 # Default delay in seconds RANDOMIZE_DOWNLOAD_DELAY = True # Add random jitter to delay RANDOMIZE_DOWNLOAD_DELAY_RANGE = 0.5 # 0.5 to 1.5x of DOWNLOAD_DELAY # Concurrent requests CONCURRENT_REQUESTS = 1 # Conservative default CONCURRENT_REQUESTS_PER_DOMAIN = 1 # Request settings DOWNLOAD_TIMEOUT = 30 RETRY_TIMES = 3 RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403] # Also retry 403 (Forbidden) # User agent USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' # Enable and configure pipelines ITEM_PIPELINES = { 'scrapy_project.pipelines.ValidationPipeline': 300, 'scrapy_project.pipelines.DeduplicationPipeline': 400, 'scrapy_project.pipelines.StoragePipeline': 500, 'scrapy_project.pipelines.ExportPipeline': 600, } # Enable and configure middlewares DOWNLOADER_MIDDLEWARES = { 'scrapy_project.middlewares.CustomHeadersMiddleware': 400, 'scrapy_project.middlewares.ProxyMiddleware': 500, # Proxy before rate limiting 'scrapy_project.middlewares.RateLimitMiddleware': 543, 'scrapy_project.middlewares.CustomRetryMiddleware': 550, 'scrapy_project.middlewares.StateMiddleware': 600, } # Proxy settings PROXIES = [] # List of proxy URLs (e.g., ['http://proxy1:port', 'http://proxy2:port']) PROXY_MODE = 'rotate' # 'rotate' or 'random' PROXY_MAX_FAILURE_RATE = 0.5 # Maximum failure rate (0.0-1.0) before removing proxy (default: 0.5 = 50%) PROXY_MIN_REQUESTS = 5 # Minimum requests before evaluating failure rate PROXY_ENABLE_HEALTH_CHECK = True # Enable health checking and auto-removal of bad proxies # Enable or disable extensions EXTENSIONS = { 'scrapy.extensions.telnet.TelnetConsole': None, } # Configure 
# Configure logging
LOG_LEVEL = 'INFO'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'

# Data output settings (project-custom; presumably read by the pipelines)
DATA_OUTPUT_DIR = 'data'
EXPORT_DIR = 'data/exports'
EXPORT_FORMAT = 'woocommerce'  # Options: woocommerce, shopify, excel, json, all

# State management
STATE_FILE = None  # Set to path if resume capability is needed

# Enable and configure cache
HTTPCACHE_ENABLED = False  # Set to True to enable caching
HTTPCACHE_EXPIRATION_SECS = 3600
HTTPCACHE_DIR = 'data/cache'

# AutoThrottle settings (optional, for adaptive delays)
AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = False

# Memory settings
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 500
MEMUSAGE_WARNING_MB = 400

# Stats
STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'

# Enable feed exports
FEEDS = {}  # Can be configured dynamically

# Disable cookies (optional, can be enabled per spider)
COOKIES_ENABLED = True

# Request headers (can be overridden by config)
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Accept-Encoding': 'gzip, deflate',
}