"""Scrapy settings for ss-crawler project.""" BOT_NAME = 'ss-crawler' SPIDER_MODULES = ['scrapy_project.spiders'] NEWSPIDER_MODULE = 'scrapy_project.spiders' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Set to True if you want to respect robots.txt # Configure delays for requests DOWNLOAD_DELAY = 1.0 # Default delay in seconds RANDOMIZE_DOWNLOAD_DELAY = True # Add random jitter to delay RANDOMIZE_DOWNLOAD_DELAY_RANGE = 0.5 # 0.5 to 1.5x of DOWNLOAD_DELAY # Concurrent requests CONCURRENT_REQUESTS = 1 # Conservative default CONCURRENT_REQUESTS_PER_DOMAIN = 1 # Request settings DOWNLOAD_TIMEOUT = 30 RETRY_TIMES = 3 RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429, 403] # Also retry 403 (Forbidden) # User agent USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' # Enable and configure pipelines ITEM_PIPELINES = { 'scrapy_project.pipelines.ValidationPipeline': 300, 'scrapy_project.pipelines.DeduplicationPipeline': 400, 'scrapy_project.pipelines.StoragePipeline': 500, 'scrapy_project.pipelines.ExportPipeline': 600, } # Enable and configure middlewares DOWNLOADER_MIDDLEWARES = { 'scrapy_project.middlewares.CustomHeadersMiddleware': 400, 'scrapy_project.middlewares.ProxyMiddleware': 500, # Proxy before rate limiting 'scrapy_project.middlewares.RateLimitMiddleware': 543, 'scrapy_project.middlewares.CustomRetryMiddleware': 550, 'scrapy_project.middlewares.StateMiddleware': 600, } # Proxy settings PROXIES = [] # List of proxy URLs (e.g., ['http://proxy1:port', 'http://proxy2:port']) PROXY_MODE = 'rotate' # 'rotate' or 'random' PROXY_MAX_FAILURE_RATE = 0.5 # Maximum failure rate (0.0-1.0) before removing proxy (default: 0.5 = 50%) PROXY_MIN_REQUESTS = 5 # Minimum requests before evaluating failure rate PROXY_ENABLE_HEALTH_CHECK = True # Enable health checking and auto-removal of bad proxies # Enable or disable extensions EXTENSIONS = { 'scrapy.extensions.telnet.TelnetConsole': None, } # Configure 
# Configure logging
LOG_LEVEL = 'INFO'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'

# Data output settings (project-custom; presumably read by the pipelines)
DATA_OUTPUT_DIR = 'data'
EXPORT_DIR = 'data/exports'
EXPORT_FORMAT = 'woocommerce'  # Options: woocommerce, shopify, excel, json, all

# State management
STATE_FILE = None  # Set to path if resume capability is needed

# Enable and configure cache
HTTPCACHE_ENABLED = False  # Set to True to enable caching
HTTPCACHE_EXPIRATION_SECS = 3600
HTTPCACHE_DIR = 'data/cache'

# AutoThrottle settings (optional, for adaptive delays)
AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = False

# Memory settings
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 500
MEMUSAGE_WARNING_MB = 400

# Stats
STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'

# Enable feed exports
FEEDS = {}  # Can be configured dynamically

# Disable cookies (optional, can be enabled per spider)
COOKIES_ENABLED = True

# Request headers (can be overridden by config)
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Accept-Encoding': 'gzip, deflate',
}