"""Scrapy middlewares for rate limiting, retry, and headers.""" import random import time from typing import Optional, Dict from scrapy import signals from scrapy.downloadermiddlewares.retry import RetryMiddleware from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware from scrapy.utils.response import response_status_message class RateLimitMiddleware: """Middleware to enforce rate limiting between requests.""" def __init__(self, delay: float = 1.0, randomize_delay: bool = True): """ Initialize rate limit middleware. Args: delay: Base delay in seconds between requests randomize_delay: Whether to add random jitter to delay """ self.delay = delay self.randomize_delay = randomize_delay self.last_request_time = {} @classmethod def from_crawler(cls, crawler): """Create middleware instance from crawler settings.""" delay = crawler.settings.getfloat('DOWNLOAD_DELAY', 1.0) randomize = crawler.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY', True) return cls(delay=delay, randomize_delay=randomize) def process_request(self, request, spider): """Process request and enforce rate limit.""" domain = request.url.split('/')[2] if '/' in request.url else request.url # Check if we need to wait if domain in self.last_request_time: elapsed = time.time() - self.last_request_time[domain] # Calculate delay (with optional jitter) delay = self.delay if self.randomize_delay: delay = delay * (0.5 + random.random()) if elapsed < delay: sleep_time = delay - elapsed time.sleep(sleep_time) # Update last request time self.last_request_time[domain] = time.time() return None class CustomRetryMiddleware(RetryMiddleware): """Custom retry middleware with exponential backoff.""" def __init__(self, settings): super().__init__(settings) self.max_retry_times = settings.getint('RETRY_TIMES', 3) self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) def process_response(self, request, response, spider): """Process response and retry if necessary.""" # Check for blocking indicators if response.status == 403: # Check response content for blocking indicators try: content_lower = response.text.lower() if hasattr(response, 'text') and response.text else '' except: content_lower = '' # Handle headers (may be bytes or string) server_header = response.headers.get('Server', b'') if isinstance(server_header, bytes): server_header = server_header.decode('utf-8', errors='ignore') else: server_header = str(server_header) blocking_indicators = [] if 'datadome' in content_lower or 'DataDome' in server_header: blocking_indicators.append('DataDome protection') if 'captcha' in content_lower: blocking_indicators.append('CAPTCHA challenge') if 'cloudflare' in content_lower or 'cf-ray' in response.headers: blocking_indicators.append('Cloudflare protection') if 'access denied' in content_lower or 'blocked' in content_lower: blocking_indicators.append('Access denied message') if blocking_indicators: spider.logger.error( f"🚫 BLOCKED: {response.status} Forbidden detected on {request.url}. " f"Blocking indicators: {', '.join(blocking_indicators)}. " f"Server: {server_header or 'Unknown'}" ) else: spider.logger.warning(f"403 Forbidden on {request.url} (no specific blocking indicators found)") if response.status == 429: spider.logger.error( f"⚠️ RATE LIMITED: 429 Too Many Requests on {request.url}. " f"Consider increasing delay between requests." 
class CustomRetryMiddleware(RetryMiddleware):
    """Custom retry middleware with exponential backoff."""

    def __init__(self, settings):
        super().__init__(settings)
        self.max_retry_times = settings.getint('RETRY_TIMES', 3)
        self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))

    def process_response(self, request, response, spider):
        """Process response and retry if necessary."""
        # Check for blocking indicators
        if response.status == 403:
            # Check response content for blocking indicators
            try:
                content_lower = response.text.lower() if hasattr(response, 'text') and response.text else ''
            except Exception:
                content_lower = ''

            # Handle headers (may be bytes or string)
            server_header = response.headers.get('Server', b'')
            if isinstance(server_header, bytes):
                server_header = server_header.decode('utf-8', errors='ignore')
            else:
                server_header = str(server_header)

            blocking_indicators = []
            if 'datadome' in content_lower or 'DataDome' in server_header:
                blocking_indicators.append('DataDome protection')
            if 'captcha' in content_lower:
                blocking_indicators.append('CAPTCHA challenge')
            if 'cloudflare' in content_lower or 'cf-ray' in response.headers:
                blocking_indicators.append('Cloudflare protection')
            if 'access denied' in content_lower or 'blocked' in content_lower:
                blocking_indicators.append('Access denied message')

            if blocking_indicators:
                spider.logger.error(
                    f"🚫 BLOCKED: {response.status} Forbidden detected on {request.url}. "
                    f"Blocking indicators: {', '.join(blocking_indicators)}. "
                    f"Server: {server_header or 'Unknown'}"
                )
            else:
                spider.logger.warning(f"403 Forbidden on {request.url} (no specific blocking indicators found)")

        if response.status == 429:
            spider.logger.error(
                f"⚠️ RATE LIMITED: 429 Too Many Requests on {request.url}. "
                f"Consider increasing delay between requests."
            )

        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        return response

    def _retry(self, request, reason, spider):
        """Retry request with exponential backoff."""
        retry_times = request.meta.get('retry_times', 0) + 1

        if retry_times <= self.max_retry_times:
            # Calculate exponential backoff delay
            delay = 2 ** retry_times + random.uniform(0, 1)
            spider.logger.debug(f"Retrying {request} (attempt {retry_times}/{self.max_retry_times}) after {delay:.2f}s")

            retry_req = request.copy()
            retry_req.meta['retry_times'] = retry_times
            retry_req.dont_filter = True
            retry_req.priority = request.priority + 1

            # Add delay
            time.sleep(delay)

            return retry_req
        else:
            spider.logger.error(f"Gave up retrying {request} after {retry_times} attempts")


class CustomHeadersMiddleware(UserAgentMiddleware):
    """Middleware to set custom headers from config."""

    def __init__(self, user_agent: str = None):
        """Initialize headers middleware."""
        self.user_agent = user_agent or 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        self.custom_headers = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create middleware instance from crawler settings."""
        user_agent = crawler.settings.get('USER_AGENT')
        return cls(user_agent=user_agent)

    def process_request(self, request, spider):
        """Process request and set headers."""
        # Set User-Agent
        if self.user_agent:
            request.headers['User-Agent'] = self.user_agent

        # Set custom headers from spider config if available
        if hasattr(spider, 'config') and spider.config:
            headers = spider.config.get('headers', {})
            for key, value in headers.items():
                if key.lower() != 'user-agent':  # Don't override User-Agent if already set
                    request.headers[key] = value

        # For Etsy, add additional headers to avoid 403
        if 'etsy.com' in request.url:
            if 'Referer' not in request.headers:
                request.headers['Referer'] = 'https://www.etsy.com/'
            if 'Accept-Language' not in request.headers:
                request.headers['Accept-Language'] = 'en-US,en;q=0.9'
            if 'Accept-Encoding' not in request.headers:
                request.headers['Accept-Encoding'] = 'gzip, deflate, br'
            if 'Connection' not in request.headers:
                request.headers['Connection'] = 'keep-alive'
            if 'Upgrade-Insecure-Requests' not in request.headers:
                request.headers['Upgrade-Insecure-Requests'] = '1'
            if 'Sec-Fetch-Dest' not in request.headers:
                request.headers['Sec-Fetch-Dest'] = 'document'
            if 'Sec-Fetch-Mode' not in request.headers:
                request.headers['Sec-Fetch-Mode'] = 'navigate'
            if 'Sec-Fetch-Site' not in request.headers:
                request.headers['Sec-Fetch-Site'] = 'same-origin'

        return None

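# Sketch of the optional spider-level `config` dict that CustomHeadersMiddleware and
# ProxyMiddleware look for (shape inferred from the .get() calls in this module; the
# concrete values are illustrative assumptions):
#
#   spider.config = {
#       'headers': {'Accept-Language': 'en-US,en;q=0.9'},
#       'proxies': {
#           'list': ['http://proxy1:8080', 'http://proxy2:8080'],
#           'mode': 'rotate',  # or 'random'
#       },
#   }
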
class ProxyMiddleware:
    """
    Middleware to handle proxy rotation with health checking and auto-removal of bad proxies.

    Features:
    - Health tracking (success/failure counts, failure rate)
    - Auto-remove bad proxies (when failure rate exceeds threshold)
    - Retry with different proxy on failure
    - Statistics tracking
    """

    def __init__(self, proxies: Optional[list] = None, proxy_mode: str = 'rotate',
                 max_failure_rate: float = 0.5, min_requests: int = 5,
                 enable_health_check: bool = True):
        """
        Initialize proxy middleware.

        Args:
            proxies: List of proxy URLs (e.g., ['http://proxy1:port', 'http://proxy2:port'])
            proxy_mode: 'rotate' (rotate through proxies) or 'random' (random selection)
            max_failure_rate: Maximum failure rate (0.0-1.0) before removing proxy (default: 0.5 = 50%)
            min_requests: Minimum requests before evaluating failure rate (default: 5)
            enable_health_check: Enable health checking and auto-removal (default: True)
        """
        self.proxies = list(proxies) if proxies else []
        self.original_proxies = list(self.proxies)  # Keep original list
        self.proxy_mode = proxy_mode
        self.current_proxy_index = 0

        # Health tracking
        self.enable_health_check = enable_health_check
        self.max_failure_rate = max_failure_rate
        self.min_requests = min_requests
        self.proxy_stats: Dict[str, Dict] = {}  # {proxy: {'success': int, 'failure': int, 'last_failure': float}}

        # Initialize stats for all proxies
        for proxy in self.proxies:
            if proxy not in self.proxy_stats:
                self.proxy_stats[proxy] = {'success': 0, 'failure': 0, 'last_failure': 0}

    @classmethod
    def from_crawler(cls, crawler):
        """Create middleware instance from crawler settings."""
        proxies = crawler.settings.getlist('PROXIES', [])
        proxy_mode = crawler.settings.get('PROXY_MODE', 'rotate')
        max_failure_rate = crawler.settings.getfloat('PROXY_MAX_FAILURE_RATE', 0.5)
        min_requests = crawler.settings.getint('PROXY_MIN_REQUESTS', 5)
        enable_health_check = crawler.settings.getbool('PROXY_ENABLE_HEALTH_CHECK', True)
        middleware = cls(
            proxies=proxies,
            proxy_mode=proxy_mode,
            max_failure_rate=max_failure_rate,
            min_requests=min_requests,
            enable_health_check=enable_health_check
        )
        # Connect the spider_closed signal so the statistics summary is actually logged
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def _get_available_proxies(self):
        """Get list of available (not removed) proxies."""
        if not self.enable_health_check:
            return self.proxies

        available = []
        for proxy in self.proxies:
            stats = self.proxy_stats.get(proxy, {'success': 0, 'failure': 0})
            total = stats['success'] + stats['failure']

            # Skip if not enough requests to evaluate
            if total < self.min_requests:
                available.append(proxy)
                continue

            # Calculate failure rate
            failure_rate = stats['failure'] / total if total > 0 else 0

            # Only include if failure rate is acceptable
            if failure_rate <= self.max_failure_rate:
                available.append(proxy)

        return available

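    # Worked example of the health check above (values are hypothetical): with
    # min_requests=5 and max_failure_rate=0.5, a proxy with 2 successes and 4 failures
    # has seen 6 requests and a failure rate of 4/6 ≈ 0.67 > 0.5, so it is excluded;
    # a proxy with 3 successes and 3 failures sits at exactly 0.5 and is kept.
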
    def _select_proxy(self):
        """Select a proxy based on mode and health."""
        available = self._get_available_proxies()

        if not available:
            # If all proxies are bad, reset and use all (with warning)
            if self.proxies:
                return self.proxies[0]  # Fallback to first proxy
            return None

        if self.proxy_mode == 'random':
            return random.choice(available)
        else:  # rotate
            # Find index in available list
            if available:
                # Try to maintain rotation order
                proxy = available[self.current_proxy_index % len(available)]
                self.current_proxy_index = (self.current_proxy_index + 1) % len(available)
                return proxy
            return None

    def _mark_proxy_success(self, proxy: str, spider):
        """Mark proxy as successful."""
        if proxy and self.enable_health_check:
            if proxy not in self.proxy_stats:
                self.proxy_stats[proxy] = {'success': 0, 'failure': 0, 'last_failure': 0}
            self.proxy_stats[proxy]['success'] += 1

    def _mark_proxy_failure(self, proxy: str, spider):
        """Mark proxy as failed and check if should be removed."""
        if proxy and self.enable_health_check:
            if proxy not in self.proxy_stats:
                self.proxy_stats[proxy] = {'success': 0, 'failure': 0, 'last_failure': 0}

            stats = self.proxy_stats[proxy]
            stats['failure'] += 1
            stats['last_failure'] = time.time()

            total = stats['success'] + stats['failure']
            failure_rate = stats['failure'] / total if total > 0 else 0

            # Log warning if failure rate is high
            if total >= self.min_requests and failure_rate > self.max_failure_rate:
                spider.logger.warning(
                    f"⚠️ Proxy {proxy} has high failure rate: {failure_rate:.1%} "
                    f"({stats['failure']}/{total}). Will be excluded from rotation."
                )

    def _get_proxy_stats_summary(self) -> str:
        """Get summary of proxy statistics."""
        if not self.proxy_stats:
            return "No proxy statistics available"

        lines = []
        for proxy, stats in self.proxy_stats.items():
            total = stats['success'] + stats['failure']
            if total > 0:
                success_rate = stats['success'] / total
                lines.append(
                    f"  {proxy}: {stats['success']} success, {stats['failure']} failure "
                    f"({success_rate:.1%} success rate)"
                )
        return "\n".join(lines) if lines else "No requests processed yet"

    def process_request(self, request, spider):
        """Set proxy for request."""
        # Check if proxy is already set in request meta (e.g., from retry)
        if 'proxy' in request.meta:
            return None

        # Get proxies from spider config if available
        if hasattr(spider, 'config') and spider.config:
            config_proxies = spider.config.get('proxies', {}).get('list', [])
            if config_proxies:
                # Update proxies list
                new_proxies = list(config_proxies)
                if new_proxies != self.original_proxies:
                    self.proxies = new_proxies
                    self.original_proxies = list(new_proxies)
                    # Initialize stats for new proxies
                    for proxy in new_proxies:
                        if proxy not in self.proxy_stats:
                            self.proxy_stats[proxy] = {'success': 0, 'failure': 0, 'last_failure': 0}
                self.proxy_mode = spider.config.get('proxies', {}).get('mode', 'rotate')

        # If no proxies configured, skip
        if not self.proxies:
            return None

        # Select proxy
        proxy = self._select_proxy()
        if not proxy:
            spider.logger.warning("No available proxies, skipping proxy assignment")
            return None

        # Set proxy in request meta
        request.meta['proxy'] = proxy
        request.meta['proxy_retry_count'] = request.meta.get('proxy_retry_count', 0)
        spider.logger.debug(f"Using proxy: {proxy} (retry: {request.meta['proxy_retry_count']})")
        return None

    def process_response(self, request, response, spider):
        """Track proxy success/failure based on response."""
        proxy = request.meta.get('proxy')
        if proxy:
            # Consider 2xx and 3xx as success
            if 200 <= response.status < 400:
                self._mark_proxy_success(proxy, spider)
            else:
                # 4xx and 5xx are failures (but might be retried)
                # We'll mark as failure in process_exception if it's a connection error
                # For HTTP errors, we mark as failure only if it's a blocking indicator
                if response.status in [403, 429]:
                    # These are likely proxy-related blocks
                    self._mark_proxy_failure(proxy, spider)
                    spider.logger.warning(
                        f"Proxy {proxy} returned {response.status} for {request.url}. "
                        f"Marking as failure."
                    )

        return response

    def process_exception(self, request, exception, spider):
        """Handle proxy errors and retry with different proxy."""
        proxy = request.meta.get('proxy')
        if proxy:
            # Mark proxy as failed
            self._mark_proxy_failure(proxy, spider)

            # Check if we should retry with a different proxy
            proxy_retry_count = request.meta.get('proxy_retry_count', 0)
            max_proxy_retries = 3  # Max retries with different proxies

            if proxy_retry_count < max_proxy_retries:
                available = self._get_available_proxies()
                # Remove current proxy from available list
                available = [p for p in available if p != proxy]

                if available:
                    # Select a different proxy
                    new_proxy = random.choice(available) if self.proxy_mode == 'random' else available[0]
                    spider.logger.info(
                        f"🔄 Retrying with different proxy: {new_proxy} "
                        f"(previous: {proxy}, attempt {proxy_retry_count + 1}/{max_proxy_retries})"
                    )

                    # Create new request with different proxy
                    retry_req = request.copy()
                    retry_req.meta['proxy'] = new_proxy
                    retry_req.meta['proxy_retry_count'] = proxy_retry_count + 1
                    retry_req.dont_filter = True
                    return retry_req
                else:
                    spider.logger.error(
                        f"❌ No available proxies left. All proxies have failed. "
                        f"Original error: {exception}"
                    )
            else:
                spider.logger.error(
                    f"❌ Max proxy retries ({max_proxy_retries}) reached for {request.url}. "
                    f"Last proxy: {proxy}, Error: {exception}"
                )

        return None

    def spider_closed(self, spider):
        """Log proxy statistics when spider closes."""
        if self.proxy_stats:
            spider.logger.info("📊 Proxy Statistics Summary:")
            spider.logger.info(self._get_proxy_stats_summary())

            # Log removed proxies
            available = self._get_available_proxies()
            removed = [p for p in self.original_proxies if p not in available]
            if removed:
                spider.logger.warning(f"⚠️ Removed {len(removed)} bad proxy(ies): {', '.join(removed)}")

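# Illustrative crawler settings read by ProxyMiddleware.from_crawler(); the setting
# names come from the code above, while the proxy URLs and values are placeholders:
#
#   PROXIES = ['http://proxy1:8080', 'http://proxy2:3128']
#   PROXY_MODE = 'rotate'              # or 'random'
#   PROXY_MAX_FAILURE_RATE = 0.5       # exclude proxies failing more than 50% of the time
#   PROXY_MIN_REQUESTS = 5             # evaluate a proxy only after 5 requests
#   PROXY_ENABLE_HEALTH_CHECK = True
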
class StateMiddleware:
    """Middleware to save crawl state for resume capability."""

    def __init__(self, state_file: Optional[str] = None):
        """Initialize state middleware."""
        self.state_file = state_file
        self.visited_urls = set()

    @classmethod
    def from_crawler(cls, crawler):
        """Create middleware instance from crawler settings."""
        state_file = crawler.settings.get('STATE_FILE')
        middleware = cls(state_file=state_file)
        # Connect signals so state is loaded on open and saved on close
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_opened(self, spider):
        """Called when spider is opened. Load state if exists."""
        if self.state_file:
            import json
            from pathlib import Path

            state_path = Path(self.state_file)
            if state_path.exists():
                try:
                    with open(state_path, 'r') as f:
                        state = json.load(f)
                    self.visited_urls = set(state.get('visited_urls', []))
                    spider.logger.info(f"Loaded state: {len(self.visited_urls)} visited URLs")
                except Exception as e:
                    spider.logger.warning(f"Could not load state: {e}")

    def process_request(self, request, spider):
        """Check if URL was already visited."""
        # Don't block start URLs - they should always be crawled
        # Only block if this is a resume crawl (state_file is set) and URL is not a start URL
        if self.state_file and request.url in self.visited_urls:
            # Check if this is a start URL
            start_urls = getattr(spider, 'start_urls', [])
            if request.url not in start_urls:
                spider.logger.debug(f"Skipping already visited URL: {request.url}")
                from scrapy.exceptions import IgnoreRequest
                raise IgnoreRequest(f"URL already visited: {request.url}")
        return None

    def process_response(self, request, response, spider):
        """Mark URL as visited after successful response."""
        self.visited_urls.add(request.url)
        return response

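    # For reference, spider_closed() below persists state in this JSON shape
    # (the file path is whatever STATE_FILE points to; the URL is a made-up example):
    #
    #   {
    #       "visited_urls": ["https://example.com/page-1"],
    #       "total_visited": 1
    #   }
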
Save state.""" if self.state_file: import json from pathlib import Path state_path = Path(self.state_file) state_path.parent.mkdir(parents=True, exist_ok=True) state = { 'visited_urls': list(self.visited_urls), 'total_visited': len(self.visited_urls) } try: with open(state_path, 'w') as f: json.dump(state, f, indent=2) spider.logger.info(f"Saved state: {len(self.visited_urls)} visited URLs to {state_path}") except Exception as e: spider.logger.error(f"Could not save state: {e}")