#!/usr/bin/env python3
"""
Script to check if a website is blocking our crawler.

Tests a URL and analyzes the response for blocking indicators.
"""

import importlib.util
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse

import requests

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Seconds to wait for the server before giving up.
REQUEST_TIMEOUT_SECONDS = 30

# A 200 OK whose body is shorter than this many characters is flagged as a
# likely error/interstitial page.
SMALL_RESPONSE_CHARS = 1000

# Status codes with a dedicated message; any other non-200 status is reported
# as "unexpected".
STATUS_MESSAGES = {
    403: "❌ BLOCKED: 403 Forbidden",
    429: "⚠️ RATE LIMITED: 429 Too Many Requests",
    503: "⚠️ SERVICE UNAVAILABLE: 503",
}

# Lower-case phrases searched for in the lower-cased response body.
BLOCKING_KEYWORDS = (
    'access denied',
    'blocked',
    'captcha',
    'cloudflare',
    'challenge',
    'security check',
    'bot detection',
    'rate limit',
    'too many requests',
    'forbidden',
    'unauthorized access',
    'please verify you are human',
    'verify you are not a robot',
)

# Attribute looked for on Etsy shop pages to count product listings.
LISTING_ID_PATTERN = re.compile(r'data-listing-id="(\d+)"')


def _is_etsy_url(url: str) -> bool:
    """Return True when the URL's host is etsy.com or one of its subdomains."""
    try:
        host = urlparse(url).hostname or ''
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
        return False
    return host == 'etsy.com' or host.endswith('.etsy.com')


def _is_etsy_shop_url(url: str) -> bool:
    """Return True when the URL is an Etsy URL whose path contains /shop/."""
    return _is_etsy_url(url) and '/shop/' in urlparse(url).path


def _brotli_available() -> bool:
    """Return True when a Brotli decoder package is importable."""
    for module_name in ('brotli', 'brotlicffi'):
        try:
            if importlib.util.find_spec(module_name) is not None:
                return True
        except (ImportError, ValueError):
            continue
    return False


def _default_headers(url: str) -> Dict[str, str]:
    """Build the browser-like request headers used when the caller gives none."""
    # Only advertise "br" when the body can actually be decoded; otherwise a
    # Brotli-compressed response would reach the text checks as binary noise.
    encodings = 'gzip, deflate, br' if _brotli_available() else 'gzip, deflate'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': encodings,
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    if _is_etsy_url(url):
        headers['Referer'] = 'https://www.etsy.com/'
    return headers


def _etsy_listing_indicator(html: str) -> Optional[str]:
    """Check an Etsy shop page for product listings.

    Prints the number of listings found and returns a warning string when the
    page has none or suspiciously few, otherwise None.
    """
    if 'data-listing-id' not in html:
        return "⚠️ WARNING: No product listings found (missing 'data-listing-id')"

    product_count = len(LISTING_ID_PATTERN.findall(html))
    print(f"Products found in HTML: {product_count}")
    if product_count == 0:
        return "⚠️ WARNING: No products found on page"
    if product_count < 10:
        return f"⚠️ WARNING: Only {product_count} products found (expected 20-40)"
    return None


def _blocking_indicators(url: str, response) -> List[str]:
    """Collect every sign in the response that the request was blocked."""
    indicators = []
    status = response.status_code
    body = response.text

    # Check status code
    if status in STATUS_MESSAGES:
        indicators.append(STATUS_MESSAGES[status])
    elif status != 200:
        indicators.append(f"⚠️ UNEXPECTED STATUS: {status}")

    # Check response content for blocking indicators
    lowered = body.lower()
    found_keywords = [kw for kw in BLOCKING_KEYWORDS if kw in lowered]
    if found_keywords:
        indicators.append(f"⚠️ BLOCKING KEYWORDS FOUND: {', '.join(found_keywords)}")

    # Check response size (very small responses might be error pages)
    if len(body) < SMALL_RESPONSE_CHARS and status == 200:
        indicators.append("⚠️ SUSPICIOUS: Very small response (< 1KB) for 200 OK")

    # Check for expected content (for Etsy, look for product listings)
    if _is_etsy_shop_url(url):
        listing_warning = _etsy_listing_indicator(body)
        if listing_warning is not None:
            indicators.append(listing_warning)

    return indicators


def check_blocking(url: str, headers: Optional[Dict[str, Optional[str]]] = None) -> bool:
    """Check if a URL is blocking our requests.

    Fetches the URL with a GET request (following redirects), prints a short
    report, and evaluates the status code, body keywords, body size and, for
    Etsy shop pages, the number of product listings.

    Args:
        url: The URL to request.
        headers: Optional request headers. Entries whose value is None are
            dropped. When omitted, browser-like default headers are used.

    Returns:
        True when at least one blocking indicator was found or the request
        failed; False when the request appears successful. The presence of
        Cloudflare alone is reported as information and does not count as
        blocking.
    """
    if headers is None:
        headers = _default_headers(url)

    # Remove None values
    headers = {k: v for k, v in headers.items() if v is not None}

    print(f"Testing URL: {url}")
    print(f"Headers: {headers.get('User-Agent', 'N/A')[:50]}...")
    print("-" * 60)

    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
            allow_redirects=True,
        )

        print(f"Status Code: {response.status_code}")

        indicators = _blocking_indicators(url, response)

        # Informational notes are printed but do not affect the verdict.
        notes = []
        if 'cf-ray' in response.headers:
            notes.append("ℹ️ INFO: Using Cloudflare (may have protection)")

        if 'server' in response.headers:
            print(f"Server: {response.headers['server']}")

        # Print results; the size is the decoded body length in bytes.
        print(f"Response Size: {len(response.content):,} bytes")
        print(f"Content-Type: {response.headers.get('Content-Type', 'N/A')}")
        for note in notes:
            print(f" {note}")

        if indicators:
            print("\n" + "=" * 60)
            print("BLOCKING DETECTION RESULTS:")
            print("=" * 60)
            for indicator in indicators:
                print(f" {indicator}")
            print("=" * 60)
            return True

        print("\n✅ No blocking detected - request appears successful")
        return False

    except requests.exceptions.Timeout:
        print("❌ ERROR: Request timeout")
        return True
    except requests.exceptions.ConnectionError:
        print("❌ ERROR: Connection error")
        return True
    except Exception as e:
        # Deliberately broad: this is the CLI boundary, and any failure to
        # fetch or analyze the page is reported as "blocked" instead of crashing.
        print(f"❌ ERROR: {e}")
        return True


def main() -> int:
    """Run the check for the URL given on the command line; return the exit code."""
    if len(sys.argv) < 2:
        print("Usage: python check_blocking.py <url>")
        print("Example: python check_blocking.py https://www.etsy.com/shop/KappClass")
        return 1

    is_blocked = check_blocking(sys.argv[1])
    return 1 if is_blocked else 0


if __name__ == '__main__':
    sys.exit(main())