#!/usr/bin/env python3
"""
Test one or more proxies before using them in the crawler.

Usage:
    python3 scripts/test_proxy.py http://proxy.example.com:8080
    python3 scripts/test_proxy.py http://user:pass@proxy.example.com:8080
    python3 scripts/test_proxy.py http://proxy1:8080 http://proxy2:8080
"""

import sys
import time
from urllib.parse import urlparse

import httpx

DEFAULT_TEST_URL = "https://httpbin.org/ip"

# Browser-like headers sent with every request made through the proxy.
BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
}


def _open_client(proxy_url: str, timeout: int) -> "httpx.Client":
    """Build an httpx.Client that routes HTTP and HTTPS traffic via proxy_url."""
    common = {
        "headers": BROWSER_HEADERS,
        "timeout": timeout,
        "follow_redirects": True,
    }
    try:
        # Newer httpx releases take a single ``proxy`` argument; the older
        # ``proxies`` mapping was deprecated and later removed.
        return httpx.Client(proxy=proxy_url, **common)
    except TypeError:
        # Older httpx releases only understand the ``proxies`` mapping.
        return httpx.Client(
            proxies={"http://": proxy_url, "https://": proxy_url},
            **common,
        )


def _check_etsy(client: "httpx.Client", timeout: int) -> None:
    """Probe an Etsy shop page through the client and print the outcome."""
    print("\n🔍 Testing với Etsy...")
    try:
        etsy_response = client.get(
            "https://www.etsy.com/shop/KappClass", timeout=timeout
        )
        print(f" Etsy Status: {etsy_response.status_code}")
        if etsy_response.status_code == 403:
            print(" ⚠️ WARNING: Bị block bởi Etsy (403)")
        elif etsy_response.status_code == 200:
            print(" ✅ OK: Proxy có thể access Etsy")
    except Exception as e:
        # Best effort: an Etsy failure must not fail the overall proxy test.
        print(f" ❌ Error: {e}")


def test_proxy(
    proxy_url: str,
    test_url: str = DEFAULT_TEST_URL,
    timeout: int = 10,
) -> bool:
    """
    Test a single proxy by fetching test_url through it.

    Progress and results are printed to stdout. When test_url points at
    httpbin.org, an additional request to Etsy is made through the same
    client; its result is printed but does not affect the return value.

    Args:
        proxy_url: URL of the proxy (e.g., http://proxy:8080).
        test_url: URL to fetch through the proxy (default: httpbin.org/ip).
        timeout: Timeout in seconds.

    Returns:
        True if the request to test_url completed (whatever its HTTP status),
        False if it timed out or raised any other error.
    """
    print(f"\n{'='*60}")
    print(f"Testing proxy: {proxy_url}")
    print(f"Test URL: {test_url}")
    print(f"{'='*60}")

    try:
        # perf_counter is monotonic, so the measurement is immune to
        # wall-clock adjustments.
        started = time.perf_counter()
        with _open_client(proxy_url, timeout) as client:
            response = client.get(test_url)
            elapsed = time.perf_counter() - started

            print(f"✅ Status Code: {response.status_code}")
            print(f"⏱️ Response Time: {elapsed:.2f}s")
            print("📄 Response:")
            try:
                print(f" {response.text[:200]}")
            except Exception:
                # Decoding or console-encoding problems should not fail the
                # test, but KeyboardInterrupt/SystemExit must still propagate.
                print(" (Binary content)")

            # Also probe Etsy when the test URL is the httpbin default.
            if "httpbin.org" in test_url:
                _check_etsy(client, timeout)

            return True

    except httpx.TimeoutException:
        print(f"❌ TIMEOUT: Proxy không phản hồi sau {timeout}s")
        return False
    except httpx.ProxyError as e:
        print(f"❌ PROXY ERROR: {e}")
        return False
    except httpx.ConnectError as e:
        print("❌ CONNECTION ERROR: Không thể kết nối đến proxy")
        print(f" {e}")
        return False
    except Exception as e:
        print(f"❌ ERROR: {type(e).__name__}: {e}")
        return False


def _parse_args(argv):
    """
    Split command-line arguments into proxy URLs and the test URL.

    Args:
        argv: Arguments after the program name.

    Returns:
        A ``(proxy_urls, test_url)`` tuple. ``test_url`` is the value of the
        last ``--url`` option, or the default when none is given.
    """
    proxy_urls = []
    test_url = DEFAULT_TEST_URL

    i = 0
    while i < len(argv):
        arg = argv[i]
        if arg == "--url" and i + 1 < len(argv):
            test_url = argv[i + 1]
            i += 2
            continue
        if arg.startswith("--"):
            # Unknown flag, or --url without a value: skip it, but say so
            # instead of dropping it silently.
            print(
                f"⚠️ Ignoring unrecognized or incomplete option: {arg}",
                file=sys.stderr,
            )
        else:
            proxy_urls.append(arg)
        i += 1

    return proxy_urls, test_url


def main():
    """Parse the command line, test every proxy given, and print a summary."""
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/test_proxy.py <proxy_url1> [proxy_url2] ...")
        print("\nExamples:")
        print(" python3 scripts/test_proxy.py http://proxy.example.com:8080")
        print(" python3 scripts/test_proxy.py http://user:pass@proxy.example.com:8080")
        print(" python3 scripts/test_proxy.py http://proxy1:8080 http://proxy2:8080")
        print("\nTest với URL khác:")
        print(" python3 scripts/test_proxy.py http://proxy:8080 --url https://www.etsy.com")
        sys.exit(1)

    proxy_urls, test_url = _parse_args(sys.argv[1:])

    if not proxy_urls:
        print("❌ Error: Cần ít nhất một proxy URL")
        sys.exit(1)

    print(f"🧪 Testing {len(proxy_urls)} proxy(ies)...")
    print(f"Test URL: {test_url}")

    results = []
    for index, proxy_url in enumerate(proxy_urls):
        if index:
            # Pause between consecutive tests only; no delay after the last.
            time.sleep(1)
        results.append((proxy_url, test_proxy(proxy_url, test_url)))

    # Summary
    print(f"\n{'='*60}")
    print("📊 SUMMARY")
    print(f"{'='*60}")

    for proxy_url, success in results:
        status = "✅ OK" if success else "❌ FAILED"
        print(f"{status}: {proxy_url}")

    working = [proxy_url for proxy_url, success in results if success]
    print(f"\n✅ {len(working)}/{len(results)} proxy(ies) working")

    if working:
        print("\n💡 Để dùng proxy trong crawler, thêm vào config YAML:")
        print("proxies:")
        print(" enabled: true")
        print(" list:")
        for proxy_url in working:
            print(f" - {proxy_url}")
        print(" mode: rotate")


if __name__ == "__main__":
    main()