#!/usr/bin/env python3
"""Test script to crawl only 5 products with safe rate limiting."""

import sys
from pathlib import Path

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from scrapy_project.spiders.generic_spider import GenericSpider


def main():
    """Run a test crawl limited to 5 products."""
    config_file = 'configs/gardenkindtees-com.yaml'

    if not Path(config_file).exists():
        print(f"❌ Config file not found: {config_file}")
        sys.exit(1)

    # Get project settings and tighten them for safe testing
    settings = get_project_settings()

    # Safe rate limiting: 3-second base delay between requests.
    # With RANDOMIZE_DOWNLOAD_DELAY enabled, Scrapy waits between
    # 0.5x and 1.5x the base delay, i.e. 1.5-4.5 seconds.
    settings.set('DOWNLOAD_DELAY', 3.0)
    settings.set('RANDOMIZE_DOWNLOAD_DELAY', True)
    # Note: not a built-in Scrapy setting; kept for project-level middleware use.
    settings.set('RANDOMIZE_DOWNLOAD_DELAY_RANGE', 0.5)
    settings.set('CONCURRENT_REQUESTS', 1)
    settings.set('CONCURRENT_REQUESTS_PER_DOMAIN', 1)

    # Stop the spider after 5 scraped items
    settings.set('CLOSESPIDER_ITEMCOUNT', 5)

    # Logging
    settings.set('LOG_LEVEL', 'INFO')

    print("🧪 Testing crawler with 5 products")
    print("   Config: gardenkindtees-com.yaml")
    print("   Delay: 3.0s (with randomization)")
    print("   Limit: 5 products")
    print("   Reviews: Enabled\n")

    # Create the crawler process and schedule the spider
    process = CrawlerProcess(settings)
    process.crawl(
        GenericSpider,
        config_file=config_file
    )

    print("🚀 Starting crawl...\n")

    try:
        process.start(stop_after_crawl=True)
        print("\n✅ Test crawl completed!")
        print("\n📊 Check results in:")
        print("   - data/exports/")
        print("   - data/products.json")
    except KeyboardInterrupt:
        print("\n⚠️  Crawl interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Crawl failed: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()