#!/usr/bin/env python3
"""CLI interface for SS Crawler."""
import os
import sys
import click
import threading
import time
from pathlib import Path
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from scrapy_project.utils.detector import WebsiteDetector
from scrapy_project.utils.analyzer import analyze_website
from scrapy_project.spiders.generic_spider import GenericSpider


class Spinner:
    """Simple loading spinner."""

    def __init__(self, message="Loading"):
        self.spinner_chars = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']
        self.message = message
        self.stop_spinner = False
        self.spinner_thread = None

    def _spin(self):
        """Run spinner animation."""
        i = 0
        while not self.stop_spinner:
            char = self.spinner_chars[i % len(self.spinner_chars)]
            click.echo(f"\r{char} {self.message}", nl=False)
            sys.stdout.flush()
            time.sleep(0.1)
            i += 1
        # Clear spinner line
        click.echo("\r" + " " * (len(self.message) + 3) + "\r", nl=False)
        sys.stdout.flush()

    def __enter__(self):
        """Start spinner."""
        self.spinner_thread = threading.Thread(target=self._spin, daemon=True)
        self.spinner_thread.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop spinner."""
        self.stop_spinner = True
        if self.spinner_thread:
            self.spinner_thread.join(timeout=0.5)


@click.group()
def cli():
    """SS Crawler - Safe and efficient web scraping tool."""
    pass


@cli.command()
@click.argument('url')
@click.option('--output', '-o', default=None, help='Output path for config file')
@click.option('--start-urls', '-u', multiple=True, help='Additional start URLs')
@click.option('--count-all', is_flag=True,
              help='Count all products across all pages/collections (slower but complete)')
def check(url: str, output: str, start_urls: tuple, count_all: bool):
    """Check website and generate configuration file."""
    # Ensure URL has scheme
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    click.echo(f"🔍 Checking website: {url}")

    detector = WebsiteDetector()
    config = detector.detect(url)

    if not config:
        click.echo("❌ Could not detect website type", err=True)
        sys.exit(1)

    # Display detection results
    click.echo(f"\n✅ Detected: {config.name} ({config.type})")
    click.echo(f"📝 Notes: {config.notes}")

    # Generate recommendations
    recommendations = detector._get_recommendations(config)
    if recommendations:
        click.echo("\n💡 Recommendations:")
        for rec in recommendations:
            click.echo(f"  - {rec}")

    # Generate YAML config
    start_urls_list = list(start_urls) if start_urls else None
    try:
        config_path = detector.generate_yaml_config(
            url=url,
            start_urls=start_urls_list,
            output_path=Path(output) if output else None
        )
        click.echo(f"\n✅ Configuration file generated: {config_path}")

        # Automatically analyze website
        if count_all:
            click.echo("\n📊 Analyzing website (counting all products - this may take longer)...")
            spinner_msg = "Counting all products across all collections..."
        else:
            click.echo("\n📊 Analyzing website...")
            spinner_msg = "Discovering collections and counting products..."

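        # Analysis is best-effort: the config file has already been written above,
        # so a failure inside the nested try below is reported but does not abort
        # the command.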
        try:
            # Show loading spinner
            with Spinner(spinner_msg):
                analysis = analyze_website(config_path, save_to_file=True, count_all=count_all)

            # Display formatted results
            from scrapy_project.utils.analyzer import format_analysis_results
            formatted_output = format_analysis_results(analysis)
            click.echo("\n" + formatted_output)

            # Analysis file info
            if analysis.get('analysis_file'):
                click.echo(f"\n💾 Analysis saved to: {analysis['analysis_file']}")
            if analysis.get('analysis_json_file'):
                click.echo(f"   JSON version: {analysis['analysis_json_file']}")
        except Exception as e:
            click.echo(f"⚠️ Analysis failed: {e}", err=True)
            import traceback
            click.echo(f"   Error details: {traceback.format_exc()}", err=True)
            click.echo("   You can still use the config file to crawl")

        click.echo("\n📋 Review and edit the config file if needed, then run:")
        click.echo(f"   ./ss-crawler crawl {Path(config_path).name}")

    except Exception as e:
        click.echo(f"❌ Error generating config: {e}", err=True)
        sys.exit(1)


@cli.command()
@click.argument('config_or_domain', required=False)
@click.option('--config', '-c', help='Path to YAML configuration file (alternative to positional argument)')
@click.option('--url', '-u', help='URL to crawl (requires --auto-detect)')
@click.option('--auto-detect', is_flag=True, help='Auto-detect website configuration')
@click.option('--safe', is_flag=True, help='Safe mode: slow crawl (delay 3.0s, conservative)')
@click.option('--medium', is_flag=True, help='Medium mode: balanced crawl (delay 1.5s)')
@click.option('--fast', is_flag=True, help='Fast mode: quick crawl (delay 0.5s, use with caution)')
@click.option('--output', '-o', default='data/exports', help='Output directory for exports')
@click.option('--format', '-f',
              type=click.Choice(['woocommerce', 'shopify', 'excel', 'json', 'all'], case_sensitive=False),
              default='woocommerce', help='Export format')
@click.option('--resume', is_flag=True, help='Resume from checkpoint')
@click.option('--checkpoint', default=None, help='Path to checkpoint file')
@click.option('--log-level', default='INFO',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR'], case_sensitive=False),
              help='Logging level')
def crawl(config_or_domain: str, config: str, url: str, auto_detect: bool,
          safe: bool, medium: bool, fast: bool, output: str, format: str,
          resume: bool, checkpoint: str, log_level: str):
    """Crawl products from website.
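
    The positional argument takes precedence over --config, which in turn takes
    precedence over --url with --auto-detect.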

    Usage:
        ./ss-crawler crawl themarblecoffee.com
        ./ss-crawler crawl themarblecoffee.com.yaml
        ./ss-crawler crawl themarblecoffee.com --safe
        ./ss-crawler crawl themarblecoffee.com --fast
        ./ss-crawler crawl --config configs/example.yaml
        ./ss-crawler crawl --url https://example.com --auto-detect
    """
    # Determine config file path
    config_path = None
    # Track if user provided a URL directly (not just a domain)
    start_url_override = None

    # Priority: config_or_domain argument > --config option > auto-detect
    if config_or_domain:
        # Check if it's a file path (ends with .yaml or .yml)
        if config_or_domain.endswith(('.yaml', '.yml')):
            config_path = config_or_domain
            # If just a filename, check in the configs directory
            if not Path(config_path).exists() and not Path(config_path).is_absolute():
                configs_path = Path('configs') / config_path
                if configs_path.exists():
                    config_path = str(configs_path)
        else:
            # It's a domain or URL, try to find a config file
            domain = config_or_domain

            # Check if it's a full URL (with a path)
            is_full_url = domain.startswith(('http://', 'https://')) and '/' in domain[8:]

            if is_full_url:
                # User provided a full URL - use it as the start URL
                start_url_override = [domain]
                # Extract the domain for config lookup
                from urllib.parse import urlparse
                parsed = urlparse(domain)
                domain = parsed.netloc
            else:
                # Remove the protocol if present (but no path)
                if domain.startswith(('http://', 'https://')):
                    from urllib.parse import urlparse
                    parsed = urlparse(domain)
                    domain = parsed.netloc

            # Remove www. prefix
            if domain.startswith('www.'):
                domain = domain[4:]

            # Try to find a config file using the utility function
            from scrapy_project.utils.config_utils import find_config_file_for_domain
            found_config = find_config_file_for_domain(domain)
            if found_config:
                config_path = str(found_config)

            # If not found, suggest running check first
            if not config_path:
                click.echo(f"❌ Config file not found for domain: {domain}", err=True)
                click.echo(f"   Please run: ./ss-crawler check {domain}", err=True)
                click.echo(f"   Or provide full path: ./ss-crawler crawl configs/{domain}.yaml", err=True)
                sys.exit(1)
    elif config:
        config_path = config
    elif auto_detect and url:
        # Auto-detect mode
        click.echo(f"🔍 Auto-detecting website: {url}")
        detector = WebsiteDetector()
        config_obj = detector.detect(url)

        if not config_obj:
            click.echo("❌ Could not detect website type", err=True)
            sys.exit(1)

        # Generate temporary config
        temp_config_path = Path('configs') / 'temp_auto_detect.yaml'
        temp_config_path.parent.mkdir(exist_ok=True)
        config_path = str(detector.generate_yaml_config(url=url, output_path=temp_config_path))
        click.echo(f"✅ Generated config: {config_path}")
    else:
        click.echo("❌ Error: Config file or domain required.", err=True)
        click.echo("   Usage:   ./ss-crawler crawl <config_or_domain>", err=True)
        click.echo("   Example: ./ss-crawler crawl themarblecoffee.com", err=True)
        click.echo("   Or:      ./ss-crawler crawl --url <url> --auto-detect", err=True)
        sys.exit(1)

    # Validate that the config file exists
    config_path_obj = Path(config_path)
    if not config_path_obj.exists():
        click.echo(f"❌ Config file not found: {config_path}", err=True)
        click.echo("   Please run: ./ss-crawler check <url>", err=True)
        sys.exit(1)

    config = str(config_path)

    # Determine speed mode
    speed_mode = None
    if safe:
        speed_mode = 'safe'
    elif medium:
        speed_mode = 'medium'
    elif fast:
        speed_mode = 'fast'

    # Speed presets
    speed_presets = {
        'safe': {
            'delay': 3.0,
            'download_delay': 3.0,
            'concurrent_requests': 1,
            'retry_times': 5,
            'download_timeout': 60
        },
        'medium': {
            'delay': 1.5,
            'download_delay': 1.5,
            'concurrent_requests': 1,
            'retry_times': 3,
            'download_timeout': 30
        },
        'fast': {
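            # Most aggressive preset; pairs with the "--fast ... use with caution"
            # help text above.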
            'delay': 0.5,
            'download_delay': 0.5,
            'concurrent_requests': 2,
            'retry_times': 2,
            'download_timeout': 20
        }
    }

    # Load Scrapy settings
    settings = get_project_settings()

    # Update settings from CLI arguments
    settings.set('EXPORT_DIR', output)
    settings.set('EXPORT_FORMAT', format.lower())
    settings.set('LOG_LEVEL', log_level)

    if resume and checkpoint:
        settings.set('STATE_FILE', checkpoint)

    # Apply speed preset if specified
    if speed_mode:
        preset = speed_presets[speed_mode]
        settings.set('DOWNLOAD_DELAY', preset['delay'])
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True)
        settings.set('CONCURRENT_REQUESTS', preset['concurrent_requests'])
        settings.set('CONCURRENT_REQUESTS_PER_DOMAIN', preset['concurrent_requests'])
        settings.set('RETRY_TIMES', preset['retry_times'])
        settings.set('DOWNLOAD_TIMEOUT', preset['download_timeout'])
        click.echo(f"   Speed mode: {speed_mode.upper()} (delay: {preset['delay']}s)")

    # Load config and update settings
    import yaml
    with open(config, 'r', encoding='utf-8') as f:
        config_data = yaml.safe_load(f)

    # Override rate limiting if a speed mode is specified
    if speed_mode and 'rate_limiting' in config_data:
        preset = speed_presets[speed_mode]
        config_data['rate_limiting']['delay'] = preset['delay']
        config_data['rate_limiting']['download_delay'] = preset['delay']
        config_data['rate_limiting']['concurrent_requests'] = preset['concurrent_requests']

    # Load proxy settings from config if available
    if config_data.get('proxies', {}).get('enabled', False):
        proxies = config_data.get('proxies', {}).get('list', [])
        proxy_mode = config_data.get('proxies', {}).get('mode', 'rotate')
        settings.set('PROXIES', proxies)
        settings.set('PROXY_MODE', proxy_mode)
        click.echo(f"   Using {len(proxies)} proxy(ies) - Mode: {proxy_mode}")

    # Create output directories
    Path(output).mkdir(parents=True, exist_ok=True)
    Path('data').mkdir(exist_ok=True)
    Path('logs').mkdir(exist_ok=True)

    # Create crawler process
    process = CrawlerProcess(settings)

    # Add spider with optional start URL override
    spider_kwargs = {'config_file': config}
    if start_url_override:
        spider_kwargs['start_urls_override'] = start_url_override
        click.echo(f"   Start URL: {start_url_override[0]}")

    process.crawl(GenericSpider, **spider_kwargs)

    click.echo("\n🚀 Starting crawl...")
    click.echo(f"   Config: {Path(config).name}")
    if speed_mode:
        preset = speed_presets[speed_mode]
        click.echo(f"   Speed mode: {speed_mode.upper()} "
                   f"(delay: {preset['delay']}s, concurrent: {preset['concurrent_requests']})")
    click.echo(f"   Export format: {format}")
    click.echo(f"   Output directory: {output}")

    # Start crawling with a progress spinner
    try:
        # Start spinner
        with Spinner("Crawling products... (this may take a while)"):
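            # CrawlerProcess.start() runs Scrapy's Twisted reactor and blocks here
            # until the crawl finishes (stop_after_crawl=True), while the Spinner
            # animates on its daemon thread.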
            process.start(stop_after_crawl=True)

        click.echo("\n✅ Crawl completed successfully!")

        # Show file output info
        click.echo("\n📁 Check output files in:")
        click.echo(f"   - {Path(output).absolute()}")
        click.echo(f"   - {Path('data').absolute()}")
    except KeyboardInterrupt:
        click.echo("\n⚠️ Crawl interrupted by user")
        sys.exit(1)
    except Exception as e:
        click.echo(f"\n❌ Crawl failed: {e}", err=True)
        sys.exit(1)


@cli.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.option('--format', '-f',
              type=click.Choice(['woocommerce', 'shopify', 'excel', 'json', 'all'], case_sensitive=False),
              default='woocommerce', help='Export format')
@click.option('--output', '-o', default='data/exports', help='Output directory')
def export(input_file: str, format: str, output: str):
    """Export products from JSON file to various formats."""
    import json
    import pandas as pd
    from datetime import datetime
    from scrapy_project.items import Product

    click.echo(f"📦 Exporting products from: {input_file}")

    # Load JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not data:
        click.echo("❌ No products found in file", err=True)
        sys.exit(1)

    # Convert to Product models
    products = []
    for item in data:
        try:
            product = Product(**item)
            products.append(product)
        except Exception as e:
            click.echo(f"⚠️ Warning: Skipping invalid product: {e}")

    click.echo(f"✅ Loaded {len(products)} products")

    # Create output directory
    Path(output).mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Export based on format
    if format == 'woocommerce' or format == 'all':
        rows = [p.to_woocommerce_csv_row() for p in products]
        df = pd.DataFrame(rows)
        filepath = Path(output) / f"woocommerce_products_{timestamp}.csv"
        df.to_csv(filepath, index=False, encoding='utf-8-sig')
        click.echo(f"✅ Exported to WooCommerce: {filepath}")

    if format == 'shopify' or format == 'all':
        # Use to_shopify_csv_rows() to get multiple rows per product (one per image)
        all_rows = []
        for product in products:
            product_rows = product.to_shopify_csv_rows()
            all_rows.extend(product_rows)
        df = pd.DataFrame(all_rows)
        filepath = Path(output) / f"shopify_products_{timestamp}.csv"
        df.to_csv(filepath, index=False, encoding='utf-8-sig')
        click.echo(f"✅ Exported {len(products)} products ({len(all_rows)} rows) to Shopify: {filepath}")

    if format == 'excel' or format == 'all':
        rows = [p.model_dump() for p in products]
        df = pd.DataFrame(rows)
        filepath = Path(output) / f"products_{timestamp}.xlsx"
        df.to_excel(filepath, index=False, engine='openpyxl')
        click.echo(f"✅ Exported to Excel: {filepath}")

    if format == 'json' or format == 'all':
        rows = [p.model_dump() for p in products]
        filepath = Path(output) / f"products_{timestamp}.json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(rows, f, ensure_ascii=False, indent=2, default=str)
        click.echo(f"✅ Exported to JSON: {filepath}")

    click.echo("\n✅ Export completed!")


@cli.command()
def version():
    """Show version information."""
    click.echo("SS Crawler v0.1.0")
    click.echo("A safe and efficient web scraping tool for product information")


if __name__ == '__main__':
    cli()
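

# ---------------------------------------------------------------------------
# Illustrative config sketch (an assumption, not the authoritative schema): the
# real YAML files are generated by WebsiteDetector.generate_yaml_config and may
# contain additional keys, but the crawl command above reads/overrides these
# keys directly:
#
#   rate_limiting:
#     delay: 1.5                # overridden by --safe / --medium / --fast
#     download_delay: 1.5
#     concurrent_requests: 1
#   proxies:
#     enabled: false
#     mode: rotate              # applied as the PROXY_MODE setting
#     list: []                  # applied as the PROXIES setting
# ---------------------------------------------------------------------------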