#!/usr/bin/env python3
"""
Cleanup duplicate config files - ensure only one config file per domain.

This script:
1. Finds all config files
2. Groups them by normalized domain
3. Keeps the file in dash format (themarblecoffee-com.yaml), as that is the standard
4. Removes the duplicates (themarblecoffee.com.yaml)
"""
from pathlib import Path
from urllib.parse import urlparse

import yaml


def normalize_config_filename_to_domain(filename: str) -> str:
    """Extract a domain from a config filename."""
    # Drop the extension: themarblecoffee-com.yaml -> themarblecoffee-com
    name = Path(filename).stem

    # If the name contains dashes, it is likely in domain-slug format.
    # Convert it back to a domain by turning the LAST dash into a dot,
    # since earlier dashes may belong to the domain itself:
    # themarblecoffee-com -> themarblecoffee.com, my-shop-com -> my-shop.com
    if '-' in name:
        return '.'.join(name.rsplit('-', 1))
    return name


def get_domain_from_config_file(config_path: Path) -> str:
    """Extract the domain from a config file by reading its content."""
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        if config and 'website' in config and 'base_url' in config['website']:
            parsed = urlparse(config['website']['base_url'])
            domain = parsed.netloc
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
    except Exception:
        pass
    # Fallback: derive the domain from the filename
    return normalize_config_filename_to_domain(config_path.name)


def cleanup_duplicate_configs(dry_run: bool = True):
    """Find and remove duplicate config files."""
    config_dir = Path('configs')
    if not config_dir.exists():
        print(f"Config directory not found: {config_dir}")
        return

    all_configs = list(config_dir.glob('*.yaml')) + list(config_dir.glob('*.yml'))

    # Group config files by domain
    domain_to_files = {}
    for config_file in all_configs:
        domain = get_domain_from_config_file(config_file)
        domain_to_files.setdefault(domain, []).append(config_file)

    # Find duplicates
    duplicates_found = False
    files_to_remove = []

    for domain, files in domain_to_files.items():
        if len(files) > 1:
            duplicates_found = True
            print(f"\n⚠️ Domain '{domain}' has {len(files)} config files:")

            # Sort so the dash format (themarblecoffee-com.yaml), the standard,
            # comes first: the key is False for dash-format names and True for
            # dot-format names, and False sorts before True.
            files_sorted = sorted(
                files,
                key=lambda f: ('.' in f.stem and '-' not in f.stem, f.name),
            )
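            # For example (hypothetical names), ['shop.com.yaml', 'shop-com.yaml']
            # sorts to ['shop-com.yaml', 'shop.com.yaml'], so the dash-format
            # file is the one that gets kept.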
            # Keep the first file: the dash format if one exists, otherwise
            # whichever name sorts first
            keep_file = files_sorted[0]
            remove_files = files_sorted[1:]

            print(f"  ✅ Keep: {keep_file.name}")
            for f in remove_files:
                print(f"  ❌ Remove: {f.name}")
                files_to_remove.append(f)

    if not duplicates_found:
        print("✅ No duplicate config files found!")
        return

    if dry_run:
        print(f"\n🔍 DRY RUN: Would remove {len(files_to_remove)} duplicate file(s)")
        print("   Run with --execute to actually remove files")
    else:
        print(f"\n🗑️ Removing {len(files_to_remove)} duplicate file(s)...")
        for f in files_to_remove:
            try:
                f.unlink()
                print(f"  ✅ Removed: {f.name}")
            except Exception as e:
                print(f"  ❌ Failed to remove {f.name}: {e}")


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Cleanup duplicate config files')
    parser.add_argument('--execute', action='store_true',
                        help='Actually remove files (default: dry run)')
    args = parser.parse_args()

    cleanup_duplicate_configs(dry_run=not args.execute)
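# A minimal sketch of how the filename fallback behaves (hypothetical names,
# assuming configs are named either as domain slugs with dashes or as raw
# domains with dots):
#
#   normalize_config_filename_to_domain('themarblecoffee-com.yaml')   # -> 'themarblecoffee.com'
#   normalize_config_filename_to_domain('my-shop-com.yaml')           # -> 'my-shop.com'
#   normalize_config_filename_to_domain('themarblecoffee.com.yaml')   # -> 'themarblecoffee.com'
#
# Typical invocations (script file name assumed), run from the directory
# that contains configs/:
#
#   python cleanup_duplicate_configs.py             # dry run: report only
#   python cleanup_duplicate_configs.py --execute   # actually delete duplicates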