"""Website detection and configuration generation for Scrapy."""

import re
import yaml
from pathlib import Path
from typing import Dict, Optional, List, Any
from urllib.parse import urlparse
from dataclasses import dataclass, field


@dataclass
class WebsiteConfig:
    """Suggested configuration for a website."""
    name: str
    type: str  # e-commerce, marketplace, etc.
    selectors: Dict[str, str]
    rate_limit: float  # seconds between requests
    requires_js: bool
    headers: Dict[str, str]
    notes: str
    extra: Dict[str, Any] = field(default_factory=dict)


class WebsiteDetector:
    """Detect website type and suggest optimal crawling configuration."""
    
    def __init__(self):
        self.last_generated_slug: Optional[str] = None
    
    # Known website patterns
    WEBSITE_PATTERNS = {
        'shopee': {
            'pattern': r'shopee\.(vn|sg|my|th|ph|id|tw|br)',
            'type': 'marketplace',
            'config': {
                'name': 'Shopee',
                'selectors': {
                    'product_container': '[data-testid="product-item"]',
                    'title': '[data-testid="product-title"]',
                    'price': '[data-testid="product-price"]',
                    'image': 'img[data-testid="product-image"]',
                    'link': 'a[data-testid="product-link"]'
                },
                'rate_limit': 1.5,
                'requires_js': True,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'application/json',
                    'Accept-Language': 'vi-VN,vi;q=0.9'
                },
                'notes': 'Shopee uses API endpoints. Consider using their public API.'
            }
        },
        'tiki': {
            'pattern': r'tiki\.vn',
            'type': 'ecommerce',
            'config': {
                'name': 'Tiki',
                'selectors': {
                    'product_container': '.product-item, [data-view-id="product_list_item"]',
                    'title': '.product-title, [data-view-id="product_list_item_title"]',
                    'price': '.product-price, [data-view-id="product_list_item_price"]',
                    'image': '.product-image img, [data-view-id="product_list_item_image"] img',
                    'link': 'a.product-item, a[data-view-id="product_list_item_link"]'
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml',
                    'Accept-Language': 'vi-VN,vi;q=0.9'
                },
                'notes': 'Tiki has good HTML structure. Can crawl without JS.'
            }
        },
        'lazada': {
            'pattern': r'lazada\.(vn|sg|my|th|ph|id)',
            'type': 'marketplace',
            'config': {
                'name': 'Lazada',
                'selectors': {
                    'product_container': '[data-qa-locator="product-item"]',
                    'title': '[data-qa-locator="product-title"]',
                    'price': '[data-qa-locator="product-price"]',
                    'image': '[data-qa-locator="product-image"] img',
                    'link': '[data-qa-locator="product-link"]'
                },
                'rate_limit': 1.2,
                'requires_js': True,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'Lazada uses dynamic loading. May need Selenium for full content.'
            }
        },
        'sendo': {
            'pattern': r'sendo\.vn',
            'type': 'marketplace',
            'config': {
                'name': 'Sendo',
                'selectors': {
                    'product_container': '.product-item, .productListItem',
                    'title': '.product-title, .productName',
                    'price': '.product-price, .price',
                    'image': '.product-image img, .productImg img',
                    'link': 'a.product-item, a.productLink'
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'Sendo has straightforward HTML structure.'
            }
        },
        'amazon': {
            'pattern': r'amazon\.(com|co\.uk|de|fr|jp|in)',
            'type': 'ecommerce',
            'config': {
                'name': 'Amazon',
                'selectors': {
                    'product_container': '[data-component-type="s-search-result"]',
                    'title': 'h2 a span, .s-title-instructions-style span',
                    'price': '.a-price .a-offscreen, .a-price-whole',
                    'image': '[data-component-type="s-product-image"] img',
                    'link': 'h2 a.a-link-normal'
                },
                'rate_limit': 2.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml',
                    'Accept-Language': 'en-US,en;q=0.9'
                },
                'notes': 'Amazon has strict anti-bot measures. Use proxies and rotate User-Agents.'
            }
        },
        'woocommerce': {
            'pattern': r'.*',
            'type': 'woocommerce',
            'config': {
                'name': 'Generic WooCommerce',
                'selectors': {
                    'product_container': '.product, .type-product, li.product',
                    'title': '.woocommerce-loop-product__title, h2.woocommerce-loop-product__title, .product-title',
                    'price': '.price, .woocommerce-Price-amount',
                    'image': '.wp-post-image, .attachment-woocommerce_thumbnail img',
                    'link': 'a.woocommerce-LoopProduct-link, a.product-link'
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'WooCommerce sites have standard structure. Check for pagination.'
            }
        },
        'shopify': {
            'pattern': r'.*',
            'type': 'shopify',
            'config': {
                'name': 'Generic Shopify',
                'selectors': {
                    'product_container': (
                        '.product-card, .card--product, .product-item, .grid__item--product, '
                        '.product-card-wrapper, .card-wrapper.product-card-wrapper, '
                        'li.product, [class*="product-card"], [class*="product-item"]'
                    ),
                    'title': (
                        '.product-card__title, .card__heading, .product-item__title, '
                        '.product-title, h2.product-title, h3.product-title, '
                        '.card__title, .product-card__name'
                    ),
                    'price': (
                        '.price__current, .price-item--regular, .product-item__price, '
                        '.product-card__price, .price, .product-price, .money'
                    ),
                    'image': (
                        '.product-card__media img, .card__media img, '
                        '.product-item__image img, .product-card__image img, '
                        '.product-image img, .product-card img'
                    ),
                    'link': (
                        '.card-wrapper, .product-card__link, .product-item__link, '
                        'a[href*="/products/"], .product-link'
                    )
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'Shopify sites vary. May need to inspect specific site structure.'
            }
        }
    }
    
    def detect(self, url: str) -> Optional[WebsiteConfig]:
        """
        Detect website type from URL and return suggested configuration.
        
        Args:
            url: The website URL to detect
            
        Returns:
            WebsiteConfig if detected, None otherwise
        """
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        
        # Special-case Etsy shops (require /shop/<ShopName>)
        if 'etsy.com' in domain:
            etsy_config = self._detect_etsy_shop(parsed)
            if etsy_config:
                return etsy_config
        
        # Try to match known patterns (skip generic patterns first)
        specific_patterns = {k: v for k, v in self.WEBSITE_PATTERNS.items() if k not in ['woocommerce', 'shopify']}
        for site_key, site_info in specific_patterns.items():
            if re.search(site_info['pattern'], domain):
                config_data = site_info['config']
                return WebsiteConfig(
                    name=config_data['name'],
                    type=site_info['type'],
                    selectors=config_data['selectors'],
                    rate_limit=config_data['rate_limit'],
                    requires_js=config_data['requires_js'],
                    headers=config_data['headers'],
                    notes=config_data['notes'],
                    extra={}
                )
        
        # Generic detection - try to detect WooCommerce or Shopify
        return self._detect_generic_ecommerce(url)
    
    def _detect_etsy_shop(self, parsed_url):
        """Detect Etsy shop URLs and return configuration."""
        path_parts = [part for part in (parsed_url.path or '').split('/') if part]
        if len(path_parts) >= 2 and path_parts[0].lower() == 'shop':
            shop_name = path_parts[1]
            scheme = parsed_url.scheme or 'https'
            netloc = parsed_url.netloc or 'www.etsy.com'
            shop_url = f"{scheme}://{netloc}/shop/{shop_name}"
            rss_url = f"{scheme}://{netloc}/shop/{shop_name}/rss"
            extra = {
                'shop_name': shop_name,
                'shop_path': f"/shop/{shop_name}",
                'shop_url': shop_url,
                'rss_url': rss_url,
                'max_pages': 200  # Etsy shop pages can have many pages
            }
            return WebsiteConfig(
                name=f"Etsy Shop ({shop_name})",
                type='etsy_shop',
                selectors={
                    'product_container': (
                        'div.js-merch-stash-check-listing.v2-listing-card[data-listing-id], '
                        'div[data-listing-id].v2-listing-card, '
                        'div.js-merch-stash-check-listing[data-listing-id]'
                    ),
                    'title': (
                        'h3.v2-listing-card__title, '
                        'a.listing-link::attr(title), '
                        '.v2-listing-card__title'
                    ),
                    'price': (
                        'div.n-listing-card__price span.currency-value, '
                        '.n-listing-card__price .currency-value, '
                        'span.currency-value'
                    ),
                    'image': (
                        'div.v2-listing-card__img img, '
                        'a.listing-link img[src*="etsystatic.com"], '
                        '.v2-listing-card__img img[src*="etsystatic.com"]'
                    ),
                    'link': (
                        'a.listing-link[href*="/listing/"], '
                        'a[href*="/listing/"][data-listing-link]'
                    ),
                    'description': 'a.listing-link::attr(title)',
                    'sku': (
                        'div.js-merch-stash-check-listing::attr(data-listing-id), '
                        'a.listing-link::attr(data-listing-id), '
                        '[data-listing-id]::attr(data-listing-id)'
                    )
                },
                rate_limit=1.0,  # Slightly slower for HTML pages
                requires_js=False,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                },
                notes='Detected Etsy shop. Crawling from shop page HTML with pagination to get all products.',
                extra=extra
            )
        return None
    
    def _detect_generic_ecommerce(self, url: str) -> Optional[WebsiteConfig]:
        """Detect generic e-commerce platforms."""
        return WebsiteConfig(
            name='Generic E-commerce',
            type='generic',
            selectors={
                'product_container': '.product, .product-item, [class*="product"]',
                'title': 'h2, h3, .title, .product-title',
                'price': '.price, [class*="price"]',
                'image': 'img',
                'link': 'a'
            },
            rate_limit=1.5,
            requires_js=False,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml'
            },
            notes='Generic detection. You may need to customize selectors for this site.',
            extra={}
        )
    
    def suggest_config(self, url: str) -> Dict:
        """
        Suggest complete configuration for a website.
        
        Args:
            url: The website URL
            
        Returns:
            Dictionary with suggested configuration
        """
        config = self.detect(url)
        if not config:
            return {
                'error': 'Could not detect website type',
                'suggestion': 'Please provide manual configuration'
            }
        
        return {
            'detected': True,
            'website_name': config.name,
            'website_type': config.type,
            'url': url,
            'selectors': config.selectors,
            'rate_limit_seconds': config.rate_limit,
            'requires_javascript': config.requires_js,
            'suggested_headers': config.headers,
            'notes': config.notes,
            'recommendations': self._get_recommendations(config)
        }
    
    def _get_recommendations(self, config: WebsiteConfig) -> List[str]:
        """Get recommendations based on detected website."""
        recommendations = []
        
        if config.requires_js:
            recommendations.append("Consider using Selenium or Playwright for JavaScript rendering")
        
        if config.rate_limit > 1.5:
            recommendations.append("This site has strict rate limiting. Use proxies if crawling at scale")
        
        if 'shopee' in config.name.lower() or 'lazada' in config.name.lower():
            recommendations.append("Check if this site has a public API - it may be more efficient")
        
        recommendations.append("Always respect robots.txt and implement proper delays")
        recommendations.append("Test with a small sample first before full crawl")
        
        return recommendations
    
    def generate_yaml_config(
        self, 
        url: str, 
        start_urls: Optional[List[str]] = None,
        output_path: Optional[Path] = None
    ) -> str:
        """
        Generate YAML configuration file for Scrapy.
        
        Args:
            url: The website URL
            start_urls: Optional list of start URLs (defaults to url)
            output_path: Optional output path for config file
            
        Returns:
            Path to generated config file
        """
        config = self.detect(url)
        if not config:
            raise ValueError(f"Could not detect website type for {url}")
        
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        
        # Generate config dict
        extra = config.extra or {}
        if config.type == 'etsy_shop':
            shop_name = extra.get('shop_name')
            shop_url = extra.get('shop_url') or url
            rss_url = extra.get('rss_url')
            max_pages = extra.get('max_pages', 200)
            yaml_config = {
                'website': {
                    'name': config.name,
                    'base_url': base_url,
                    'type': config.type
                },
                'etsy': {
                    'shop_name': shop_name,
                    'shop_url': shop_url,
                    'rss_url': rss_url,
                    'max_pages': max_pages
                },
                'crawling': {
                    'mode': 'html',  # Use HTML mode to crawl shop page with pagination
                    'start_urls': [shop_url],
                    'pagination': {
                        'enabled': True,
                        'selector': 'a[data-page], a.wt-action-group__item[href*="page="], a[href*="ref=items-pagination"]',
                        'max_pages': max_pages
                    },
                    'follow_links': False
                },
                'selectors': config.selectors,  # Use HTML selectors instead of empty
                'rate_limiting': {
                    'delay': config.rate_limit,
                    'download_delay': config.rate_limit,
                    'concurrent_requests': 1
                },
                'headers': config.headers,
                'features': {
                    'requires_javascript': config.requires_js,
                    'api_endpoints': [],
                    'custom_middleware': []
                },
                'proxies': {
                    'enabled': False,
                    'list': [],
                    'mode': 'rotate'
                },
                'notes': config.notes
            }
        else:
            yaml_config = {
                'website': {
                    'name': config.name,
                    'base_url': base_url,
                    'type': config.type
                },
                'crawling': {
                    'start_urls': start_urls or [url],
                    'pagination': {
                        'enabled': True,
                        'selector': '.next-page a, .pagination a.next, a[rel="next"]',
                        'max_pages': 50
                    },
                    'follow_links': True
                },
                'selectors': {
                    **config.selectors,
                    'description': config.selectors.get('description', '.description, .product-description, .product-detail'),
                    'category': config.selectors.get('category', '.breadcrumb a:last-child, .category, .product-category'),
                    'sku': config.selectors.get('sku', '[data-sku], .sku, [itemprop="sku"]'),
                    'attributes': config.selectors.get('attributes', '.product-attributes, .attributes, table.specs')
                },
                'rate_limiting': {
                    'delay': config.rate_limit,
                    'download_delay': config.rate_limit,
                    'concurrent_requests': 1
                },
                'headers': config.headers,
                'features': {
                    'requires_javascript': config.requires_js,
                    'api_endpoints': [],
                    'custom_middleware': []
                },
                'proxies': {
                    'enabled': False,
                    'list': [],
                    'mode': 'rotate'  # 'rotate' or 'random'
                },
                'notes': config.notes
            }
        
        # Determine output path - use domain name if not specified
        if not output_path:
            # Extract domain name from URL (e.g., themarblecoffee.com)
            domain = parsed.netloc
            # Remove www. prefix if present
            if domain.startswith('www.'):
                domain = domain[4:]
            
            # Use utility function to ensure consistent naming
            from scrapy_project.utils.config_utils import get_config_path_for_domain
            output_path = get_config_path_for_domain(domain)
            domain_slug = output_path.stem
            
            # Handle Etsy shop special case
            if config.type == 'etsy_shop' and extra.get('shop_name'):
                shop_slug = re.sub(r'[^a-z0-9]+', '-', extra.get('shop_name').lower())
                domain_slug = f"{domain_slug}-{shop_slug}"
                output_path = Path('configs') / f"{domain_slug}.yaml"
        else:
            domain_slug = Path(output_path).stem
        
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Check if config file already exists - merge instead of overwrite
        existing_config = {}
        if output_path.exists():
            try:
                with open(output_path, 'r', encoding='utf-8') as f:
                    existing_config = yaml.safe_load(f) or {}
                # Merge existing config with new config (new config takes precedence for basic fields)
                # But preserve important settings like mode, crawl_reviews, etc.
                if 'crawling' in existing_config:
                    # Preserve existing crawling settings (mode, crawl_reviews, etc.)
                    existing_crawling = existing_config.get('crawling', {})
                    new_crawling = yaml_config.get('crawling', {})
                    # Merge: keep existing important settings, update basic ones
                    merged_crawling = {**existing_crawling, **new_crawling}
                    # But preserve these important settings from existing config
                    for key in ['mode', 'crawl_reviews']:
                        if key in existing_crawling:
                            merged_crawling[key] = existing_crawling[key]
                    yaml_config['crawling'] = merged_crawling
                
                # Preserve website type if already set
                if 'website' in existing_config and 'type' in existing_config['website']:
                    existing_type = existing_config['website'].get('type')
                    if existing_type and existing_type != 'generic':
                        yaml_config['website']['type'] = existing_type
                
                # Preserve export domain_slug
                if 'export' in existing_config and 'domain_slug' in existing_config['export']:
                    yaml_config.setdefault('export', {})
                    yaml_config['export']['domain_slug'] = existing_config['export']['domain_slug']
                    domain_slug = existing_config['export']['domain_slug']
            except Exception as e:
                # If merge fails, log warning but continue with new config
                import logging
                logger = logging.getLogger(__name__)
                logger.warning(f"Failed to merge existing config: {e}. Using new config.")
        
        yaml_config.setdefault('export', {})
        yaml_config['export']['domain_slug'] = domain_slug
        self.last_generated_slug = domain_slug
        
        # Write YAML file (merged or new)
        with open(output_path, 'w', encoding='utf-8') as f:
            yaml.dump(yaml_config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        
        return str(output_path)