"""Website detection and configuration generation for Scrapy.""" import re import yaml from pathlib import Path from typing import Dict, Optional, List, Any from urllib.parse import urlparse from dataclasses import dataclass, field @dataclass class WebsiteConfig: """Suggested configuration for a website.""" name: str type: str # e-commerce, marketplace, etc. selectors: Dict[str, str] rate_limit: float # seconds between requests requires_js: bool headers: Dict[str, str] notes: str extra: Dict[str, Any] = field(default_factory=dict) class WebsiteDetector: """Detect website type and suggest optimal crawling configuration.""" def __init__(self): self.last_generated_slug: Optional[str] = None # Known website patterns WEBSITE_PATTERNS = { 'shopee': { 'pattern': r'shopee\.(vn|sg|my|th|ph|id|tw|br)', 'type': 'marketplace', 'config': { 'name': 'Shopee', 'selectors': { 'product_container': '[data-testid="product-item"]', 'title': '[data-testid="product-title"]', 'price': '[data-testid="product-price"]', 'image': 'img[data-testid="product-image"]', 'link': 'a[data-testid="product-link"]' }, 'rate_limit': 1.5, 'requires_js': True, 'headers': { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Accept': 'application/json', 'Accept-Language': 'vi-VN,vi;q=0.9' }, 'notes': 'Shopee uses API endpoints. Consider using their public API.' } }, 'tiki': { 'pattern': r'tiki\.vn', 'type': 'ecommerce', 'config': { 'name': 'Tiki', 'selectors': { 'product_container': '.product-item, [data-view-id="product_list_item"]', 'title': '.product-title, [data-view-id="product_list_item_title"]', 'price': '.product-price, [data-view-id="product_list_item_price"]', 'image': '.product-image img, [data-view-id="product_list_item_image"] img', 'link': 'a.product-item, a[data-view-id="product_list_item_link"]' }, 'rate_limit': 1.0, 'requires_js': False, 'headers': { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Accept': 'text/html,application/xhtml+xml', 'Accept-Language': 'vi-VN,vi;q=0.9' }, 'notes': 'Tiki has good HTML structure. Can crawl without JS.' } }, 'lazada': { 'pattern': r'lazada\.(vn|sg|my|th|ph|id)', 'type': 'marketplace', 'config': { 'name': 'Lazada', 'selectors': { 'product_container': '[data-qa-locator="product-item"]', 'title': '[data-qa-locator="product-title"]', 'price': '[data-qa-locator="product-price"]', 'image': '[data-qa-locator="product-image"] img', 'link': '[data-qa-locator="product-link"]' }, 'rate_limit': 1.2, 'requires_js': True, 'headers': { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Accept': 'text/html,application/xhtml+xml' }, 'notes': 'Lazada uses dynamic loading. May need Selenium for full content.' } }, 'sendo': { 'pattern': r'sendo\.vn', 'type': 'marketplace', 'config': { 'name': 'Sendo', 'selectors': { 'product_container': '.product-item, .productListItem', 'title': '.product-title, .productName', 'price': '.product-price, .price', 'image': '.product-image img, .productImg img', 'link': 'a.product-item, a.productLink' }, 'rate_limit': 1.0, 'requires_js': False, 'headers': { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Accept': 'text/html,application/xhtml+xml' }, 'notes': 'Sendo has straightforward HTML structure.' 
class WebsiteDetector:
    """Detect website type and suggest optimal crawling configuration."""

    def __init__(self):
        self.last_generated_slug: Optional[str] = None

    # Known website patterns
    WEBSITE_PATTERNS = {
        'shopee': {
            'pattern': r'shopee\.(vn|sg|my|th|ph|id|tw|br)',
            'type': 'marketplace',
            'config': {
                'name': 'Shopee',
                'selectors': {
                    'product_container': '[data-testid="product-item"]',
                    'title': '[data-testid="product-title"]',
                    'price': '[data-testid="product-price"]',
                    'image': 'img[data-testid="product-image"]',
                    'link': 'a[data-testid="product-link"]'
                },
                'rate_limit': 1.5,
                'requires_js': True,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'application/json',
                    'Accept-Language': 'vi-VN,vi;q=0.9'
                },
                'notes': 'Shopee uses API endpoints. Consider using their public API.'
            }
        },
        'tiki': {
            'pattern': r'tiki\.vn',
            'type': 'ecommerce',
            'config': {
                'name': 'Tiki',
                'selectors': {
                    'product_container': '.product-item, [data-view-id="product_list_item"]',
                    'title': '.product-title, [data-view-id="product_list_item_title"]',
                    'price': '.product-price, [data-view-id="product_list_item_price"]',
                    'image': '.product-image img, [data-view-id="product_list_item_image"] img',
                    'link': 'a.product-item, a[data-view-id="product_list_item_link"]'
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml',
                    'Accept-Language': 'vi-VN,vi;q=0.9'
                },
                'notes': 'Tiki has good HTML structure. Can crawl without JS.'
            }
        },
        'lazada': {
            'pattern': r'lazada\.(vn|sg|my|th|ph|id)',
            'type': 'marketplace',
            'config': {
                'name': 'Lazada',
                'selectors': {
                    'product_container': '[data-qa-locator="product-item"]',
                    'title': '[data-qa-locator="product-title"]',
                    'price': '[data-qa-locator="product-price"]',
                    'image': '[data-qa-locator="product-image"] img',
                    'link': '[data-qa-locator="product-link"]'
                },
                'rate_limit': 1.2,
                'requires_js': True,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'Lazada uses dynamic loading. May need Selenium for full content.'
            }
        },
        'sendo': {
            'pattern': r'sendo\.vn',
            'type': 'marketplace',
            'config': {
                'name': 'Sendo',
                'selectors': {
                    'product_container': '.product-item, .productListItem',
                    'title': '.product-title, .productName',
                    'price': '.product-price, .price',
                    'image': '.product-image img, .productImg img',
                    'link': 'a.product-item, a.productLink'
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'Sendo has straightforward HTML structure.'
            }
        },
        'amazon': {
            'pattern': r'amazon\.(com|co\.uk|de|fr|jp|in)',
            'type': 'ecommerce',
            'config': {
                'name': 'Amazon',
                'selectors': {
                    'product_container': '[data-component-type="s-search-result"]',
                    'title': 'h2 a span, .s-title-instructions-style span',
                    'price': '.a-price .a-offscreen, .a-price-whole',
                    'image': '[data-component-type="s-product-image"] img',
                    'link': 'h2 a.a-link-normal'
                },
                'rate_limit': 2.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml',
                    'Accept-Language': 'en-US,en;q=0.9'
                },
                'notes': 'Amazon has strict anti-bot measures. Use proxies and rotate User-Agents.'
            }
        },
        'woocommerce': {
            'pattern': r'.*',
            'type': 'woocommerce',
            'config': {
                'name': 'Generic WooCommerce',
                'selectors': {
                    'product_container': '.product, .type-product, li.product',
                    'title': '.woocommerce-loop-product__title, h2.woocommerce-loop-product__title, .product-title',
                    'price': '.price, .woocommerce-Price-amount',
                    'image': '.wp-post-image, .attachment-woocommerce_thumbnail img',
                    'link': 'a.woocommerce-LoopProduct-link, a.product-link'
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'WooCommerce sites have standard structure. Check for pagination.'
            }
        },
        'shopify': {
            'pattern': r'.*',
            'type': 'shopify',
            'config': {
                'name': 'Generic Shopify',
                'selectors': {
                    'product_container': (
                        '.product-card, .card--product, .product-item, .grid__item--product, '
                        '.product-card-wrapper, .card-wrapper.product-card-wrapper, '
                        'li.product, [class*="product-card"], [class*="product-item"]'
                    ),
                    'title': (
                        '.product-card__title, .card__heading, .product-item__title, '
                        '.product-title, h2.product-title, h3.product-title, '
                        '.card__title, .product-card__name'
                    ),
                    'price': (
                        '.price__current, .price-item--regular, .product-item__price, '
                        '.product-card__price, .price, .product-price, .money'
                    ),
                    'image': (
                        '.product-card__media img, .card__media img, '
                        '.product-item__image img, .product-card__image img, '
                        '.product-image img, .product-card img'
                    ),
                    'link': (
                        '.card-wrapper, .product-card__link, .product-item__link, '
                        'a[href*="/products/"], .product-link'
                    )
                },
                'rate_limit': 1.0,
                'requires_js': False,
                'headers': {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Accept': 'text/html,application/xhtml+xml'
                },
                'notes': 'Shopify sites vary. May need to inspect specific site structure.'
            }
        }
    }

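    # Extending coverage is a data-only change: add one entry to
    # WEBSITE_PATTERNS. A hypothetical entry (not part of this module; the
    # selectors are illustrative and should be verified against the live
    # site) might look like:
    #
    #     'ebay': {
    #         'pattern': r'ebay\.(com|co\.uk|de)',
    #         'type': 'marketplace',
    #         'config': {
    #             'name': 'eBay',
    #             'selectors': {'product_container': '.s-item', 'title': '.s-item__title'},
    #             'rate_limit': 1.5,
    #             'requires_js': False,
    #             'headers': {'User-Agent': 'Mozilla/5.0 ...'},
    #             'notes': 'Illustrative only; inspect the live site before use.'
    #         }
    #     }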
    def detect(self, url: str) -> Optional[WebsiteConfig]:
        """
        Detect website type from a URL and return a suggested configuration.

        Args:
            url: The website URL to detect

        Returns:
            WebsiteConfig if a known pattern matches; a generic fallback
            configuration otherwise
        """
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Special-case Etsy shops (requires a /shop/ path)
        if 'etsy.com' in domain:
            etsy_config = self._detect_etsy_shop(parsed)
            if etsy_config:
                return etsy_config

        # Try specific domain patterns, skipping the catch-all platform entries
        specific_patterns = {
            k: v for k, v in self.WEBSITE_PATTERNS.items()
            if k not in ['woocommerce', 'shopify']
        }
        for site_info in specific_patterns.values():
            if re.search(site_info['pattern'], domain):
                config_data = site_info['config']
                return WebsiteConfig(
                    name=config_data['name'],
                    type=site_info['type'],
                    selectors=config_data['selectors'],
                    rate_limit=config_data['rate_limit'],
                    requires_js=config_data['requires_js'],
                    headers=config_data['headers'],
                    notes=config_data['notes'],
                    extra={}
                )

        # No specific pattern matched - fall back to a generic configuration
        return self._detect_generic_ecommerce(url)

    def _detect_etsy_shop(self, parsed_url) -> Optional[WebsiteConfig]:
        """Detect Etsy shop URLs and return their configuration."""
        path_parts = [part for part in (parsed_url.path or '').split('/') if part]
        if len(path_parts) >= 2 and path_parts[0].lower() == 'shop':
            shop_name = path_parts[1]
            scheme = parsed_url.scheme or 'https'
            netloc = parsed_url.netloc or 'www.etsy.com'
            shop_url = f"{scheme}://{netloc}/shop/{shop_name}"
            rss_url = f"{scheme}://{netloc}/shop/{shop_name}/rss"
            extra = {
                'shop_name': shop_name,
                'shop_path': f"/shop/{shop_name}",
                'shop_url': shop_url,
                'rss_url': rss_url,
                'max_pages': 200  # Etsy shops can span many pages
            }
            return WebsiteConfig(
                name=f"Etsy Shop ({shop_name})",
                type='etsy_shop',
                selectors={
                    'product_container': (
                        'div.js-merch-stash-check-listing.v2-listing-card[data-listing-id], '
                        'div[data-listing-id].v2-listing-card, '
                        'div.js-merch-stash-check-listing[data-listing-id]'
                    ),
                    'title': (
                        'h3.v2-listing-card__title, '
                        'a.listing-link::attr(title), '
                        '.v2-listing-card__title'
                    ),
                    'price': (
                        'div.n-listing-card__price span.currency-value, '
                        '.n-listing-card__price .currency-value, '
                        'span.currency-value'
                    ),
                    'image': (
                        'div.v2-listing-card__img img, '
                        'a.listing-link img[src*="etsystatic.com"], '
                        '.v2-listing-card__img img[src*="etsystatic.com"]'
                    ),
                    'link': (
                        'a.listing-link[href*="/listing/"], '
                        'a[href*="/listing/"][data-listing-link]'
                    ),
                    'description': 'a.listing-link::attr(title)',
                    'sku': (
                        'div.js-merch-stash-check-listing::attr(data-listing-id), '
                        'a.listing-link::attr(data-listing-id), '
                        '[data-listing-id]::attr(data-listing-id)'
                    )
                },
                rate_limit=1.0,  # Slightly slower for HTML pages
                requires_js=False,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                },
                notes='Detected Etsy shop. Crawling from shop page HTML with pagination to get all products.',
                extra=extra
            )
        return None

    def _detect_generic_ecommerce(self, url: str) -> Optional[WebsiteConfig]:
        """Build a generic fallback configuration for unrecognized sites."""
        return WebsiteConfig(
            name='Generic E-commerce',
            type='generic',
            selectors={
                'product_container': '.product, .product-item, [class*="product"]',
                'title': 'h2, h3, .title, .product-title',
                'price': '.price, [class*="price"]',
                'image': 'img',
                'link': 'a'
            },
            rate_limit=1.5,
            requires_js=False,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml'
            },
            notes='Generic detection. You may need to customize selectors for this site.',
            extra={}
        )

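    # Detection precedence, sketched with illustrative URLs (the shop name
    # below is hypothetical): Etsy shop paths are checked first, then the
    # specific domain patterns, and anything else falls through to the
    # generic configuration.
    #
    #     >>> d = WebsiteDetector()
    #     >>> d.detect('https://www.etsy.com/shop/ExampleShop').type
    #     'etsy_shop'
    #     >>> d.detect('https://tiki.vn/').type
    #     'ecommerce'
    #     >>> d.detect('https://example.com/').type
    #     'generic'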
    def suggest_config(self, url: str) -> Dict[str, Any]:
        """
        Suggest a complete configuration for a website.

        Args:
            url: The website URL

        Returns:
            Dictionary with the suggested configuration
        """
        config = self.detect(url)
        if not config:
            return {
                'error': 'Could not detect website type',
                'suggestion': 'Please provide manual configuration'
            }

        return {
            'detected': True,
            'website_name': config.name,
            'website_type': config.type,
            'url': url,
            'selectors': config.selectors,
            'rate_limit_seconds': config.rate_limit,
            'requires_javascript': config.requires_js,
            'suggested_headers': config.headers,
            'notes': config.notes,
            'recommendations': self._get_recommendations(config)
        }

    def _get_recommendations(self, config: WebsiteConfig) -> List[str]:
        """Get recommendations based on the detected website."""
        recommendations = []

        if config.requires_js:
            recommendations.append("Consider using Selenium or Playwright for JavaScript rendering")

        if config.rate_limit > 1.5:
            recommendations.append("This site has strict rate limiting. Use proxies if crawling at scale")

        if 'shopee' in config.name.lower() or 'lazada' in config.name.lower():
            recommendations.append("Check if this site has a public API - it may be more efficient")

        recommendations.append("Always respect robots.txt and implement proper delays")
        recommendations.append("Test with a small sample before running a full crawl")

        return recommendations

    def generate_yaml_config(
        self,
        url: str,
        start_urls: Optional[List[str]] = None,
        output_path: Optional[Path] = None
    ) -> str:
        """
        Generate a YAML configuration file for Scrapy.

        Args:
            url: The website URL
            start_urls: Optional list of start URLs (defaults to url)
            output_path: Optional output path for the config file

        Returns:
            Path to the generated config file, as a string
        """
        config = self.detect(url)
        if not config:
            raise ValueError(f"Could not detect website type for {url}")

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # Build the config dict
        extra = config.extra or {}
        if config.type == 'etsy_shop':
            shop_name = extra.get('shop_name')
            shop_url = extra.get('shop_url') or url
            rss_url = extra.get('rss_url')
            max_pages = extra.get('max_pages', 200)
            yaml_config = {
                'website': {
                    'name': config.name,
                    'base_url': base_url,
                    'type': config.type
                },
                'etsy': {
                    'shop_name': shop_name,
                    'shop_url': shop_url,
                    'rss_url': rss_url,
                    'max_pages': max_pages
                },
                'crawling': {
                    'mode': 'html',  # Crawl the shop page HTML with pagination
                    'start_urls': [shop_url],
                    'pagination': {
                        'enabled': True,
                        'selector': 'a[data-page], a.wt-action-group__item[href*="page="], a[href*="ref=items-pagination"]',
                        'max_pages': max_pages
                    },
                    'follow_links': False
                },
                'selectors': config.selectors,  # HTML selectors from Etsy detection
                'rate_limiting': {
                    'delay': config.rate_limit,
                    'download_delay': config.rate_limit,
                    'concurrent_requests': 1
                },
                'headers': config.headers,
                'features': {
                    'requires_javascript': config.requires_js,
                    'api_endpoints': [],
                    'custom_middleware': []
                },
                'proxies': {
                    'enabled': False,
                    'list': [],
                    'mode': 'rotate'
                },
                'notes': config.notes
            }
        else:
            yaml_config = {
                'website': {
                    'name': config.name,
                    'base_url': base_url,
                    'type': config.type
                },
                'crawling': {
                    'start_urls': start_urls or [url],
                    'pagination': {
                        'enabled': True,
                        'selector': '.next-page a, .pagination a.next, a[rel="next"]',
                        'max_pages': 50
                    },
                    'follow_links': True
                },
                'selectors': {
                    **config.selectors,
                    'description': config.selectors.get(
                        'description', '.description, .product-description, .product-detail'),
                    'category': config.selectors.get(
                        'category', '.breadcrumb a:last-child, .category, .product-category'),
                    'sku': config.selectors.get(
                        'sku', '[data-sku], .sku, [itemprop="sku"]'),
                    'attributes': config.selectors.get(
                        'attributes', '.product-attributes, .attributes, table.specs')
                },
                'rate_limiting': {
                    'delay': config.rate_limit,
                    'download_delay': config.rate_limit,
                    'concurrent_requests': 1
                },
                'headers': config.headers,
                'features': {
                    'requires_javascript': config.requires_js,
                    'api_endpoints': [],
                    'custom_middleware': []
                },
                'proxies': {
                    'enabled': False,
                    'list': [],
                    'mode': 'rotate'  # 'rotate' or 'random'
                },
                'notes': config.notes
            }

        # Determine the output path - derive it from the domain if not specified
        if not output_path:
            # Extract the domain name from the URL (e.g. themarblecoffee.com)
            domain = parsed.netloc
            if domain.startswith('www.'):
                domain = domain[4:]

            # Use the utility function to ensure consistent naming
            from scrapy_project.utils.config_utils import get_config_path_for_domain
            output_path = get_config_path_for_domain(domain)
            domain_slug = output_path.stem

            # Etsy shops get a combined domain-plus-shop slug
            if config.type == 'etsy_shop' and extra.get('shop_name'):
                shop_slug = re.sub(r'[^a-z0-9]+', '-', extra.get('shop_name').lower())
                domain_slug = f"{domain_slug}-{shop_slug}"
                output_path = Path('configs') / f"{domain_slug}.yaml"
        else:
            domain_slug = Path(output_path).stem
            output_path = Path(output_path)

        output_path.parent.mkdir(parents=True, exist_ok=True)

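        # Merge sketch (illustrative values, not read from any real file): if
        # the existing YAML already has crawling: {mode: 'api',
        # crawl_reviews: true}, the merge below keeps both keys while
        # refreshing selectors, headers, and rate limits; a non-'generic'
        # website type and any export.domain_slug are preserved as well.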
        # If the config file already exists, merge with it instead of overwriting
        existing_config = {}
        if output_path.exists():
            try:
                with open(output_path, 'r', encoding='utf-8') as f:
                    existing_config = yaml.safe_load(f) or {}

                # New values take precedence for basic fields, but important
                # existing settings (mode, crawl_reviews, etc.) are preserved.
                if 'crawling' in existing_config:
                    existing_crawling = existing_config.get('crawling', {})
                    new_crawling = yaml_config.get('crawling', {})
                    merged_crawling = {**existing_crawling, **new_crawling}
                    for key in ['mode', 'crawl_reviews']:
                        if key in existing_crawling:
                            merged_crawling[key] = existing_crawling[key]
                    yaml_config['crawling'] = merged_crawling

                # Preserve the website type if it was already set to something specific
                if 'website' in existing_config and 'type' in existing_config['website']:
                    existing_type = existing_config['website'].get('type')
                    if existing_type and existing_type != 'generic':
                        yaml_config['website']['type'] = existing_type

                # Preserve the export domain_slug
                if 'export' in existing_config and 'domain_slug' in existing_config['export']:
                    yaml_config.setdefault('export', {})
                    yaml_config['export']['domain_slug'] = existing_config['export']['domain_slug']
                    domain_slug = existing_config['export']['domain_slug']
            except Exception as e:
                # If the merge fails, log a warning and continue with the new config
                import logging
                logger = logging.getLogger(__name__)
                logger.warning(f"Failed to merge existing config: {e}. Using new config.")

        yaml_config.setdefault('export', {})
        yaml_config['export']['domain_slug'] = domain_slug
        self.last_generated_slug = domain_slug

        # Write the YAML file (merged or new)
        with open(output_path, 'w', encoding='utf-8') as f:
            yaml.dump(yaml_config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        return str(output_path)
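

if __name__ == '__main__':
    # Minimal usage sketch. The URL is illustrative, and generate_yaml_config
    # assumes the scrapy_project package (get_config_path_for_domain) is
    # importable; suggest_config works standalone either way.
    detector = WebsiteDetector()

    suggestion = detector.suggest_config('https://tiki.vn/')
    print(suggestion['website_name'], '-', suggestion['website_type'])
    for tip in suggestion['recommendations']:
        print(' *', tip)

    try:
        config_path = detector.generate_yaml_config('https://tiki.vn/')
        print('Config written to', config_path)
    except ImportError:
        print('scrapy_project utilities not available; skipped YAML generation')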