"""Platform/CMS/Technology detector for websites."""
import re
from typing import Dict, Optional, List
from urllib.parse import urlparse


class PlatformDetector:
    """Detect platform, CMS, and technology stack of a website.

    Detection is a first-match scan over ``PLATFORM_PATTERNS`` in definition
    order, so more specific platforms must be listed before the more general
    ones they build on (e.g. WooCommerce before WordPress).
    """

    # Platform detection patterns.
    # Each entry maps a platform key to:
    #   indicators - regex fragments searched (case-insensitively) in the
    #                lowercased HTML and URL
    #   name/type/cms/tech/features - values copied into the detect() result
    PLATFORM_PATTERNS = {
        'shopify': {
            'indicators': [
                r'shopify\.com',
                r'cdn\.shopify\.com',
                r'\.myshopify\.com',
                r'shopify-analytics',
                r'shopify\.theme',
                r'Shopify\.checkout',
            ],
            'name': 'Shopify',
            'type': 'ecommerce_platform',
            'cms': 'Shopify',
            'tech': ['Liquid', 'React', 'Vue.js'],
            'features': {
                'collections': True,
                'categories': False,
                'api_available': True,
                'requires_js': True,
            },
        },
        'woocommerce': {
            'indicators': [
                r'woocommerce',
                r'wp-content/plugins/woocommerce',
                # NOTE(review): 'wc-' is a short substring and may match
                # unrelated markup - consider tightening.
                r'wc-',
                r'add-to-cart\.php',
                r'woocommerce-active',
            ],
            'name': 'WooCommerce',
            'type': 'ecommerce_platform',
            'cms': 'WordPress',
            'tech': ['WordPress', 'PHP', 'MySQL'],
            'features': {
                'collections': False,
                'categories': True,
                'api_available': True,
                'requires_js': False,
            },
        },
        'wordpress': {
            'indicators': [
                r'wp-content',
                r'wp-includes',
                r'wordpress',
                r'/wp-admin/',
                r'wp-json',
            ],
            'name': 'WordPress',
            'type': 'cms',
            'cms': 'WordPress',
            'tech': ['WordPress', 'PHP', 'MySQL'],
            'features': {
                'collections': False,
                'categories': True,
                'api_available': True,
                'requires_js': False,
            },
        },
        'magento': {
            'indicators': [
                r'magento',
                r'/static/version',
                r'requirejs-config\.js',
                r'Magento_',
            ],
            'name': 'Magento',
            'type': 'ecommerce_platform',
            'cms': 'Magento',
            'tech': ['PHP', 'MySQL', 'RequireJS'],
            'features': {
                'collections': False,
                'categories': True,
                'api_available': True,
                'requires_js': True,
            },
        },
        'bigcommerce': {
            'indicators': [
                r'bigcommerce\.com',
                r'cdn\.bigcommerce\.com',
                r'stencil\.bigcommerce\.com',
            ],
            'name': 'BigCommerce',
            'type': 'ecommerce_platform',
            'cms': 'BigCommerce',
            'tech': ['Stencil', 'Handlebars'],
            'features': {
                'collections': False,
                'categories': True,
                'api_available': True,
                'requires_js': True,
            },
        },
        'prestashop': {
            'indicators': [
                r'prestashop',
                # NOTE(review): '/modules/' and 'presta' are broad substrings
                # and may match non-PrestaShop sites - consider tightening.
                r'/modules/',
                r'presta',
            ],
            'name': 'PrestaShop',
            'type': 'ecommerce_platform',
            'cms': 'PrestaShop',
            'tech': ['PHP', 'MySQL', 'Smarty'],
            'features': {
                'collections': False,
                'categories': True,
                'api_available': True,
                'requires_js': False,
            },
        },
        'redbubble': {
            # Only brand-specific indicators: generic path fragments such as
            # '/shop/' or '/products/' occur on almost every storefront and,
            # because this entry is scanned last, would label any otherwise
            # unmatched shop as Redbubble with high confidence.
            'indicators': [
                r'redbubble\.com',
                r'redbubble',
            ],
            'name': 'Redbubble',
            'type': 'marketplace',
            'cms': 'Redbubble',
            'tech': ['React', 'JavaScript', 'Node.js'],
            'features': {
                'collections': True,
                'categories': True,
                'api_available': False,
                'requires_js': True,
            },
        },
    }

    def detect(
        self,
        html_content: str,
        url: str,
        headers: Optional[Dict] = None,
    ) -> Dict:
        """
        Detect platform from HTML content and URL.

        Args:
            html_content: HTML content of the page (``None`` is treated as
                an empty document)
            url: Website URL (``None`` is treated as an empty string)
            headers: HTTP headers (optional). Accepted for interface
                compatibility; not currently consulted by the detection logic.

        Returns:
            Dictionary with the keys ``platform``, ``platform_key``, ``type``,
            ``cms``, ``tech_stack``, ``features`` and ``confidence``.
            A known platform yields ``confidence == 'high'``; otherwise a
            generic ``'unknown'`` result with ``confidence == 'low'`` is
            returned, with ``tech_stack`` filled from JavaScript-framework
            heuristics. The returned ``tech_stack`` and ``features`` are
            fresh copies and may be mutated freely by the caller.
        """
        html_lower = (html_content or '').lower()
        url_lower = (url or '').lower()
        haystacks = (html_lower, url_lower)

        # Check each platform (first match wins, in definition order).
        for platform_key, platform_info in self.PLATFORM_PATTERNS.items():
            # IGNORECASE is still needed although the haystacks are
            # lowercased: some indicators contain capitals ('Magento_').
            matched = any(
                re.search(indicator, haystack, re.IGNORECASE)
                for indicator in platform_info['indicators']
                for haystack in haystacks
            )
            if matched:
                return {
                    'platform': platform_info['name'],
                    'platform_key': platform_key,
                    'type': platform_info['type'],
                    'cms': platform_info['cms'],
                    # Copy the mutable values so callers cannot corrupt the
                    # shared class-level PLATFORM_PATTERNS table.
                    'tech_stack': list(platform_info['tech']),
                    'features': dict(platform_info['features']),
                    'confidence': 'high',
                }

        # No known platform: fall back to generic indicators.
        detected = {
            'platform': 'Unknown',
            'platform_key': 'unknown',
            'type': 'generic',
            'cms': 'Unknown',
            'tech_stack': [],
            'features': {
                'collections': False,
                'categories': True,
                'api_available': False,
                'requires_js': False,
            },
            'confidence': 'low',
        }

        # Detect JavaScript frameworks.
        # NOTE(review): these are plain substring heuristics ('react' also
        # matches "reaction", 'angular' also matches "rectangular").
        if re.search(r'react|react-dom', html_lower):
            detected['tech_stack'].append('React')
        if re.search(r'vue\.|vuejs', html_lower):
            detected['tech_stack'].append('Vue.js')
        if re.search(r'angular', html_lower):
            detected['tech_stack'].append('Angular')

        # Detect if requires JS. Next.js pages carry markers such as
        # id="__next", __NEXT_DATA__ or /_next/ asset paths; the '__next'
        # prefix also covers the literal '__next__' matched previously.
        if re.search(r'__next|/_next/|next\.js', html_lower):
            detected['tech_stack'].append('Next.js')
            detected['features']['requires_js'] = True

        return detected