"""Scrapy items for product data.""" import scrapy from typing import Optional, List, Dict from datetime import datetime from pydantic import BaseModel, HttpUrl, Field, field_validator class ProductItem(scrapy.Item): """Scrapy item for product data (raw dict format).""" # Basic information name = scrapy.Field() price = scrapy.Field() original_price = scrapy.Field() currency = scrapy.Field() description = scrapy.Field() images = scrapy.Field() product_url = scrapy.Field() sku = scrapy.Field() category = scrapy.Field() attributes = scrapy.Field() # Product attributes (size, color, etc.) availability = scrapy.Field() stock_status = scrapy.Field() rating = scrapy.Field() review_count = scrapy.Field() # Reviews (list of dicts with: author, rating, date, content, images, variant) # Each review dict can contain: author, rating, date, content, images (list of image URLs), variant (optional product variant info like "Navy / M") reviews = scrapy.Field() # Metadata source_url = scrapy.Field() crawled_at = scrapy.Field() class Product(BaseModel): """Pydantic model for validated product data.""" name: str = Field(..., min_length=1, description="Product name") price: Optional[float] = Field(None, ge=0, description="Product price") original_price: Optional[float] = Field(None, ge=0, description="Original price if on sale") currency: str = Field(default="VND", description="Currency code") description: Optional[str] = Field(None, description="Product description") images: List[str] = Field(default_factory=list, description="List of image URLs") product_url: Optional[str] = Field(None, description="Product page URL") sku: Optional[str] = Field(None, description="Product SKU/ID") category: Optional[str] = Field(None, description="Product category") attributes: Optional[Dict[str, str]] = Field(None, description="Product attributes (size, color, etc.)") availability: Optional[str] = Field(None, description="Availability status") stock_status: Optional[str] = Field(None, description="Stock status (in_stock, out_of_stock)") rating: Optional[float] = Field(None, ge=0, le=5, description="Product rating (0-5)") review_count: Optional[int] = Field(None, ge=0, description="Number of reviews") reviews: List[Dict] = Field(default_factory=list, description="List of product reviews. Each review is a dict with: author, rating, date, content, images (optional list of image URLs)") # Metadata source_url: Optional[str] = Field(None, description="Source website URL") crawled_at: datetime = Field(default_factory=datetime.now, description="Crawl timestamp") @field_validator('images') @classmethod def validate_images(cls, v: List[str]) -> List[str]: """Ensure image URLs are valid.""" return [img.strip() for img in v if img and img.strip()] @field_validator('name') @classmethod def validate_name(cls, v: str) -> str: """Clean product name.""" return v.strip() if v else "" @classmethod def from_scrapy_item(cls, item: ProductItem) -> 'Product': """Create Product from Scrapy item.""" data = dict(item) # Convert images to list if it's a string if isinstance(data.get('images'), str): data['images'] = [data['images']] if data['images'] else [] elif not isinstance(data.get('images'), list): data['images'] = [] return cls(**data) def to_woocommerce_csv_row(self) -> dict: """Convert to WooCommerce CSV format.""" return { 'Type': 'simple', 'SKU': self.sku or '', 'Name': self.name, 'Published': 1, 'Is featured?': 0, 'Visibility in catalog': 'visible', 'Short description': (self.description or '')[:200], 'Description': self.description or '', 'Date sale price starts': '', 'Date sale price ends': '', 'Tax status': 'taxable', 'Tax class': '', 'In stock?': 1 if self.stock_status == 'in_stock' else 0, 'Stock': '', 'Low stock amount': '', 'Backorders allowed?': 0, 'Sold individually?': 0, 'Weight (kg)': '', 'Length (cm)': '', 'Width (cm)': '', 'Height (cm)': '', 'Allow customer reviews?': 1, 'Purchase note': '', 'Sale price': self.original_price if self.original_price and self.price else '', 'Regular price': self.price or '', 'Categories': self.category or '', 'Tags': '', 'Shipping class': '', 'Images': '|'.join(self.images) if self.images else '', 'Download limit': '', 'Download expiry days': '', 'Parent': '', 'Grouped products': '', 'Upsells': '', 'Cross-sells': '', 'External URL': '', 'Button text': '', 'Position': 0 } def to_shopify_csv_row(self) -> dict: """Convert to Shopify CSV format (single row, backward compatible).""" return self.to_shopify_csv_rows()[0] if self.to_shopify_csv_rows() else {} def to_shopify_csv_rows(self) -> List[dict]: """ Convert to Shopify CSV format with multiple rows (one per image). Returns: List of dicts, each representing a row in Shopify CSV format. Each row has the same product data, but different Image Src and Image Position. """ handle = (self.sku or self.name.lower().replace(' ', '-'))[:255] # Base row data (same for all rows) base_row = { 'Handle': handle, 'Title': self.name[:255], 'Body (HTML)': self.description or '', 'Vendor': '', 'Type': self.category or '', 'Tags': '', 'Published': 'TRUE', 'Option1 Name': 'Title', 'Option1 Value': 'Default Title', 'Option2 Name': '', 'Option2 Value': '', 'Option3 Name': '', 'Option3 Value': '', 'Variant SKU': self.sku or '', 'Variant Grams': '', 'Variant Inventory Tracker': 'shopify', 'Variant Inventory Qty': '', 'Variant Inventory Policy': 'deny', 'Variant Fulfillment Service': 'manual', 'Variant Price': self.price or '', 'Variant Compare At Price': self.original_price or '', 'Variant Requires Shipping': 'TRUE', 'Variant Taxable': 'TRUE', 'Variant Barcode': '', 'Image Alt Text': self.name, 'Gift Card': 'FALSE', 'SEO Title': self.name, 'SEO Description': (self.description[:160] if self.description else ''), 'Google Shopping / Google Product Category': '', 'Google Shopping / Gender': '', 'Google Shopping / Age Group': '', 'Google Shopping / MPN': '', 'Google Shopping / AdWords Grouping': '', 'Google Shopping / AdWords Labels': '', 'Google Shopping / Condition': 'new', 'Google Shopping / Custom Product': 'FALSE', 'Google Shopping / Custom Label 0': '', 'Google Shopping / Custom Label 1': '', 'Google Shopping / Custom Label 2': '', 'Google Shopping / Custom Label 3': '', 'Google Shopping / Custom Label 4': '', 'Variant Image': '', 'Variant Weight Unit': 'kg', 'Variant Tax Code': '', 'Cost per item': '', 'Included / Vietnam': 'TRUE', 'Status': 'active' } # Create one row per image rows = [] if self.images: for position, image_url in enumerate(self.images, start=1): row = base_row.copy() row['Image Src'] = image_url row['Image Position'] = position rows.append(row) else: # If no images, create one row without image row = base_row.copy() row['Image Src'] = '' row['Image Position'] = 1 rows.append(row) return rows