U
    84iH_                     @   sx   d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
mZ ddlmZmZ eG dd dZG d	d
 d
ZdS )z:Website detection and configuration generation for Scrapy.    N)Path)DictOptionalListAny)urlparse)	dataclassfieldc                   @   st   e Zd ZU dZeed< eed< eeef ed< eed< eed< eeef ed< eed< e	e
d	Zeeef ed
< dS )WebsiteConfigz&Suggested configuration for a website.nametype	selectors
rate_limitrequires_jsheadersnotes)default_factoryextraN)__name__
__module____qualname____doc__str__annotations__r   floatboolr	   dictr   r    r   r   "./scrapy_project/utils/detector.pyr
      s   
r
   c                   @   s  e Zd ZdZdd Zdddddd	d
dddddddddddddddddddddddd ddd!ddd"dd#d$d%d&d'd(dd)ddd d*d+ddd,dd-d.d/d0d1d2ddddd d*d3ddd4dd5d6d7d8d9d:dd;ddd d<dd=ddd>d?d@dAdBdCdDdEddddd d*dFddd>dGdHdIdJdKdLdMddddd d*dNdddOZeee dPdQdRZ	dSdT Z
eee dPdUdVZeedPdWdXZeee dYdZd[Zd`eeee  ee ed]d^d_Zd\S )aWebsiteDetectorz?Detect website type and suggest optimal crawling configuration.c                 C   s
   d | _ d S )N)last_generated_slug)selfr   r   r   __init__   s    zWebsiteDetector.__init__z!shopee\.(vn|sg|my|th|ph|id|tw|br)ZmarketplaceZShopeez[data-testid="product-item"]z[data-testid="product-title"]z[data-testid="product-price"]z img[data-testid="product-image"]za[data-testid="product-link"]product_containertitlepriceimagelink      ?T<Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36zapplication/jsonzvi-VN,vi;q=0.9)
User-AgentAcceptzAccept-Languagez;Shopee uses API endpoints. Consider using their public API.)r   r   r   r   r   r   )patternr   configztiki\.vnZ	ecommerceZTikiz1.product-item, [data-view-id="product_list_item"]z8.product-title, [data-view-id="product_list_item_title"]z8.product-price, [data-view-id="product_list_item_price"]z@.product-image img, [data-view-id="product_list_item_image"] imgz8a.product-item, a[data-view-id="product_list_item_link"]      ?Ftext/html,application/xhtml+xmlz3Tiki has good HTML structure. Can crawl without JS.zlazada\.(vn|sg|my|th|ph|id)ZLazadaz [data-qa-locator="product-item"]z![data-qa-locator="product-title"]z![data-qa-locator="product-price"]z%[data-qa-locator="product-image"] imgz [data-qa-locator="product-link"]g333333?r+   r,   z@Lazada uses dynamic loading. May need Selenium for full content.z	sendo\.vnZSendoz.product-item, .productListItemz.product-title, .productNamez.product-price, .pricez#.product-image img, .productImg imgza.product-item, a.productLinkz)Sendo has straightforward HTML structure.z amazon\.(com|co\.uk|de|fr|jp|in)ZAmazonz'[data-component-type="s-search-result"]z+h2 a span, .s-title-instructions-style spanz%.a-price .a-offscreen, .a-price-wholez+[data-component-type="s-product-image"] imgzh2 a.a-link-normalg       @zen-US,en;q=0.9zHAmazon has strict anti-bot measures. Use proxies and rotate User-Agents.z.*woocommercezGeneric WooCommercez#.product, .type-product, li.productzT.woocommerce-loop-product__title, h2.woocommerce-loop-product__title, .product-titlez!.price, .woocommerce-Price-amountz5.wp-post-image, .attachment-woocommerce_thumbnail imgz.a.woocommerce-LoopProduct-link, a.product-linkz@WooCommerce sites have standard structure. Check for pagination.shopifyzGeneric Shopifyz.product-card, .card--product, .product-item, .grid__item--product, .product-card-wrapper, .card-wrapper.product-card-wrapper, li.product, [class*="product-card"], [class*="product-item"]z.product-card__title, .card__heading, .product-item__title, .product-title, h2.product-title, h3.product-title, .card__title, .product-card__namezq.price__current, .price-item--regular, .product-item__price, .product-card__price, .price, .product-price, .moneyz.product-card__media img, .card__media img, .product-item__image img, .product-card__image img, .product-image img, .product-card imgz].card-wrapper, .product-card__link, .product-item__link, a[href*="/products/"], .product-linkz@Shopify sites vary. May need to inspect specific site structure.)shopeeZtikilazadaZsendoZamazonr2   r3   )urlreturnc           	      C   s   t |}|j }d|kr,| |}|r,|S dd | j D }| D ]X\}}t|d |rH|d }t|d |d |d |d	 |d
 |d |d i d  S qH| 	|S )z
        Detect website type from URL and return suggested configuration.
        
        Args:
            url: The website URL to detect
            
        Returns:
            WebsiteConfig if detected, None otherwise
        zetsy.comc                 S   s   i | ]\}}|d kr||qS ))r2   r3   r   ).0kvr   r   r   
<dictcomp>   s       z*WebsiteDetector.detect.<locals>.<dictcomp>r-   r.   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )
r   netloclower_detect_etsy_shopWEBSITE_PATTERNSitemsresearchr
   _detect_generic_ecommerce)	r!   r6   parseddomainZetsy_configZspecific_patternsZsite_keyZ	site_infoZconfig_datar   r   r   detect   s*    


zWebsiteDetector.detectc           	      C   s   dd |j pddD }t|dkr|d  dkr|d }|jpFd	}|jpPd
}| d| d| }| d| d| d}|d| ||dd}td| dddddddddddddddd |d!S d"S )#z/Detect Etsy shop URLs and return configuration.c                 S   s   g | ]}|r|qS r   r   )r8   partr   r   r   
<listcomp>   s      z5WebsiteDetector._detect_etsy_shop.<locals>.<listcomp> /   r   Zshop   httpszwww.etsy.com://z/shop/z/rss   )	shop_nameZ	shop_pathshop_urlrss_url	max_pageszEtsy Shop ()	etsy_shopzdiv.js-merch-stash-check-listing.v2-listing-card[data-listing-id], div[data-listing-id].v2-listing-card, div.js-merch-stash-check-listing[data-listing-id]zOh3.v2-listing-card__title, a.listing-link::attr(title), .v2-listing-card__titlezjdiv.n-listing-card__price span.currency-value, .n-listing-card__price .currency-value, span.currency-valuezydiv.v2-listing-card__img img, a.listing-link img[src*="etsystatic.com"], .v2-listing-card__img img[src*="etsystatic.com"]zJa.listing-link[href*="/listing/"], a[href*="/listing/"][data-listing-link]za.listing-link::attr(title)zdiv.js-merch-stash-check-listing::attr(data-listing-id), a.listing-link::attr(data-listing-id), [data-listing-id]::attr(data-listing-id))r$   r%   r&   r'   r(   descriptionskur/   FzoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36z?text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8r1   zUDetected Etsy shop. Crawling from shop page HTML with pagination to get all products.r<   N)pathsplitlenr>   schemer=   r
   )	r!   Z
parsed_url
path_partsrQ   r\   r=   rR   rS   r   r   r   r   r?      sB    


 ,z!WebsiteDetector._detect_etsy_shopc              
   C   s*   t ddddddddd	d
ddddi dS )z$Detect generic e-commerce platforms.zGeneric E-commercegenericz+.product, .product-item, [class*="product"]zh2, h3, .title, .product-titlez.price, [class*="price"]Zimgar#   r)   Fr*   r0   r1   zEGeneric detection. You may need to customize selectors for this site.r<   )r
   )r!   r6   r   r   r   rD   .  s"    z)WebsiteDetector._detect_generic_ecommercec                 C   sF   |  |}|sdddS d|j|j||j|j|j|j|j| |d
S )z
        Suggest complete configuration for a website.
        
        Args:
            url: The website URL
            
        Returns:
            Dictionary with suggested configuration
        zCould not detect website typez#Please provide manual configuration)errorZ
suggestionT)
ZdetectedZwebsite_nameZwebsite_typer6   r   Zrate_limit_secondsrequires_javascriptZsuggested_headersr   recommendations)	rG   r   r   r   r   r   r   r   _get_recommendations)r!   r6   r.   r   r   r   suggest_configD  s     

zWebsiteDetector.suggest_config)r.   r7   c                 C   sf   g }|j r|d |jdkr(|d d|j ksDd|j krN|d |d |d |S )	z.Get recommendations based on detected website.z>Consider using Selenium or Playwright for JavaScript renderingr)   zDThis site has strict rate limiting. Use proxies if crawling at scaler4   r5   z>Check if this site has a public API - it may be more efficientz5Always respect robots.txt and implement proper delaysz0Test with a small sample first before full crawl)r   appendr   r   r>   )r!   r.   rb   r   r   r   rc   b  s    





z$WebsiteDetector._get_recommendationsN)r6   
start_urlsoutput_pathr7   c                 C   s  |  |}|std| t|}|j d|j }|jp>i }|jdkr|d}|dp`|}	|d}
|dd}|j||jd	||	|
|d
d|	gdd|ddd|j	|j
|j
dd|j|jg g ddg dd|jd	}n|j||jd	|p|gdddddd|j	|j	dd|j	dd|j	dd|j	d d!d"|j
|j
dd|j|jg g ddg dd|jd#}|s|j}|d$r|d%d& }d'd(lm} ||}|j}|jdkr|drtd)d*|d }| d*| }td+| d, }n
t|j}t|}|jjddd- i }| rzt|d.d/d0}t|pTi }W 5 Q R X d1|kr|d1i }|d1i }||}d2D ]}||kr|| ||< q||d1< d3|krd4|d3 kr|d3 d4}|r|d5kr||d3 d4< d6|kr:d7|d6 kr:|d6i  |d6 d7 |d6 d7< |d6 d7 }W nH tk
r } z(d'd&l}|t }|!d8| d9 W 5 d&}~X Y nX |d6i  ||d6 d7< || _"t|d:d/d0}tj#||dddd; W 5 Q R X t$|S )<aB  
        Generate YAML configuration file for Scrapy.
        
        Args:
            url: The website URL
            start_urls: Optional list of start URLs (defaults to url)
            output_path: Optional output path for config file
            
        Returns:
            Path to generated config file
        z"Could not detect website type for rO   rV   rQ   rR   rS   rT   rP   )r   base_urlr   )rQ   rR   rS   rT   htmlTzUa[data-page], a.wt-action-group__item[href*="page="], a[href*="ref=items-pagination"])enabledselectorrT   F)moderf   
paginationfollow_linksrM   )ZdelayZdownload_delayZconcurrent_requests)ra   Zapi_endpointsZcustom_middlewarerotate)rj   listrl   )	websiteetsycrawlingr   rate_limitingr   featuresproxiesr   z/.next-page a, .pagination a.next, a[rel="next"]2   )rf   rm   rn   rW   z3.description, .product-description, .product-detailcategoryz6.breadcrumb a:last-child, .category, .product-categoryrX   z"[data-sku], .sku, [itemprop="sku"]
attributesz-.product-attributes, .attributes, table.specs)rW   rx   rX   ry   )rq   rs   r   rt   r   ru   rv   r   zwww.   Nr   )get_config_path_for_domainz
[^a-z0-9]+-Zconfigsz.yaml)parentsexist_okrzutf-8)encodingrs   )rl   Zcrawl_reviewsrq   r   r^   exportdomain_slugz!Failed to merge existing config: z. Using new config.w)Zdefault_flow_styleZallow_unicode	sort_keys)%rG   
ValueErrorr   r\   r=   r   r   getr   r   r   r   r   r   
startswithZ!scrapy_project.utils.config_utilsr{   stemrB   subr>   r   parentmkdirexistsopenyaml	safe_load
setdefault	ExceptionloggingZ	getLoggerr   Zwarningr    dumpr   )r!   r6   rf   rg   r.   rE   rh   r   rQ   rR   rS   rT   yaml_configrF   r{   r   Z	shop_slugZexisting_configfZexisting_crawlingZnew_crawlingZmerged_crawlingkeyZexisting_typeer   Zloggerr   r   r   generate_yaml_configt  s    





,
*




$z$WebsiteDetector.generate_yaml_config)NN)r   r   r   r   r"   r@   r   r   r
   rG   r?   rD   r   rd   r   rc   r   r   r   r   r   r   r      s    ,&>  
r   )r   rB   r   pathlibr   typingr   r   r   r   urllib.parser   Zdataclassesr   r	   r
   r   r   r   r   r   <module>   s   