U
    TDi                    @   s   d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
Z
ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ G dd dejZdS )z+Generic spider with configurable selectors.    Nunescape)ListOptional)urljoinurlparse)Request)Selector)get_project_settingsdatetime)ProductItemShopItem)PlatformDetectorc                       s  e Zd ZdZdZdUee ee d fddZee	ddd	Z
d
d Zeeee dddZee	dddZdd Zdd Zdd Zdd Zdd Zee ee dddZdd  Zd!d" Zeee d#d$d%Zee d&d'd(Zd)d* Zd+d, Zd-d. Zd/d0 Zed&d1d2Zed&d3d4Zd5d6 Z e!d&d7d8Z"d9d: Z#eed;d<d=Z$eee% d>d?d@Z&eedAdBdCZ'eedDdEdFZ(ed&dGdHZ)dIdJ Z*dKdL Z+eee dMdNdOZ,e	ee dPdQdRZ-e	edPdSdTZ.  Z/S )VGenericSpiderzAGeneric spider that uses configurable selectors from YAML config.ZgenericN)config_filestart_urls_overridec                    s   t  j|| || _|r"| |ni | _|| _d| _t 	d| _
d| _d| _i | _d| _d| _d| _d| _|   t }|dd| _dS )z
        Initialize generic spider.
        
        Args:
            config_file: Path to YAML configuration file
            start_urls_override: Optional list of start URLs to override config
        Fz%Y%m%d-%H%M%SNhtml   Z
USER_AGENTz+Mozilla/5.0 (compatible; GenericSpider/1.0))super__init__r   _load_configconfigr   _shopify_selectors_appliedr   nowstrftimeZrun_timestampexport_domainmodeetsy_configis_etsy_rssetsy_max_pagesetsy_rss_urlcrawl_reviews_setup_from_configr
   get
user_agent)selfr   r   argskwargsZsettings	__class__ (scrapy_project/spiders/generic_spider.pyr      s     zGenericSpider.__init__)r   returnc              
   C   sb   ddl }ddlm} ||}| s2td| t|ddd}||W  5 Q R  S Q R X dS )z"Load configuration from YAML file.r   N)PathzConfig file not found: rutf-8)encoding)yamlpathlibr.   existsFileNotFoundErroropenZ	safe_load)r&   r   r2   r.   Zconfig_pathfr+   r+   r,   r   2   s    zGenericSpider._load_configc              	   C   s  | j s
dS | j di }|dr,|d | _| j di dd| _| jdkrTd| _| jd	k| _| jr| j d
i pvi | _| jd| _zt| jdd| _	W n t
tfk
r   d| _	Y nX | jr| j| _nDd| j krd| j d kr| j d d | _n| jr| jr| jg| _d| j krBd| j d krB| j d d | _n2| jrnt| jd }|j d|j | _nd| _| j di dd| _| j di | _| j di di | _| jdd| _| jdd| _| jdd| _d| _t | _t | _| j di dd }|dk| _| jr:| jd |    t | _!| j di pTi }| j d i phi }t"|d!dp|d!d| _#zt|d"d#| _$W n  t
tfk
r   d#| _$Y nX d| _%t | _&dS )$z Setup spider from configuration.NZexportZdomain_slugcrawlingr   r   shopify_apiTZetsy_rssZetsyrss_url	max_pages
   
start_urlsZwebsitebase_urlr   z:// r"   F	selectorsZ
paginationenabledselectorz .next-page a, .pagination a.next2   r   typeshopifyz8Shopify mode forced from config (website.type = shopify)Zfeatures
crawl_shopshop_max_pages   )'r   r$   r   r   
is_shopifyr   r   r!   intr    	TypeError
ValueErrorr   r=   r>   r   Zschemenetlocr"   r@   Zpagination_configpagination_enabledpagination_selectorr;   current_pagesetvisited_urlsr   platform_detectorlowerloggerinfo!_apply_shopify_fallback_selectorscollections_crawledboolrF   rG   _shop_scheduled_shop_urls_scheduled)r&   Zexport_settingsparsedZwebsite_typeZcrawling_cfgZfeatures_cfgr+   r+   r,   r#   >   sh    





 z GenericSpider._setup_from_config)basehrefr-   c                 C   s@   |sdS |  }|r|dr"dS t||}|ddd }|S )z8Normalize and absolutize a URL; returns None if invalid.N)javascript:data:#r   r   )strip
startswithr   split)r&   r]   r^   fullr+   r+   r,   _normalize_url   s    
zGenericSpider._normalize_url)	page_typer-   c           $   	      s  i }| d pd | d p(d | d p<d | d pPd | d pdd d}dd	 | D |d
< |dp|d}|r| |d< |dp|d}|r| |d< | d pd }|r| |j|p||d< g }| d pg }	d}
|	D ]H}| |j|}|s8q|  t fdd|
D r|	| q|rt
 }g }|D ](}||krqz|| |	| qz|dd |d< g }g }|	D ]}|sΐq| }| dr |ddd ddd  }|r ||kr |	| | dr|ddd ddd  }|r||kr|	| q|r|dd |d< |r|dd |d < g }| d! pg D ]T}|pd }|sΐqzt|}|	| W n tk
r   Y qY nX q|rJ|dd" |d#< fd$d%|D ]}|d&pL|d'}t|trl|rh|d nd}|r~t| nd}|d(kr2|dr|dst|d |d< |d)r|d*st|d) |d*< |dr|dst|d |d< |d+rn|d+}t|tr>|g}t|trn|dg  |d d,d- |D  |d.r|dg  |d 	t|d.  |d/r|d g  |d  	t|d/  |d0}t|tr2g }d1D ]*}||r|	t||  q|r2|d2g  |d2 	d3| q2d4D ]}||krN||pjg } t| tr~| g} t| trNg }!t
 }| D ]D}"|"sqt|" }#|#r|#|krƐq||# |!	|# q|!dd5 ||< qN||d6< |j|d7< |S )8z;Extract best-effort shop/site signals from a page response.ztitle::textr?   z'meta[name="description"]::attr(content)z,meta[property="og:site_name"]::attr(content)z.meta[property="og:description"]::attr(content)z!link[rel="canonical"]::attr(href))titledescriptionog_site_nameog_descriptionZ	canonicalc                 S   s   i | ]\}}|r||qS r+   r+   .0kvr+   r+   r,   
<dictcomp>   s       z4GenericSpider._extract_shop_info.<locals>.<dictcomp>metarj   rh   nameri   rk   z(meta[property="og:image"]::attr(content)logoa::attr(href))	facebook.cominstagram.comz
tiktok.comzyoutube.comzyoutu.bepinterest.comtwitter.comzx.comzlinkedin.comc                 3   s   | ]}| kV  qd S Nr+   )rm   drT   r+   r,   	<genexpr>   s     z3GenericSpider._extract_shop_info.<locals>.<genexpr>NrC   social_linksmailto::r   ?r   ztel:emailsphones(script[type="application/ld+json"]::text   Zjsonldc                 3   sT   t | tr$| D ]} |E d H  qn,t | trP| V  |  D ]} |E d H  q<d S ry   )
isinstancelistdictvalues)nodexro   )walkr+   r,   r      s    

z.GenericSpider._extract_shop_info.<locals>.walk@typez	["@type"])ZorganizationZlocalbusinessstoreurlr>   ZsameAsc                 S   s   g | ]}t |tr|qS r+   )r   str)rm   sr+   r+   r,   
<listcomp>  s     
 z4GenericSpider._extract_shop_info.<locals>.<listcomp>emailZ	telephoneZaddress)ZstreetAddressZaddressLocalityZaddressRegionZ
postalCodeZaddressCountry	addresses, )r   r   r}   r   d   rg   page_url)cssr$   rb   itemsrf   r   getallrT   anyappendrQ   addrc   rd   jsonloads	Exceptionr   r   r   
setdefaultextendr   join)$r&   responserg   	extractedrq   rr   ri   Zog_imageZsocialshrefsZsocial_domainsr^   re   seenZdedupedr   r   r   hr   ZphoneZjsonld_blocksrawr\   objtZt_strZsame_asZaddrpartsrn   keyZvalscleanedro   Zvvr+   )rT   r   r,   _extract_shop_info   s    

 
 
	





z GenericSpider._extract_shop_infoc                 #   s`  t | j| jkrdS dddddddd	d
dddddddddddddg}|d pTg }|D ]6}|sdqZ|  tfdddD rZ|| qZ|D ]}t | j| jkr q\| 	| j
p|j|  sqz(t| j
p|jj}t j|krW qW n tk
r   Y nX t fdddD r$q | jkr2q| j  t | j| jddd id!V  qdS )"z>Schedule a limited set of likely shop/info pages for crawling.N/z/aboutz	/about-usz/pages/aboutz/pages/about-usz/contactz/contact-usz/pages/contactz/pages/contact-usz/faqz
/pages/faqz/pages/helpz/helpz	/shippingz/pages/shippingz/pages/shipping-policyz/returnsz/pages/returnsz/pages/return-policyz/policies/privacy-policyz/policies/refund-policyz/policies/terms-of-servicert   c                 3   s   | ]}| kV  qd S ry   r+   )rm   rn   )r   r+   r,   r|   I  s     z5GenericSpider._schedule_shop_pages.<locals>.<genexpr>)ZaboutZcontactZprivacyZtermsZrefundr-   ZshippingZfaqc                 3   s   | ]}|   kV  qd S ry   r{   )rm   Zseg)re   r+   r,   r|   [  s     )
/products/	/product//collections/z/collection/Tshop_page_type	shop_page)r   callbackerrbackdont_filterrq   )lenr[   rG   r   r   rb   rT   r   r   rf   r>   r   r   rM   r   r   r   parse_shop_pageerrback_httpbin)r&   r   Z
candidatesr   r^   Zbase_netlocr+   )re   r   r,   _schedule_shop_pages5  sp                    
z"GenericSpider._schedule_shop_pagesc              
   c   s   |j dpd}zP| j||d}t| jp,|j}|jdd}td| j||j||t	
 dV  W n< tk
r } z| jd|j d	|  W 5 d
}~X Y nX d
S )z+Parse a shop/info page and emit a ShopItem.r   r   rg   www.r?   shopZ	item_typer>   domainr   rg   r   Z
crawled_atzparse_shop_page failed for : N)rq   r$   r   r   r>   r   rM   replacer   r   r   r   rU   debug)r&   r   rg   r   r\   r   er+   r+   r,   r   j  s     	zGenericSpider.parse_shop_pagec                 c   s   | j r|| jp| jr| jd nd}|s4| jd dS td| jd D ]4}|dkrT|n| d| }t|| j| j	ddV  qDn | jD ]}t|| j
| j	dV  qdS )	zGenerate initial requests.r   Nz0Etsy RSS mode enabled but no RSS URL configured.r   z?page=T)r   r   r   r   r   r   r   )r   r!   r=   rU   errorranger    r   parse_etsy_rssr   parse)r&   r:   pager   r   r+   r+   r,   start_requests}  s    
zGenericSpider.start_requestsc                 #   s  | j r| |S t| ddrt| ddsd| _zP| j|dd}t| jpJ|j}|j	dd}t
d	| j||jd|t d
V  W n4 tk
r } z| jd|  W 5 d}~X Y nX z| |E dH  W n4 tk
r } z| jd|  W 5 d}~X Y nX | jdkr| js| jd z<|  }|rX| jd | j|j |E dH  W dS W nH tk
r } z(| jd| d | js|   W 5 d}~X Y nX | js| jdkr| || _| jrj| jd z<|  }|r| jd | j|j |E dH  W dS W n8 tk
rL } z| jd| d W 5 d}~X Y nX | jd | jsj|   | |}|r| j s| jd |   | jr| |}|D ]F}	|	| jkr| j|	 | jd|	  t|	| jd| j dV  q|!| j"#dd}
t$|
}
t%|
}| &|
}|r.|}
n|dkrD| jd |
sj| jd|j d |!d}
|
r~t%|
dkr| jd|j d  |!d!' }t(|dd" D ]`}|rt)|j| t* fd#d$d%D rq | j+kr| j+  t | j| j d&V  qdS | jd't%|
 d(|j  |
D ]}z| ,||j}|r|#d)r|#d*d|#d)dd+d,d-d.d/d0d1d2g}t*fd3d$|D r| jd4  W q:d}rN-d5rW q:t*fd6d$d7D rW q:d8kr
d}nDd9krd}n4d:kr*d}n$d;krNd<krN-d=rNd}|r| j.#d>i #d?drr| j+kr| j+ t| j/d@t0|i| j dAV  nW q:n|V  n4s| jdB dC n| jdB dD  W nD tk
r4 } z$| j1dEt2|  W Y q:W 5 d}~X Y nX q:| j3r| j4| j5k r| 6|}|r|  j4dF7  _4| jdG| j4 dH|  t|| j| j d&V  n| jdI| j4 dJ| j5  n8| j3s| jdK n"| j4| j5kr| jdL| j5  dS )MzParse product list page.rF   FrZ   TZhomepager   r   r?   r   r   z!Homepage shop extraction failed: NzScheduling shop pages failed: r9   z*Shopify API mode enabled - using API crawlzUsing Shopify API for crawlingzShopify API crawl failed: z, falling back to HTMLz)Shopify detected - trying API crawl firstz)Falling back to HTML crawling for Shopifyz,Etsy shop detected - applying Etsy selectorszFound collection: )r   r   r   r   product_containerz.productr   zProduct container filter skipped because no valid links were found in the sampled nodes. Consider refining selectors.product_container for better accuracy.z%No valid product containers found on z/. Selectors may need adjustment for this theme.z+.product-item, .product, [class*="product"]zNo products found on z(, checking for category/product links...zVa[href*="/product"], a[href*=".html"], a.product-item, a[class*="product"]::attr(href)r   c                 3   s   | ]}|   kV  qd S ry   r{   )rm   Zexcludefull_urlr+   r,   r|     s     z&GenericSpider.parse.<locals>.<genexpr>)z	/checkoutz/cartz/accountz/loginz	/registerz/searchra   r_   r   Found  products on rr   product_urlu   các sản phẩm kháczother productszview allzsee allzshop allzall productszno products foundzno productsc                 3   s   | ]}|   kV  qd S ry   r{   rm   keyword)product_namer+   r,   r|   $  s     zSkipping invalid product name: )r_   r`   r~   c                 3   s   | ]}|   kV  qd S ry   r{   rm   extr   r+   r,   r|   0  s     )z.jpgz.jpegz.pngz.gifz.svgz.webpr   	/listing/r   r   z
/category/)zhttp://zhttps://r8   Zfollow_linksproduct)r   r   rq   r   zSkipping product 'z' - no product_urlz' - invalid URL: Error extracting product: r   zFollowing pagination to page r   z&No next page URL found. Current page: z, Max pages: zPagination is disabledzReached max pages limit: )7r   r   getattrrZ   r   r   r>   r   rM   r   r   r   r   r   rU   r   r   r   rX   rV   crawl_shopify_apir   warningr   rW   _detect_shopifyrI   _detect_etsy_apply_etsy_fallback_selectors_extract_collection_urlsr   r   r   r   r@   r$   r   r   _filter_product_containersr   rQ   r   r   rR   _extract_productrc   r   parse_product_detailr   r   r   rN   rP   r;   _get_next_page_url)r&   r   r   r\   r   r   Zapi_generatorZis_etsycollection_urlsZcollection_urlZproduct_containersZraw_container_countZfiltered_containersZpotential_linkslink	containerr   Zcategory_keywordsZis_valid_productZnext_page_urlr+   )r   r   r   r,   r     s<   
	$$



&










 

zGenericSpider.parsec              	   c   s  | d}|s&| jd|j  dS | jdt| d|j  |D ]}| d }| d }| d pzd	}t|}t|d
}|	d }	d	
|	d  }
|
s| d pd	}
d}d}|
r.td|
}|r.|ddd	}zt|}W n tk
r"   d}Y nX |d}| d }|sP| d }t }|rd| nd	|d< ||d< |p|d|d< |pd	 |d< |	r|	gng |d< ||d< | ||d< d|d< d|d< |j|d< d|d < d|d!< |V  qHdS )"z*Parse Etsy RSS feed entries into products.z//channel/itemzNo Etsy RSS items found on NzProcessing z Etsy RSS items from ztitle/text()zlink/text()zdescription/text()r?   textimg::attr(src)z.price::textz&string(//p[contains(@class, "price")])z([\d\.,]+)\s*([A-Z]{3})r   ,   z,string(//p[contains(@class, "description")])zstring()rr   priceZUSDcurrencyri   imagesr   skucategory
attributes
source_urlin_stockavailabilitystock_status)xpathrU   r   r   rV   r   r$   r   r	   r   r   r   rb   researchgroupr   floatrL   r   _extract_etsy_listing_id)r&   r   r   itemrh   r   Zdescription_rawZdescription_htmldesc_selector	image_url
price_textZprice_valuer   matchZnumber_partZdescription_textr   r+   r+   r,   r   t  sV    





zGenericSpider.parse_etsy_rss)r   r-   c                 C   s&   |sd S t d|}|r"|dS d S )Nz/listing/(\d+)r   )r   r   r   )r&   r   r  r+   r+   r,   r     s    
z&GenericSpider._extract_etsy_listing_idc           <      #   s  |j di }t|tr"t|}n|}| j}d}|d }|sP|d }|rn| |}|rn||d< d}|d }|r| |d< |sf|dd	}	|		d
D ]}
|

 }
d|
krqz`d|
ksd|
kr||
 }n||
d  }|r| |}|r||d< d}W  qfW q tk
rb } z$| jd|
 d|  W Y qW 5 d}~X Y qX q|dsL|dd}|	d
D ]}
|

 }
zld||
d  }|r|
 r| ||d< W  qL||
 }|r| ||d< W  qLW nF tk
rF } z&| jd|
 d|  W Y qW 5 d}~X Y nX q|dp\|j}d}|rz`||}|jd	dd }|	dd 	dd }|dr|dd }|r| }W n< tk
r } z| jd| d|  W 5 d}~X Y nX g }| jr<d d!d"d#d$d%d&d'd(d)d*d+d,g}| ||}|s\|d-d.}| ||g}|s|d/ }|s|d0 }g }|rt }|D ]}|r|d1rq|d2rd3| }n&|drt|j|}nt|j|}dd4lm}m} ||}|j}|  d5d6d7d8d9d:d;d<d=g	}t  fd>d?|D rHq||krh|!| |"| nd} t#|D ](\}!}"||"}#|#j|krt|!}  qqt| dk	r||j$}$||||  j$}%d@|$krt%|$d@dAgd nd}&d@|%krt%|%d@dAgd nd}'|&|'kr||| < q|r| jr|rg }(|&dBdC})|D ]B}||}*|*j   &dBdC}+| ks|)|+krJ|("| qJ|(r|(}|s| '|},|,r|,}|s| (|}-|-r|-}|r||dD< |dEsR|dEdF}.|.	d
D ]T}
|

 }
dG|
kr"|dH }/n||
d  }/|/r| |/|dE<  qRq|dIs |dIdJ}0|dK }1|1r| |1|dI< n|dL }2|2rt)|2D ]T}3|3r| |3}3dMdNdOdPdQg}4|3r|3 |4krt*|3dRkr|3|dI<  qq|dIs|0	d
D ]}
|

 }
d|
ksdS|
 ksdT|
 krNq||
d  }5|5r| |5}5dMdNdOdPdQg}4|5r|5 |4kr|5|dI<  qq|dIs dU|jkr t+,dV|j}6|6r |6-dW}7|dX|7 dY|7 dZ }8|8r| |8|dI< n|7&dBd. |dI< | /||}9|9r:|9|d[< | j0rz>| 1|}:|:r~|:|d\< | j2d]t*|: d^|d_  W n> tk
r }; z| j3d`|j d|;  W 5 d};~;X Y nX |V  dS )azParse product detail page.r   Fz1//meta[@property="product:price:amount"]/@contentz"//meta[@itemprop="price"]/@contentr   Tz3//meta[@property="product:price:currency"]/@contentr   z.price, [class*="price"]r   zmeta[::text::attr(Error with price selector '': Nri   z0#description, .description, .product-description z ::textz!Error with description selector 'r   r   r   r   ra   .htmlz(Unable to extract slug from product URL r   z.product__media imgz.product__media-item imgzmedia-gallery imgzproduct-media imgz.product-media imgz.product-gallery imgz.product-images imgz.media--height imgz[data-media-id] imgz[class*="product-media"] imgz[class*="product-gallery"] imgzimg[data-gallery-thumbnail]z.product__media-wrapper imgimageimgr   img::attr(data-src)r`   //https:)r   parse_qsrs   ZiconZfaviconZbannerheaderZfooterZ
bee_smileyZ
pollinatorZcharityc                 3   s   | ]}| kV  qd S ry   r+   rm   patternZ
path_lowerr+   r,   r|   H  s     z5GenericSpider.parse_product_detail.<locals>.<genexpr>width0-r?   r   r   z0[data-sku], .sku, [itemprop="sku"], .product-skuz
[data-sku]z[data-sku]::attr(data-sku)r   zmeta[property="product:category"], .breadcrumb a:last-child, .category, nav[aria-label*="breadcrumb"] a:last-child, .breadcrumbs a:last-childz-//meta[@property="product:category"]/@contentzPnav[aria-label*="breadcrumb"] a::text, .breadcrumbs a::text, .breadcrumb a::texthomer   productsallZ
collectionr   Z
breadcrumbZbreadcrumbsr   /collections/([^/?#]+)r   za[href*="/collections/z#"]::text, [data-collection-handle="z"]::textr   reviewsz	Captured z reviews from detail page for rr   z+Failed to extract reviews from detail page )4rq   r$   r   r   r   r@   r   _parse_priceupperrd   rb   r   r   rU   r   r   r   _clean_textr   r   pathrstripendswithrT   rI   _extract_images_from_selectorrQ   rc   r   urllib.parser   r  r   r   r   	enumeratequeryrJ   r   _extract_images_from_ld_json_fetch_shopify_product_imagesreversedr   r   r   r   rh   _extract_attributesr"   "_extract_reviews_from_product_pagerV   r   )<r&   r   product_dictr   r@   Zprice_extractedZ
price_metar   Zcurrency_metaprice_selectorselr  r   r   ri   r   Zproduct_slugr   Zparsed_product_urlZslug_candidateZslug_errr   Zshopify_selectorsimage_selectorZnormalized_imagesZ
seen_basesr  img_urlr  r\   	base_pathskip_patternsZexisting_idxidxZexisting_urlZexisting_parsedZ
new_paramsZexisting_paramsZ	new_widthZexisting_widthZslug_filteredZslug_no_dash
parsed_urlZpath_no_dashZ	ld_imagesZshopify_imagesZsku_selectorr   Zcategory_selectorZcategory_metaZbreadcrumb_textsZ	link_textZ
skip_wordsr   Zcollection_matchcollection_slugZcollection_namer   r  Z
review_errr+   r  r,   r     s   



 *






$$






"
&



z"GenericSpider.parse_product_detailc                 C   s@  i }| dd}|r|| d}|D ]<}|d  }|d  }|r(|r(| ||| |< q(|s|| d| d}	|	D ]B}
|
d  }|rd	|kr|d	d
\}}| ||| |< q|s2|d  }|r| ||d< |d  }|r| ||d< |d  }|r2| ||d< |r<|S dS )z/Extract product attributes (size, color, etc.).r   z=.product-attributes, .attributes, .product-specs, table.specsz trz*td:first-child::text, th:first-child::textz(td:last-child::text, th:last-child::textz li, z .attribute-itemr  r   r   z+.size, .product-size, [class*="size"]::textZSizez..color, .product-color, [class*="color"]::textZColorz..brand, .product-brand, [class*="brand"]::textZBrandN)r$   r   r  rd   )r&   r   r@   r   Zattr_selectorZ	attr_rowsrowr   valueZ
attr_itemsr   r   sizeZcolorZbrandr+   r+   r,   r*    s6    z!GenericSpider._extract_attributes)r   r-   c                    s  z| j }|dd}d}|dD ]}| }z@d|ksBd|krR|| }n||d  }|rnW  qW q$ tk
r } z$| jd| d|  W Y q$W 5 d}~X Y q$X q$|s|d	 }|sW dS | |}|d
d}d}	|dD ]}| }z>d|kr|| }	n||d  }	|	r>W  qW q tk
r } z$| jd| d|  W Y qW 5 d}~X Y qX q|	s|d }	|	r| 	|	nd}
|dd}d|dD ]}| }zd|kr||  n||d    rddddddg}t
 fdd|D r:W qt
 fdddD r` W  qn0 drt
 fdddD r W  qW nF tk
r } z&| jd| d|  W Y qW 5 d}~X Y nX qʈsR|d }|D ]Zrddddddg}t
fdd|D r,qt
fd ddD r qRqrވd!rjdntddddd"d#g}t
fd$d|D rdnFt| jt
fd%ddD sdnt
fd&d|D rds(|d' }|D ]0t
fd(dd)D st| j q(q|d*d+}| |d,d- |dD }|sx|d. }|sx|d/ }|r|d0 nd}t }||d1< |
|d
< |r|gng |d2< |d3< ||d4< d5|d6< |W S  tk
r } z | jd7t|  W Y dS d}~X Y nX dS )8z,Extract product data from container element.rh   zh2, h3, .titleNr   r  r  zError with title selector 'r  zBh2::text, h3::text, .product-title::text, .product-item-link::textr   z.pricer  z$.price::text, [class*="price"]::textr   a::attr(href)ru   rx   rw   rv   ZshareZsharerc                 3   s   | ]}|   kV  qd S ry   r{   r  r^   r+   r,   r|   G  s     z1GenericSpider._extract_product.<locals>.<genexpr>c                 3   s   | ]}| kV  qd S ry   r+   r  r;  r+   r,   r|   N  s     )r   r   r   httpc                 3   s   | ]}| kV  qd S ry   r+   r  r;  r+   r,   r|   R  s     zError with link selector 'rt   c                 3   s   | ]}|   kV  qd S ry   r{   r  r   r+   r,   r|   `  s     c                 3   s   | ]}| kV  qd S ry   r+   r  r=  r+   r,   r|   d  s     )r_   r`   r~   ra   /share/sharerc                 3   s   | ]}|   kV  qd S ry   r{   r  r   r+   r,   r|   p  s     c                 3   s   | ]}| kV  qd S ry   r+   r  r   r+   r,   r|   u  s     c                 3   s   | ]}|   kV  qd S ry   r{   r  r   r+   r,   r|   w  s     zpancestor::a[contains(@href, "/products/") or contains(@href, "/listing/") or contains(@href, "/product/")]/@hrefc                 3   s   | ]}|   kV  qd S ry   r{   r  )parentr+   r,   r|     s     )ru   rx   rw   rv   r>  r?  r  r  c                 S   s   g | ]}|  pd qS )r  rb   )rm   r.  r+   r+   r,   r     s     z2GenericSpider._extract_product.<locals>.<listcomp>r   r  r   rr   r   r   r   ZVNDr   r   )r@   r$   rd   rb   r   r   rU   r   r  r  r   rc   r   r   r>   r   r#  r   r   r   )r&   r   r   r@   Ztitle_selectorrr   r.  r   r-  r  r   Zlink_selectorr2  Z	all_linksZparent_linksr/  
image_urlsr   r   r+   )r^   r   r@  r   r,   r     s    




$ 

zGenericSpider._extract_product)r-   c              
   C   s@  |j }d|krd|kr|d }|r|D ]}|r0d|kr0td|tj}|r0t|d}td|tj}|rt|dnd}||kr0t|j |}	| j	
d| d|	  |	  S q0t|d	 }
| j	
d
|
 d|  td|tj}|rt|d}|d }nd}d}|| jkrD| j	
d| j d dS |
dkrp|dkrp| j	
d| d dS |dd }| d| d}	| j	
d|
 d| d| d|	  |	S | jr|| jd  }|rt|j |S td|tj}|r<t|d}|d }tjdd| |tjd}|r<t|j |S dS )z"Get next page URL from pagination.etsy.com/shop/zLa[data-page]::attr(href), a.wt-action-group__item[href*="page="]::attr(href)zpage=z[?&]page=(\d+)r   z/Etsy pagination: Found pagination link to page r   z1div.js-merch-stash-check-listing[data-listing-id]zEtsy pagination check: Found r   r   z*Etsy pagination: Reached max pages limit ()Nr   z+Etsy pagination: No products found on page , stopping paginationr   z?ref=items-pagination&page=z&sort_order=relevancezEtsy pagination: Found z products on page z, generating page z URL: r:  zpage[=\-]?(\d+)zpage[=\-]?\d+flags)r   r   r   r   r   
IGNORECASErJ   r   r   rU   rV   r   r;   rd   rO   r$   sub)r&   r   Zcurrent_urlZpagination_linksr   Z
page_matchZpage_numZcurrent_page_matchZcurrent_page_numnext_urlZproduct_countZnext_page_numr>   Z	next_linkr+   r+   r,   r     sZ    

$z GenericSpider._get_next_page_urlc           	      C   sf   g }|s|S |D ]P}| |}|s$q|D ].}| |}|D ]}|r:||kr:|| q:q(|r qbq|S )zCExtract image URLs from selector supporting lazy-loaded attributes.)r   _extract_images_from_noder   )	r&   rB   Zcss_selectorsrB  r.  nodesr   node_imagesr0  r+   r+   r,   r#    s    

z+GenericSpider._extract_images_from_selectorc           	   	      s  g }t |ds|S |j}dddddddd	d
g	}|D ]}||}|sFq2|d	sZ|drdd |dD }|r|d dd }d|kr|dd}|drd| }n|drt| jpd|}|	  ddddg}t
 fdd|D rq2|| q2|S )zLExtract image URLs from a single img node by inspecting multiple attributes.attribzdata-srczdata-originalz
data-imagezdata-lazy-srczdata-srcsetz
data-bgsetzdata-bgZsrcsetsrcZbgsetc                 S   s   g | ]}|  r|  qS r+   rA  rm   partr+   r+   r,   r     s      z;GenericSpider._extract_images_from_node.<locals>.<listcomp>r   r   r  z{width}Z1024r  r  r   r?   Zajax_loaderZspinnerZplaceholderloaderc                 3   s   | ]}| kV  qd S ry   r+   r  Zvalue_lowerr+   r,   r|   #  s     z:GenericSpider._extract_images_from_node.<locals>.<genexpr>)hasattrrO  r$   r"  rd   r   rc   r   r>   rT   r   r   )	r&   r   ZurlsZattrsZcandidate_attrsattrr7  r   r2  r+   rT  r,   rL    sD    




z'GenericSpider._extract_images_from_nodec              	   C   s  g }| d }|D ]}| }|s(qzt|}W n tk
rN   Y qY nX t|tr^|n|g}|D ]}t|tsxqh|	d}|rdt
|krqh|	dp|	d}	|	sqht|	t
r|	g}	|	D ]L}
|
sq|
drd|
 }n|
dr|
}nt|j|
}||kr|| qqhq|S )	z'Extract image URLs from JSON-LD blocks.r   r   ZProductr  r   r  r  r<  )r   r   rb   r   r   r   r   r   r   r$   r   rc   r   r   r   )r&   r   r   ZscriptsZscriptdatarM  r   Z	node_typerN  r  r0  r+   r+   r,   r'  *  sB    






z*GenericSpider._extract_images_from_ld_jsonc              
   C   s:  d|j krg S zt|j j}|dd ddd }|sDg W S t| jpNdd| d}tj|d| j	idd	}|j
d
krg W S | }|dg }g }|D ]J}	|	sq|	drd|	 }	n|	drt| jpd|	}	|	|kr||	 q|W S  tk
r4 }
 z&| jd|j  d|
  g  W Y S d}
~
X Y nX dS )z7Fetch product images via Shopify product JSON endpoint.r   r  r   r   r?   z.js
User-Agentr<   headerstimeout   r   r  r  z$Shopify JSON image fetch failed for r   N)r   r   r   rd   rb   r   r>   requestsr$   r%   status_coder   rc   r   r   rU   r   )r&   r   r   handleZproduct_json_urlZresprW  r   Z
normalizedr  excr+   r+   r,   r(  P  s6    




z+GenericSpider._fetch_shopify_product_imagesc                 C   s>   z*| j |j|jt|j}|ddkW S    Y dS X dS )z!Detect if this is a Shopify site.Zplatform_keyrE   FN)rS   Zdetectr   r   r   rZ  r$   )r&   r   Zplatform_infor+   r+   r,   r   n  s    zGenericSpider._detect_shopifyc                    sr   z^d|j krd|j krW dS d|j krZdddddg}|j  t fd	d
|D rZW dS W dS    Y dS X dS )z$Detect if this is an Etsy shop page.rC  rD  Tzdata-listing-idzv2-listing-cardzjs-merch-stash-check-listingzetsystatic.comz	shop-homec                 3   s   | ]}| kV  qd S ry   r+   )rm   Z	indicatorZ
text_lowerr+   r,   r|     s     z-GenericSpider._detect_etsy.<locals>.<genexpr>FN)r   r   rT   r   )r&   r   Zetsy_indicatorsr+   ra  r,   r   z  s     

zGenericSpider._detect_etsyc                 C   sz   ddddddd}|  D ]F\}}| j|d }|sD|| j|< q||kr| d	| | j|< q| jd
| j  dS )z=Enhance selectors for Etsy shop pages when config is generic.zdiv.js-merch-stash-check-listing.v2-listing-card[data-listing-id], div[data-listing-id].v2-listing-card, div.js-merch-stash-check-listing[data-listing-id], [data-listing-id][data-page-type="shop"]zh3.v2-listing-card__title, a.listing-link::attr(title), .v2-listing-card__title, h3[class*="listing-card__title"], a[data-listing-link]::attr(title)zdiv.n-listing-card__price span.currency-value, .n-listing-card__price .currency-value, span.currency-value, .currency-value, [class*="listing-card__price"] .currency-valuezdiv.v2-listing-card__img img, a.listing-link img[src*="etsystatic.com"], .v2-listing-card__img img[src*="etsystatic.com"], img[src*="etsystatic.com"][data-listing-card-listing-image], [class*="listing-card__img"] imgzsa.listing-link[href*="/listing/"], a[href*="/listing/"][data-listing-link], a[data-listing-link][href*="/listing/"]zdiv.js-merch-stash-check-listing::attr(data-listing-id), a.listing-link::attr(data-listing-id), [data-listing-id]::attr(data-listing-id))r   rh   r   r  r   r   r?   r   z!Applied Etsy fallback selectors: N)r   r@   r$   rb   rU   rV   )r&   Zetsy_defaultsr   fallback_valueexistingr+   r+   r,   r     s    'z,GenericSpider._apply_etsy_fallback_selectorsc           	         s   t  }|d }|D ]}|r|dr8t|j| n |drH| nt|jd|  d krbqtd }|r|d	 }ddd	d
ddh}ddh}|
 rq||krqt fdd|D rq|  q|S )zHExtract collection URLs from response (Shopify pattern: /collections/*).z$a[href*="/collections/"]::attr(href)r   r<  r   r  r   z.atomz.oembedz.jsonz.xmlz.rssr	  r   filterc                 3   s   | ]}   |V  qd S ry   )rT   r"  r   r   r+   r,   r|     s     z9GenericSpider._extract_collection_urls.<locals>.<genexpr>)rQ   r   r   rc   r   r   r   r   r   rT   isdigitr   r   )	r&   r   r   Zcollection_linksr   r  r5  Zskip_extensionsZ
skip_slugsr+   r   r,   r     s0    

z&GenericSpider._extract_collection_urlsc                 C   s~   dddddd}|  D ]F\}}| j|d }|sB|| j|< q||kr| d| | j|< qd	| _| jd
| j  dS )zAEnhance selectors for Shopify storefronts when config is generic.z.product-card, .card--product, .product-item, .grid__item--product, .product-card-wrapper, .card-wrapper.product-card-wrapper, li.product, [class*="product-card"], [class*="product-item"], .product-tile, .product-block, .product-grid-itemz.product-card__title, .card__heading, .full-unstyled-link, .product-item__title, .product-title, h2.product-title, h3.product-title, .card__title, .product-card__name, .product-name, a[href*="/products/"]::text, [class*="product-title"]z.price__current, .price-item--regular, .price__container .money, .price-item--sale, .product-item__price, .product-card__price, .price, .product-price, .money, [class*="price"], .price-wrapper .money, .product__price .moneyz.product-card__media img, .card__media img, img.product__media-item, .product-item__image img, .product-card__image img, .product-image img, [class*="product-image"] img, .product-card img, .card img, a[href*="/products/"] imgz.card-wrapper, .product-card__link, .full-unstyled-link, a[href*="/products/"], .product-item__link, .product-card__link, .product-link, a.product-link, [class*="product-link"])r   rh   r   r  r   r?   r   Tz$Applied Shopify fallback selectors: N)r   r@   r$   rb   r   rU   rV   )r&   Zshopify_defaultsr   rb  rc  r+   r+   r,   rW     s    z/GenericSpider._apply_shopify_fallback_selectors)
containersr-   c                 C   s<   g }|D ].}| d }tdd |D r|| q|S )zJEnsure containers actually contain product links to avoid false positives.rt   c                 s   s   | ]}|od |kV  qdS )r   Nr+   )rm   r^   r+   r+   r,   r|   '  s     z;GenericSpider._filter_product_containers.<locals>.<genexpr>)r   r   r   r   )r&   rf  Zfilteredr   r   r+   r+   r,   r   "  s    z(GenericSpider._filter_product_containers)r  r-   c              	   C   sb   |sdS t dd|}|dd}z"t d|}|rBt|d W S W n ttfk
r\   Y nX dS )zParse price from text string.Nz[^\d.,]r?   r   .z	\d+\.?\d*r   )r   rJ  r   findallr   rL   
IndexError)r&   r  r   Znumbersr+   r+   r,   r  +  s    zGenericSpider._parse_price)r   r-   c                 C   s"   |sdS t dd|}| }|S )zClean extracted text.r?   z<[^>]+>)r   rJ  rb   )r&   r   r+   r+   r,   r  ?  s
    zGenericSpider._clean_text)r   r-   c              
   C   s   |r
| j sg S zrddl}|d tj|d| jidd}|jdkrZ| jd|  g W S dd	l	m
} |||jd
d}| |W S  tk
r } z$| jd| d|  g  W Y S d}~X Y nX dS )z
        Fetch and extract reviews from a product URL.
        Makes synchronous HTTP request.
        
        Args:
            product_url: Product page URL
            
        Returns:
            List of review dicts
        r   N333333?rX     rY  r\  z*Failed to fetch product page for reviews: HtmlResponser0   r   bodyr1   zError fetching reviews from r   )r"   timesleepr]  r$   r%   r^  rU   r   scrapy.httprm  contentr+  r   r   )r&   r   rp  r   rm  Zscrapy_responser   r+   r+   r,   _fetch_reviews_from_urlK  s.    


z%GenericSpider._fetch_reviews_from_urlc                    s
.  g }-z| d }|s(| d }|sL|j}td|}|rL|d}|rTz||dd}t|}zvddl	m
}	 |	|j}
|
j}d	}d
|jkr|jd
d dd dd }|r|rd| d| dd| d| dg}|D ]}zdd	l}|d tj|d| jidd}|jdkr| }t|trd|kr|d }t|trt|t|kr| jdt| dt| d |}W  qW q tk
r } z| jd|  W Y qW 5 d	}~X Y qX qW n6 tk
r } z| jd|  W 5 d	}~X Y nX t|tr|D ]}|dd }|d d!}ztt|}W n ttfk
r   d"}Y nX |d#p|d$p|d%p|d&p|d'p|d(p|d)p|d*pd}t|ttfr6dd+l m } z|!|" }W n tt#fk
r2   d}Y nX n|rHt$| nd}|s~t|% d	d }| jd,| d-|  |d.dp|d/d}|rt&|d0}d1'| d2(  }nd}g }|d3g } | rJt| trJ| D ]\}!t|!tr0|!d4p|!d5p|!d6}"|"rF|)|" nt|!t$r|)|! q|d7g }#|#rt|#tr|#D ]\}$t|$tr|$d4p|$d5p|$d6}"|"r|)|" nt|$t$rl|)|$ ql|r0|r0|)|||||r|ng dd8 q0|r| jd9t| d: W n6 tk
rR }% z| jd;|%  W 5 d	}%~%X Y nX |s,| d<}&|&D ]}'|' d= pd }|' d>}(|(rt|(nd}|dkr|' d? })ztt*t|)}W n ttfk
r   d"}Y nX |' d@ pp|' dA pp|' dB pp|' dC pp|' dD pp|' dE pp|' dF pp|' dG ppd}|r| }zJ|+ s|dHddId+ rdd+l m } t|}*|!|*" }W n tt#t,fk
r   Y nX |s | jdJ|  |' dK( }+d1'dLdM |+D }|' dN p4d },|,rd|,|krd|r`|, dO| }n|,}g }|' dP( }-|-r|-D ]0}"|"r|" rt-|j|" }.|)|. q|' dQ( }/|/r|/D ]0}"|"r|" rt-|j|" }.|)|. q|rh|rh|)|||||r|ng dR qh|	s| dS}0|0D ]\}'|' dTd }|' dU}1d}|1r|1.dVd}ztt|}W n ttfk
r   d"}Y nX |' dWd }|' dXd }g }|' dY( }-|-	r$|-D ]0}"|"r|" rt-|j|" }.|)|. q|' dZ( }/|/	rn|/D ]0}"|"	r<|" 	r<t-|j|" }.|)|. 	q<|r@|r@|)|||||	r|ng dd8 q@|,s8| d[ }2d	}3| d\ }4|4
r
td]|4}5|5	r|5d}3|2
s
td^|4}|
r
|d}2|2
r|3
s`|j}|2
s>td_|}|
r>|d}2|3
s`td`|}|
r`|d}3|2*rz| jda|2 db|3  |3*rN| jdc|jdd}6d}7de}8g }9t/ }:d	};d}<df}=|7|8k$r|;rLddgl	m0}>m1}? |>|;}@dhdi |@2 D }Adj|Akr |Adj= dk|Akrdl|Adk< |?|A}Bdm|3 dn|B }C|Adodp}D| jdq|D dr n ds}Edm|3 dt|2 du|E dv|7 }Cz2dd	l}|d tj|C|6dwd}F|FjdkrW $qddxl3m4}G |G|C|Fj5dydz}H|H d{}I|Is|7dkr| j6d| |H d}}I|Is| jd~|7 d W $qd}J|ID ]}K|K d }L|Lsx|K ddpDd}M|K d pXd}N|Msf|Nrt|M d|N nd	}L|Lr|L|:kr|:7|L |9)|K |Jd7 }Jn$|Ls| jd |9)|K |Jd7 }Jqt|9}O|7}P|;r.ddl	m0}> |>|;}@do|@kr.zt|@do d }PW n tt8fk
r,   Y nX | jdt|I d|P d|J d|O dr	 |7dkrt|Idwkrd}=| jdt|I d |H.d}Qt9|Q}R|7dkrt|Idkr|Rst|9}Sdm|3 dt|2 d|S }T| jdt|I d df}Uz|d tj|T|6dwd}V|Vjdkr|G|T|Vj5dydz}W|W d{}X|Xrt|Xdkrd}Y|XD ]}K|K d }L|Ls|K ddpd}M|K d pd}N|Ms|Nr|M d|N nd	}L|Lr|L|:kr|:7|L |9)|K |Yd7 }Yn|LsV|9)|K |Yd7 }YqV|Ydkr| jdt|X d|Y dt|9 dr |S|Y7 }Sd}Udm|3 dt|2 d|S }Z|d tj|Z|6dwd}[|[jdkr~q|G|Z|[j5dydz}\|\ d{}]|]rt|]dkrqd}^|]D ]}K|K d }L|Ls|K ddpd}M|K d pd}N|Ms|Nr|M d|N nd	}L|LrF|L|:krF|:7|L |9)|K |^d7 }^n|Ls|9)|K |^d7 }^q|^dkrpq| jdt|] d|S d|^ dt|9 dr	 |S|^7 }Sq>W n6 tk
r }_ z| jd|_  W 5 d	}_~_X Y nX |Ur| jdt|9  W $qn| jd |H.d}Qt9|Q}R|7dkr<t|9|J nd}`|7dkr|=r|Rs|Jdk s|`dkr|Jdkr|J|`d k rt|9}Sdm|3 dt|2 d|S }T| jd|J dt|9 dr z|d tj|T|6dwd}V|Vjdkr|G|T|Vj5dydz}W|W d{}X|Xrt|Xdkrd}Y|XD ]}K|K d }L|Ls~|K ddpJd}M|K d p^d}N|Msl|Nrz|M d|N nd	}L|Lr|L|:kr|:7|L |9)|K |Yd7 }Yn|Ls|9)|K |Yd7 }Yq|Ydkr| jdt|X d|Y dt|9 dr |S|Y7 }Sdm|3 dt|2 d|S }Z|d tj|Z|6dwd}[|[jdkrBqn|G|Z|[j5dydz}\|\ d{}]|]rnt|]dkrtqnd}^|]D ]}K|K d }L|Ls|K ddpd}M|K d pd}N|Ms|Nr|M d|N nd	}L|Lr
|L|:kr
|:7|L |9)|K |^d7 }^n|Ls||9)|K |^d7 }^q||^dkr4qn| jdt|] d|S d|^ dt|9 dr	 |S|^7 }Sq| jdt|9  W W $qW n6 tk
r }_ z| jd|_  W 5 d	}_~_X Y nX |H.d}at9|a}b|Jdkr|bs| jd|O d W $qn|Jdkr |br | jd |O}<d	}c| jddt: kr@|Pn|7 dr z|H d}cW n tk
rp   Y nX |cs ddddddddddddg}d|dD ]}ezb|H |e}c|crd1'|c d2( ;  t< fddMdD sdt$|c krW  q d	}cW n tk
r   Y qY nX q|csp|H.d}c|cs@|H.d}c|csP|H.d}c|csp|H.d}c|csp|H.d}c|cst|Idkr| jd|7 d W $q|7dkrt|Idwkp|7dko|Jdk}f|fr2t|9}Sdm|3 dt|2 d|S }T| jd|S d|7 dt|I d z|d tj|T|6dwd}V|Vjdkr|G|T|Vj5dydz}W|W d{}X|Xrt|Xdkrd}Y|XD ]}K|K d }L|Ls|K ddpd}M|K d pd}N|Ms|Nr|M d|N nd	}L|Lr|L|:kr|:7|L |9)|K |Yd7 }Yn|Lsp|9)|K |Yd7 }Yqp|Ydkr| jdt|X d|Y dt|9 dr |S|Y7 }Sdm|3 dt|2 d|S }Z|d tj|Z|6dwd}[|[jdkrq|G|Z|[j5dydz}\|\ d{}]|]rt|]dkrƐqd}^|]D ]}K|K d }L|Ls.|K ddpd}M|K d pd}N|Ms|Nr*|M d|N nd	}L|Lr\|L|:kr\|:7|L |9)|K |^d7 }^n|Ls|9)|K |^d7 }^q|^dkrq| jdt|] d|S d|^ dt|9 dr	 |S|^7 }SqT| jdt|9  W W $qn| jdt|X d W n6 tk
r0 }_ z| jd|_  W 5 d	}_~_X Y nX |Jdkrt| jdt|I d|7 d|J d |7d7 }7d	};W 
qn| jd|7 d W $q|crt=|cdÃrt|ct$sz t|cdkr|cd }cnd	}cW n tt,fk
r   Y nX d	}g|cr z|c dġ }gW n tk
r   Y nX |gsLz|c.dš }gW n tk
rJ   Y nX |gsz&t=|cdƃrvd|cj>krv|cj>d }gW n tk
r   Y nX |gsz*t=|cdǃrt=|cj?dȃr|cj?d}gW n tk
r   Y nX |grzddl@mA} ||g}gW n tk
r   Y nX |gs | jdʡ df}h|crzzt=|cd˃rLd1'|c d̡( nd}it=|cd˃rpd1'|c d͡( nd}jd|ikpd|ikpd|j; kpd|j; k}hW n tk
r   Y nX |h#r|g#stB r|7dk rt|9}Sdm|3 dt|2 d|S }T| jd|S  z|d tj|T|6dwd}V|Vjdkr|G|T|Vj5dydz}W|W d{}X|Xrt|Xdkrd}Y|XD ]}K|K d }L|Ls|K ddpd}M|K d pd}N|Ms|Nr|M d|N nd	}L|Lr|L|:kr|:7|L |9)|K |Yd7 }Yn|Lsh|9)|K |Yd7 }Yqh|Ydkr| jdt|X d|Y dt|9 dr |S|Y7 }Sd}7|7|8krdm|3 dt|2 d|S }Z|d tj|Z|6dwd}[|[jdkrq|G|Z|[j5dydz}\|\ d{}]|]rt|]dkr̐qd}^|]D ]}K|K d }L|Ls4|K ddp d}M|K d pd}N|Ms"|Nr0|M d|N nd	}L|Lrb|L|:krb|:7|L |9)|K |^d7 }^n|Ls|9)|K |^d7 }^q|^dkrq| jdt|] d|S d|^ dt|9 dr	 |S|^7 }S|7d7 }7qPW W $qW n6 tk
 r }_ z| jd|_  W 5 d	}_~_X Y nX |Jdk r| jd|J d՝ |7d7 }7ddl	m1}? |2dlt$|7dd؜}k|; rddl	m0}> |>|;}ld|lk rt|ld t r|ld d n|ld |kd< |?|k};W 
qnDt|9dk#r| jdڡ t|9}Sdm|3 dt|2 d|S }Tz|d tj|T|6dwd}V|Vjdk#r|G|T|Vj5dydz}W|W d{}X|X#rt|Xdk#rd}Y|XD ]}K|K d }L|L!s|K dd!pvd}M|K d !pd}N|M!s|N!r|M d|N nd	}L|L!r|L|:k!r|:7|L |9)|K |Yd7 }Yn|L!sJ|9)|K |Yd7 }Y!qJ|Ydk#r| jd|Y dt|9 dr |S|Y7 }S|7|8k #rdm|3 dt|2 d|S }Z|d tj|Z|6dwd}[|[jdk"rn#q|G|Z|[j5dydz}\|\ d{}]|]#rt|]dk"r#qd}^|]D ]}K|K d }L|L#s|K dd"pd}M|K d "pd}N|M"s|N#r|M d|N nd	}L|L#r6|L|:k#r6|:7|L |9)|K |^d7 }^n|L"s|9)|K |^d7 }^"q|^dk#r`#q| jdt|] d|S d|^ dt|9 dr	 |S|^7 }S|7d7 }7"q$W W $qW n6 tk
#r }_ z| jd|_  W 5 d	}_~_X Y nX | jdޡ W $q|g$rZ|gCdߡ$rFddl	m
}	 |	|g}md|mjDk$r@|mjE$pB|mjDdd nd};n|g};|7d7 }7W 
qnD|Jdk$r| jd|J d՝ |7d7 }7d	};W 
qn| jd W $qW nF tk
$r }n z&| jd|7 d|n  W Y $qW 5 d	}n~nX Y nX 
q|9*rN|;$rdn|7}o| jdt|9 d|o d |9}I|ID ]}| dd}M|M%sR| dd}M|M%s| dd}p|p%rtjFdd|ptjGd }M|M%pd }Mt| d}|dk%r| d}qt|q d}|dk%rd"}d}| d }r|r&rrz.dd+l m } t|r}s|sd }t|!|t" }W nV tt#tfk
&rn }u z0| jd|r d|u  | dd }W 5 d	}u~uX Y nX nz| dd}|&s| dD }|&rz.dd+l m } t|}s|sd }t|!|t" }W n  tt#tfk
&r   d}Y nX |&pd }|'rd|k'sd|; k'rd}g }v| dd},|,'rNd|,k'rN|v)|,  | dd}w|w'rr|v)|w  |v'sd1'| d2( }xtjFdtH|M d d|xtjGd}xtjFdd|xtjGd}xd1'|x }x|x'rt|xdk'r|v)|x d1'|v }d}y| dd}z|z(r |z }yn8d1'| d( }{td|{tjG}|||(rX||d }yg }dd }}| d}~|~D ]}| d }|(s| d }|(r|}|(rt-|C| }.|.|k(r|)|. | d̡ }|(rrd |k(rrtd|}|(rr|d}|}|(rrt-|C| }.|.|k(rr|)|. (qr| d( }|D ]`}"|}|")rJ| d|" d}|)rJ|.d}|)sJt-|C|" }.|.|k)rJ|)|. )qJ| d( }|D ]2}"|}|")rt-|C|" }.|.|k)r|)|. )q|M%r&|%r&|)|M||||*r|ng |y*r |yndd8 %q&|*rN| jd9t| d W n8 tk
*r } z| jd|  W 5 d	}~X Y nX |,s8| d}|D ]f}'|' d }|*s|' d	 }|*pd }t|' d
}|dk+r>|' d }z|+rtt|nd"}W n ttfk
+r<   d"}Y nX |' dd }|' dd }g }|' d( }-|-+r|-D ]0}"|"+r|" +rt-|j|" }.|)|. +q|' d( }/|/*r|/D ]0}"|"+r|" +rt-|j|" }.|)|. +qԐ*q|,r8|,r8|)|||||,r,|ng dd8 |-s| d}|D ]^}'|' dd }|' dd}z|,rtt|nd"}W n ttfk
,r   d"}Y nX |' dd }|' dd }g }|' d( }|-r2|D ]0}"|"-r |" -r t-|j|" }.|)|. -q |' d( }|-r~|D ]0}"|"-rL|" -rLt-|j|" }.|)|. -qL|,rN|,rN|)|||||-r|ng dd8 ,qN| jd9t| d W n8 tk
.r }u z| jId|u  W 5 d	}u~uX Y nX |S (  a  
        Extract product reviews from product page HTML.
        Supports multiple review platforms:
        - Judge.me
        - Yotpo
        - Loox
        - Shopify native reviews
        - Generic review markup (schema.org)
        
        Returns:
            List of review dicts with: author, rating, date, content, images, variant
            Each review dict contains:
            - author: Reviewer name
            - rating: Rating (1-5)
            - date: Review date
            - content: Review text content
            - images: List of image URLs (optional)
            - variant: Product variant info like "Navy / M" (optional, mainly for Loox)
            - images: List of image URLs attached to the review (optional)
        z/.jdgm-gallery-data.jdgm-hidden::attr(data-json)z[data-json]::attr(data-json)zdata-json=\'([^\']+)\'r   z&quot;"r   )r   Nr   r  r   ra   z https://judge.me/api/v1/reviews/z?product_id=z&per_page=100z,https://judge.me/api/v1/reviews?shop_domain=z&product_id=rj  rX  r<   rY  r\  r  r   z reviews from Judge.me API (vs z from HTML)zJudge.me API fetch failed: zError fetching Judge.me API: Zreviewer_namer?   rating5   Z
created_atZ	createdAtdatepublished_atZpublishedAt	timestampZreview_dateZ
reviewDater   z"Review date missing for review by z. Available keys: 	body_htmlro  r   r  r  photosrP  r   r   r   )authorrv  ry  rs  r   variantz
Extracted z  reviews from Judge.me JSON dataz$Failed to parse Judge.me JSON data: z	.jdgm-revz.jdgm-rev__author::textz%.jdgm-rev__rating .jdgm-star.jdgm--onz#.jdgm-rev__rating::attr(data-score)z(.jdgm-rev__timestamp::attr(data-content)z%.jdgm-rev__timestamp::attr(data-time)z*.jdgm-rev__timestamp::attr(data-timestamp)z.jdgm-rev__timestamp::textz[data-time]::attr(data-time)z&[data-timestamp]::attr(data-timestamp)ztime::attr(datetime)z
time::textrg  r  z.Judge.me HTML review date missing for author: z.jdgm-rev__body ::textc                 s   s   | ]}|  r|  V  qd S ry   rA  rQ  r+   r+   r,   r|   M  s      zCGenericSpider._extract_reviews_from_product_page.<locals>.<genexpr>z.jdgm-rev__title::textz. zb.jdgm-rev__photo img::attr(src), .jdgm-rev__photos img::attr(src), .jdgm-rev__media img::attr(src)zC.jdgm-rev__photo::attr(data-src), .jdgm-rev__photos::attr(data-src))r~  rv  ry  rs  r   z.yotpo-reviewz.yotpo-user-name::textz.yotpo-review-starsz./@data-scorez.yotpo-review-date::textz.yotpo-review-content::textz\.yotpo-review-photo img::attr(src), .yotpo-photo img::attr(src), .yotpo-media img::attr(src)zI.yotpo-review-photo img::attr(data-src), .yotpo-photo img::attr(data-src)z([data-product-id]::attr(data-product-id)z!iframe[src*="loox.io"]::attr(src)zloox\.io/widget/([^/]+)/zproductId=(\d+)zdata-product-id=["\'](\d+)["\']zloox\.io/widget/([^/\s"\']+)/zFound Loox product ID: z, widget ID: ztext/html,application/xhtml+xml)rX  ZAcceptZRefererr   F)r  	urlencodec                 S   s,   i | ]$\}}|t |tr$|r$|d  n|qS )r   )r   r   rl   r+   r+   r,   rp     s      zDGenericSpider._extract_reviews_from_product_page.<locals>.<dictcomp>totallimit6zhttps://loox.io/widget/z	/reviews?r   unknownz6Building URL from next_page_params (page from params: rE     z/reviews?productId=z&limit=z&page=rk  rl  r0   rn  z
.grid-itemz;No reviews found on first page, trying alternative selectorz2.review-item, .loox-review-item, [class*="review"]zNo reviews found on page rF  z::attr(data-id)z .owner::text, .block.owner::textz.block.time::attr(data-time)_z$Review item has no ID, adding anyway)r  zFetched z reviews from Loox page z (new: z, total unique: TzPage 1 has zO reviews (>=15), will try offset-based pagination if page 2 has <10 new reviewszv//button[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "show more reviews")]r   z&limit=200&offset=zA reviews (>=20), switching to offset-based pagination immediatelyz>Offset-based pagination works (switched from page 1), fetched z reviews (new: z	, total: z reviews via offset z,Offset-based pagination from page 1 failed: z2Completed offset-based pagination. Total reviews: zXOffset-based pagination from page 1 didn't fetch new reviews, continuing with page-basedr   z7Switching to offset-based pagination after page 2 (got z new reviews on page 2, total: zBOffset-based pagination works (switched from page-based), fetched z Offset-based pagination failed: zNo new reviews added (still z9) and no 'Show more reviews' button - stopping paginationzRNo new reviews added but 'Show more reviews' button exists - continuing paginationz4Looking for load more button in response HTML (page actual_pagez	#loadMorez$button:contains("Show more reviews")za:contains("Show more reviews")zbutton[data-url]za[data-url]z[data-url*="page"]z.load-more-buttonz.loox-load-morez[id*="load"]z[class*="load"]z[class*="more"]zbutton[class*="show"]za[class*="show"]c                 3   s   | ]}| kV  qd S ry   r+   r   Zbutton_textr+   r,   r|   	  s     )ZmoreloadZshowzdata-urlz//button[@id="loadMore"]zq//a[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "show more reviews")]z//button[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "more") or contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "load")]z//a[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "more") or contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "load")]z+No load more button and no reviews on page z
, stoppingz+Trying offset-based pagination with offset=z (page z, got z	 reviews)z'Offset-based pagination works, fetched z!Offset-based pagination returned z items but all were duplicateszNo load more button but got z reviews on page z), trying next page anywayz/No load more button and no new reviews on page __iter__z::attr(data-url)z	@data-urlrO  rootr$   r   z&Button found but no data-url attributer   z::attr(style)z::attr(class)zdisplay:nonezdisplay: noneZhiddenZdisabledzNLoad more button hidden on page 1, trying offset-based pagination with offset=z7Offset-based pagination works (button hidden), fetched zButton hidden but got z new reviews, trying next page)r  en)Z	productIdr  r   languager  zLButton hidden and no new reviews, trying offset-based pagination as fallbackzOffset-based pagination found z more reviews (total: z)Offset-based pagination fallback failed: z5Button hidden and no new reviews, stopping paginationr<  zNo data-url but got z2No data-url and no new reviews, all reviews loadedzError fetching Loox page r   z reviews from Loox across z page(s)z.owner::textz.block.owner::textz.block.title::textz\s+Verified.*rG  z.star.active, .star.fullz.block.ratingz.starg     @@zFailed to parse Loox timestamp z.block.time::textz	Item typeztype:ZVerifiedz#.block.desc::text, .main-text::textz^\s*z\s*Verified\s*z\s+Verified\s*z.metadata .value::textz.metadata ::textzItem\s+type:\s*([^\n]+)c                 S   sR   | r|   sdS |   } | dr&dS d| ks:d|  kr>dS d|  krNdS dS )NFr`   Z1x1Ztransparentzproduct-boxT)rb   rc   rT   )r0  r+   r+   r,   is_valid_review_image  s    
zOGenericSpider._extract_reviews_from_product_page.<locals>.is_valid_review_imagez.photo, .photo-itemr   r  zbackground-imagezurl\(["\']?([^"\']+)["\']?\)z	img[src="z"]z../ancestor::*[contains(@class, "product-box")]z  reviews from Loox widget iframezFailed to fetch Loox widget: z.loox-rating, .loox-reviewz4.loox-reviewer-name::text, .loox-customer-name::textz.[data-customer-name]::attr(data-customer-name)z".loox-star.active, .loox-star.fullz [data-rating]::attr(data-rating)z5.loox-review-date::text, [data-date]::attr(data-date)z0.loox-review-text::text, .loox-review-body::textzY.loox-review-photo img::attr(src), .loox-photo img::attr(src), .loox-media img::attr(src)z&.loox-review-photo img::attr(data-src)z[itemtype*="Review"]z[itemprop="author"]::textz'[itemprop="ratingValue"]::attr(content)z)[itemprop="datePublished"]::attr(content)z[itemprop="reviewBody"]::textzb[itemprop="image"]::attr(content), [itemprop="image"] img::attr(src), .review-image img::attr(src)z&[itemprop="image"] img::attr(data-src)z reviews from product pagezError extracting reviews: )Jr   r$   r   r   r   r   r   r   r   r$  r   r   rM   rd   rp  rq  r]  r%   r^  r   r   r   r   rU   rV   r   r   rb   rJ   r   rL   rK   r   ZfromtimestampZ	isoformatOSErrorr   keysr	   r   r   r   roundre  AttributeErrorr   r   rQ   r  r  r   rr  rm  rs  r   r   ri  rY   localsrT   r   rU  rO  r  r   r   Zshould_try_offset_proactiverc   r   r&  rJ  rI  escaper   )r&   r   r  Zjudgeme_gallery_dataZ	html_textr  r   Zjson_strZreview_datar   r4  Zshop_domainZproduct_handleZapi_urlsZapi_urlrp  Zapi_responseZapi_dataZapi_reviewsZapi_errZapi_fetch_errZreview_itemr~  Z
rating_strrv  ry  r   Zavailable_keysZcontent_htmlZcontent_selectorrs  Zreview_imagesr}  Zphotor0  r   r  Zjson_errZjudgeme_reviewsZreviewZrating_nodesZrating_scorer{  Zcontent_nodesrh   Z
photo_imgsr   Z
photo_dataZyotpo_reviewsZ
rating_divZloox_product_idZloox_widget_idZloox_iframeZwidget_matchrZ  r   r;   Zall_review_itemsZseen_review_idsZnext_page_paramsZprevious_review_countZshould_try_offset_after_page2r  r  Zparams_dictZflat_paramsZquery_stringZloox_widget_urlZcurrent_page_from_paramsr  Zwidget_responserm  Zwidget_htmlZloox_review_itemsZnew_reviews_countr   Zitem_idownerZ	date_attrZcurrent_review_countr  Zquick_check_buttonZhas_show_more_buttonoffsetZloox_widget_url_offsetZuse_offset_basedZoffset_responseZoffset_htmlZoffset_itemsZnew_offset_countZ
offset_urlZoffset_respZoffset_html_respZoffset_items_pageZnew_page_countZ
offset_errZpage1_review_countZquick_check_button_beforeZhas_show_more_button_beforeZload_more_buttonZbutton_selectorsrB   Zshould_try_offsetZnext_page_data_urlZ	is_hiddenZbutton_styleZbutton_classZmanual_paramsZprev_paramsr\   Zpage_errZactual_pagesZ
title_textZrating_blockZ	data_timeZtimestamp_msZtimestamp_sr   Zcontent_partsZdescZall_textr  Zvariant_valueZmetadata_textZvariant_matchr  Zphoto_containersr   Zimg_srcZstyleZbg_matchZbg_urlZall_imgsZimg_elementr@  Zdata_src_imgsZ
widget_errZloox_reviewsZrating_attrZschema_reviewsZ
rating_valZschema_imgsZschema_datar+   r  r,   r+  u  s   



$
" $


""$




$	



















*







(






.$
B






(






.$
$
*




$






(






.
$
$
$$









(







.$

(















.$*

 



(
"











	&





	&z0GenericSpider._extract_reviews_from_product_pagec                 C   s"   | j d|jj d|j  dS )zHandle request errors.zRequest failed: z - N)rU   r   Zrequestr   r7  )r&   Zfailurer+   r+   r,   r     s    zGenericSpider.errback_httpbinc                 c   sd  | j s| jd dS | jd| j   | j  d}d}d}|rN| jd| d|  ddl}|d	 z`tj|d
| jidd}|j	dkr| j
d|j	 d W dS | }|dg }|s| jd W qN| jdt| d| d |D ]}zV| |}	|	D ]4}
|
rt }|
 D ]\}}|||< q&|V  q|	rR|d7 }W q tk
r } z| jd|  W Y qW 5 d}~X Y qX q| |jdd}|d7 }|dkr| j
d W qNW q> tjjk
r } z| jd|  W Y dS d}~X Y q> tk
rJ } z| jd|  W Y dS d}~X Y q>X q>| jd|  dS )zn
        Crawl Shopify store using Products API.
        More reliable and faster than HTML crawling.
        z/No base_url configured for Shopify API crawlingNzStarting Shopify API crawl for z/products.json?limit=250r   r   zFetching Shopify API page r   g      ?rX     rY  r\  zShopify API returned status z . Falling back to HTML crawling.r  z&No more products found in API responsezGot z products from API (page rE  "Error converting Shopify product: ZLinkr?   r   z&Reached maximum page limit (100 pages)zShopify API request failed: z'Unexpected error in Shopify API crawl: z-Shopify API crawl completed. Total products: )r>   rU   r   rV   rp  rq  r]  r$   r%   r^  r   r   r   *_convert_shopify_api_product_with_variantsr   r   r   _parse_shopify_link_headerrZ  
exceptionsZRequestException)r&   r   r   Ztotal_productsrp  r   rW  r  Zproduct_dataZproduct_itemsr,  product_itemr   r7  r   r+   r+   r,   r     sl    





zGenericSpider.crawl_shopify_api)link_headerr-   c                 C   sd   |sdS | d}|D ]H}d|ks*d|krtd|}|r|d}| jd|  |  S qdS )aM  
        Parse Shopify Link header to get next page URL.
        
        Example Link header:
        <https://shop.com/products.json?page_info=abc123&limit=250>; rel="next"
        
        Args:
            link_header: The Link header value
            
        Returns:
            Next page URL or None if no next page
        Nr   z
rel="next"z
rel='next'z	<([^>]+)>r   zFound next page URL: )rd   r   r   r   rU   r   )r&   r  Zlinksr   r  rK  r+   r+   r,   r    s    


z(GenericSpider._parse_shopify_link_header)shopify_productr-   c              
   C   s  zD| dg }|s4| jd| d d W dS |d }| dg }g }|D ]0}| dd	}|rP|d
rvd| }|| qP| dd	}|r| j d| nd	}	| dd	}
| |
}| dd	}|r|gng }| dg }t|trdd |	dD }g }| j
r6|	r6| jd| d  | |	}t }| dd	|d< | dd	|d< t| dd|d< | d}|rt||d< nd|d< ||d< ||d< |	|d< |rd|nd|d< | ddrd nd!|d"< | ddrdnd#|d$< | d%d	||rd|nd	t| d&d| d'd	d(|d)< ||d*< |W S  tk
r } z| jd+|  W Y dS d}~X Y nX dS ),z
        Convert Shopify API product JSON to ProductItem.
        
        Args:
            shopify_product: Product data from Shopify API
            
        Returns:
            ProductItem or None if conversion fails
        variantsProduct rh    has no variants, skippingNr   r   rP  r?   r  r  r_  r   r|  product_typetagsc                 S   s   g | ]}|  r|  qS r+   rA  rm   r   r+   r+   r,   r   c  s      z>GenericSpider._convert_shopify_api_product.<locals>.<listcomp>r   Fetching reviews for: rr   r   r   compare_at_priceoriginal_priceri   r   r   r   	availableTr   out_of_stockr   unavailabler   vendorgramsrz  )r  r  r  weight_gramsrz  r   r  r  )r$   rU   r   rc   r   r>   r  r   r   rd   r"   rV   rt  r   r   r   r   r   )r&   r  r  Zfirst_variantr   rB  r  rP  r_  r   ri   description_cleanr  
categoriesr  r  r  r  r   r+   r+   r,   _convert_shopify_api_product6  sh    







	z*GenericSpider._convert_shopify_api_productc           "   
   C   s  z| dg }|s4| jd| d d g W S | dd}|rT| j d| nd}| dd}| |}| d	d}| d
g }t|trdd |dD }g }	| jr|r| j	d| d  | 
|}	| dg }
g }|
D ]2}| dd}|r|drd| }|| qg }i }tddD ]}d| }| dg }|t|kr.||d  }| dd}|r.|dkr.|| t }|D ] }| |}|r|| qtt|||< q.g }t|dkr|d }i }| dd|d< | dd|d< t| dd|d< | d}|r8t||d< nd|d< ||d< ||d< ||d < |rb|nd|d!< | d"d#r|d$nd%|d&< | d"d#rd"nd'|d(< | d)d||rd*|ndt| d+d| d,dd-d.|d/< |	|d0< || ni }| dd|d< d|d< d|d< d|d< ||d< ||d< ||d < |rD|nd|d!< d1|d&< d"|d(< | d)d||rxd*|nd| d,dd2d3}|	|d0< t|dD ]T\}}||d4| d5< d6|| |d4| d7< d8|d4| d9< d8|d4| d:< q||d/< |	|d0< || | jd;t| d<| d  |D ]6}i }| dd|d< | dd|d< t| dd|d< | d}|rt||d< nd|d< d|d< g |d< ||d < d|d!< | d"d#rd$nd%|d&< | d"d#rd"nd'|d(< d=dt| d+dd>}t|dD ]@\}}d| }| |d}||d4| d5< ||d4| d?< qg |d0< ||d/< || q4| jd@t| dAt| dB |W S  tk
r }  z6| jdC|   ddl}!| j|!  g  W Y S d} ~ X Y nX dS )Da0  
        Convert Shopify API product with variants to multiple WooCommerce rows.
        Returns list of ProductItem dicts (parent + variations).
        
        Args:
            shopify_product: Product data from Shopify API
            
        Returns:
            List of ProductItem dicts
        r  r  rh   r  r_  r?   r   r|  r  r  c                 S   s   g | ]}|  r|  qS r+   rA  r  r+   r+   r,   r     s      zLGenericSpider._convert_shopify_api_product_with_variants.<locals>.<listcomp>r   r  r   rP  r  r  r      optionoptionsrr   ZTitler   r   r   r  r  Nri   r   r   r  Tr   r  r   r  r   r  r   r  rz  Zsimple)r  r  r  r  rz  rD   r   r  Zinstockvariable)r  r  r  rz  rD   Z
attribute__name|Z_values1Z_visibleZ
_variationz	Creating z variations for Z	variation)rD   Z
parent_skur  _valuezCreated parent + z variations = z total itemsz0Error converting Shopify product with variants: )r$   rU   r   r>   r  r   r   rd   r"   rV   rt  rc   r   r   r   rQ   r   sortedr   r   r   r%  r   r   r   	traceback
format_exc)"r&   r  r  r_  r   ri   r  r  r  r  r   rB  r  rP  Zoption_namesZoption_values_by_nameiZ
option_keyZoption_name_keyZ
option_objZoption_namer   r  valr  r  r  Zparent_itemr   Zvariation_itemZvar_attributesZoption_valuer   r  r+   r+   r,   r    s    












	
"


"z8GenericSpider._convert_shopify_api_product_with_variants)NN)0__name__
__module____qualname____doc__rr   r   r   r   r   r   r   r#   rf   r   r   r   r   r   r   r   r   r*  r   r   r   r#  rL  r'  r(  rY   r   r   r   rQ   r   rW   r   r   r  r  rt  r+  r   r   r  r  r  __classcell__r+   r+   r)   r,   r      sf   O 5 e5  ', K/&32,	*          RR]r   )r  r   r   r   typingr   r   r$  r   r   r]  r   Zscrapyr   Zscrapy.selectorr	   Zscrapy.utils.projectr
   r   Zscrapy_project.itemsr   r   Z&scrapy_project.utils.platform_detectorr   ZSpiderr   r+   r+   r+   r,   <module>   s   