U
    ±Diõx  ã                   @   sÂ   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZ dd	lmZ G d
d„ dƒZG dd„ dƒZG dd„ dƒZG dd„ dƒZG dd„ dƒZdS )z%Scrapy pipelines for data processing.é    N)ÚPath)Údatetime)ÚListÚDict)ÚItem)ÚDropItem)ÚProductItemÚProductÚShopItemÚShop)Úurlparsec                   @   s    e Zd ZdZeedœdd„ZdS )ÚValidationPipelinez%Validate items using Pydantic models.©ÚitemÚreturnc           
   
   C   sÚ   |  d¡dkr|S |  di ¡}t|tƒr4|  dd¡nd}|dkrD|S z6t |¡}|jdd}| ¡ D ]\}}|||< qd|W S  tk
rÔ }	 z<|j 	d	|  d
d¡› dt
|	ƒ› ¡ t
|	ƒ|d< | W Y ¢S d}	~	X Y nX dS )z#Validate item using Pydantic model.Ú	item_typeÚshopÚ
attributesÚtypeÚsimple©ÚvariableÚ	variationT©Zexclude_nonezValidation error for item ÚnameZUnknownz: Z_validation_errorN)ÚgetÚ
isinstanceÚdictr	   Úfrom_scrapy_itemÚ
model_dumpÚitemsÚ	ExceptionÚloggerÚerrorÚstr)
Úselfr   ÚspiderÚattrsÚproduct_typeÚproductZvalidated_dictÚkeyÚvalueÚe© r-   úscrapy_project/pipelines.pyÚprocess_item   s     

$zValidationPipeline.process_itemN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r/   r-   r-   r-   r.   r      s   r   c                   @   s8   e Zd ZdZdd„ Zeedœdd„Zeedœdd	„Zd
S )ÚDeduplicationPipelinez6Remove duplicate items based on normalized URL or SKU.c                 C   s   t ƒ | _d S ©N)ÚsetÚids_seen)r%   r-   r-   r.   Ú__init__1   s    zDeduplicationPipeline.__init__)Úurlr   c                 C   s    |sdS |  d¡s|  d¡s dS ddl}| d|¡}|rj| d¡ ¡ }t|ƒ}|j› d|j› d|› }|S t|ƒ}|j› d|j› |j›  	d	¡}d|krœdS |S )
zLNormalize product URL to extract unique identifier (same logic as analyzer).N)zjavascript:zdata:zmailto:Zhttpr   z/products/([^/?#]+)é   z://z
/products/ú/)
Ú
startswithÚreÚsearchÚgroupÚlowerr   ZschemeÚnetlocÚpathÚrstrip)r%   r9   r=   Zproduct_slug_matchZproduct_slugÚparsedÚnormalized_urlr-   r-   r.   Ú_normalize_product_url4   s     z,DeduplicationPipeline._normalize_product_urlr   c                    sœ  |  d¡dkr|S |  di ¡}t|tƒr6|  d¡dknd}|rt|  d¡r^d|  d¡› }qX|  d	d
¡}|rx|  |¡nd}|rðg }tddƒD ]*}	|  d|	› dd
¡}
|
rŽ| t|
ƒ¡ qŽ|rÔ|› dd |¡› }n|  dd
¡}|› d|› }n‚|  dd
¡}g }tddƒD ].}	|  d|	› dd
¡}
|
r
| t|
ƒ¡ q
|rXd|› dd |¡› }n|j 	d|› ¡ t
dƒ‚nä|  d	d
¡}|r|  |¡nd}|r |}n¸|  d¡r¾d|  d¡› }nš|  d¡r>dddddg}|  dd
¡ ¡ ‰ t‡ fdd„|D ƒƒr,|j d|  d¡› ¡ t
d|  d¡› ƒ‚d|  d¡› }n|j 	d |› ¡ t
d!ƒ‚|| jkrŒ|j d"|› d#|› d$¡ t
d"|› ƒ‚| j |¡ |S )%z*Check for duplicates using normalized URL.r   r   r   r   r   FÚskuzsku:Úproduct_urlÚ Nr:   é   Ú
attribute_Ú_valueú#ú|Úpricez#price:r   zname:zVariation has no identifier: zVariation has no identifieru   cÃ¡c sáº£n pháº©m khÃ¡czother productszview allzsee allzshop allc                 3   s   | ]}|ˆ kV  qd S r5   r-   )Ú.0Úkeyword©Z
name_lowerr-   r.   Ú	<genexpr>Š   s     z5DeduplicationPipeline.process_item.<locals>.<genexpr>zSkipping category name: zItem has no identifier: zItem has no identifierzDuplicate item found: z (original URL: ú))r   r   r   rF   ÚrangeÚappendr$   Újoinr"   Úwarningr   r@   ÚanyÚdebugr7   Úadd)r%   r   r&   r'   Zis_variationZ
identifierrH   rE   Zattr_valuesÚiZ
attr_valuerO   r   Zcategory_keywordsr-   rR   r.   r/   P   sb    

z"DeduplicationPipeline.process_itemN)	r0   r1   r2   r3   r8   r$   rF   r   r/   r-   r-   r-   r.   r4   .   s   r4   c                   @   sh   e Zd ZdZdedœdd„Zedd„ ƒZdd	„ Zed
œdd„Z	e
e
dœdd„Zedœdd„Zdd„ ZdS )ÚShopPipelinez«
    Aggregate shop/site information across multiple pages and persist to a JSON file.

    This pipeline is intentionally best-effort and must never break the crawl.
    údata/exports©Ú
export_dirc                 C   s&   t |ƒ| _| jjddd tƒ | _d S ©NT)ÚparentsÚexist_ok)r   r`   Úmkdirr   r   )r%   r`   r-   r-   r.   r8   ¡   s    
zShopPipeline.__init__c                 C   s   |j  dd¡}| |dS )NÚ
EXPORT_DIRr^   r_   ©Zsettingsr   )ÚclsÚcrawlerr`   r-   r-   r.   Úfrom_crawler¦   s    zShopPipeline.from_crawlerc                 C   s<   t t|dd ƒ|  |¡ dd¡d| _|j d| j› ¡ d S )NÚbase_urlú-Ú.)rj   Údomainz-Shop pipeline initialized. Export directory: )r   ÚgetattrÚ_get_domain_from_spiderÚreplacer   r"   Úinfor`   ©r%   r&   r-   r-   r.   Úopen_spider«   s
    
þzShopPipeline.open_spider©r   c                 C   sb   t |ddƒ}|r|S t |ddƒ}|r^z$t|ƒ}|j dd¡ dd¡}|W S  tk
r\   Y nX dS ©	z=Extract domain slug from spider (prefers custom export slug).Úexport_domainNrj   rI   úwww.rl   rk   Úunknown)rn   r   rA   rp   r!   ©r%   r&   Zcustom_slugrj   rD   rm   r-   r-   r.   ro   ²   s    z$ShopPipeline._get_domain_from_spider)ÚexistingÚ
new_valuesc                 C   s*   |pg D ]}|sq||kr|  |¡ qd S r5   )rV   )r%   rz   r{   Úvr-   r-   r.   Ú_merge_listÂ   s
    zShopPipeline._merge_list)r   c              
   C   s¤  |  d¡dkr|S zT|  d¡p"i }t|tƒs2i }dD ].}|  |¡}|r6t| j|d ƒs6t| j||ƒ q6|  | jj|  d¡p|g ¡ |  | jj|  d¡p–g ¡ |  | jj	|  d¡p°g ¡ |  | jj
|  d¡pÊg ¡ |  d	¡}|  d
¡}|r||pðddœ}|| jjkr| jj |¡ |pd}	| jj  |	i ¡}
t|
tƒs<i }
|
 ||  d¡|  d¡dœ¡ |
| jj|	< W n6 tk
rž } z|j d|› ¡ W 5 d }~X Y nX |S )Nr   r   Ú	extracted)r   ÚdescriptionÚlogoÚemailsÚphonesÚsocial_linksÚ	addressesÚpage_urlÚ	page_typerx   )r9   r   ÚjsonldÚmeta)r…   r‡   rˆ   z)ShopPipeline: failed to merge shop item: )r   r   r   rn   r   Úsetattrr}   r   r‚   rƒ   r„   ÚpagesrV   ÚrawÚupdater!   r"   rX   )r%   r   r&   r~   r*   Únew_valr…   r†   Z
page_entryZraw_keyZ
raw_bucketr,   r-   r-   r.   r/   É   sB    




ý$zShopPipeline.process_itemc           	   
   C   s(  t | jj| jj| jj| jj| jj| jj| jj| jj	gƒ}|sL|j
 d¡ d S |  |¡}t|dd ƒpnt ¡  d¡}|› d|› d}| j| }zbt|ddd$}tj| jjd	d
|ddtd W 5 Q R X |j
 d|› ¡ |jj dt|ƒ¡ W n6 tk
r" } z|j
 d|› ¡ W 5 d }~X Y nX d S )Nz:ShopPipeline: no shop data collected; skipping shop exportÚrun_timestampú%Y%m%d-%H%M%Srk   z
-shop.jsonÚwúutf-8©ÚencodingTr   Fé   ©Úensure_asciiÚindentÚdefaultzSaved shop info to Zshop_json_filez)ShopPipeline: failed to write shop file: )rY   r   r   r   r€   r   r‚   rƒ   r„   rŠ   r"   rq   ro   rn   r   ÚnowÚstrftimer`   ÚopenÚjsonÚdumpr   r$   rh   ÚstatsÚ	set_valuer!   rX   )	r%   r&   Zhas_anyrm   Ú	timestampÚfilenameÚfilepathÚfr,   r-   r-   r.   Úclose_spiderø   s0    ø


(zShopPipeline.close_spiderN)r^   )r0   r1   r2   r3   r$   r8   Úclassmethodri   rs   ro   Úlistr}   r   r/   r¤   r-   r-   r-   r.   r]   š   s   
/r]   c                   @   sZ   e Zd ZdZdedœdd„Zedd„ ƒZdd	„ Ze	e	d
œdd„Z
edœdd„Zdd„ ZdS )ÚStoragePipelinezStore items to JSON file.Údata©Ú
output_dirc                 C   s$   t |ƒ| _| jjddd g | _d S ra   )r   rª   rd   Úproducts)r%   rª   r-   r-   r.   r8     s    
zStoragePipeline.__init__c                 C   s   |j  dd¡}| |dS )ú/Create pipeline instance from crawler settings.ZDATA_OUTPUT_DIRr¨   r©   rf   )rg   rh   rª   r-   r-   r.   ri     s    zStoragePipeline.from_crawlerc                 C   s   g | _ |j d| j› ¡ dS )úCalled when spider is opened.z0Storage pipeline initialized. Output directory: N)r«   r"   rq   rª   rr   r-   r-   r.   rs   $  s    zStoragePipeline.open_spiderr   c                 C   sZ   |  d¡dkr|S dd„ | ¡ D ƒ}d|krJt|d tƒrJ|d  ¡ |d< | j |¡ |S )z!Process item and store in memory.r   r   c                 S   s   i | ]\}}|d k	r||“qS r5   r-   )rP   Úkr|   r-   r-   r.   Ú
<dictcomp>0  s       z0StoragePipeline.process_item.<locals>.<dictcomp>Z
crawled_at)r   r    r   r   Z	isoformatr«   rV   )r%   r   r&   Zproduct_dictr-   r-   r.   r/   )  s    zStoragePipeline.process_itemrt   c                 C   sZ   t |ddƒ}|r|S t |ddƒ}|rVz$t|ƒ}|j dd¡ dd¡}|W S    Y nX dS )	z&Extract domain from spider's base_url.rv   Nrj   rI   rw   rl   rk   rx   ©rn   r   rA   rp   ry   r-   r-   r.   ro   9  s    z'StoragePipeline._get_domain_from_spiderc              	   C   sÎ   | j s|j d¡ dS |  |¡}t|ddƒp8t ¡  d¡}|› d|› d}| j| }t	|ddd	}t
j| j |d
dd W 5 Q R X |j dt| j ƒ› d|› ¡ |jj dt|ƒ¡ |jj dt| j ƒ¡ dS )z8Called when spider is closed. Save all products to JSON.zNo products to saveNrŽ   r   rk   ú.jsonr   r‘   r’   Fr”   )r–   r—   zSaved z products to Zproducts_json_fileZproducts_count)r«   r"   rX   ro   rn   r   r™   rš   rª   r›   rœ   r   rq   Úlenrh   rž   rŸ   r$   )r%   r&   rm   r    r¡   r¢   r£   r-   r-   r.   r¤   H  s    

zStoragePipeline.close_spiderN)r¨   )r0   r1   r2   r3   r$   r8   r¥   ri   rs   r   r/   ro   r¤   r-   r-   r-   r.   r§     s   
r§   c                   @   sœ   e Zd ZdZdeedœdd„Zedd„ ƒZd	d
„ Ze	e	dœdd„Z
edœdd„Zdd„ Zeedœdd„Zeedœdd„Zeedœdd„Zeedœdd„ZdS )ÚExportPipelinezAExport products to various formats (WooCommerce, Shopify, Excel).r^   Úwoocommerce©r`   Úexport_formatc                 C   s*   t |ƒ| _| jjddd || _g | _d S ra   )r   r`   rd   r¶   r«   )r%   r`   r¶   r-   r-   r.   r8   b  s    
zExportPipeline.__init__c                 C   s(   |j  dd¡}|j  dd¡}| ||dS )r¬   re   r^   ZEXPORT_FORMATr´   rµ   rf   )rg   rh   r`   r¶   r-   r-   r.   ri   h  s    zExportPipeline.from_crawlerc                 C   s&   g | _ |j d| j› d| j› ¡ dS )r­   z%Export pipeline initialized. Format: z, Directory: N)r«   r"   rq   r¶   r`   rr   r-   r-   r.   rs   o  s    zExportPipeline.open_spiderr   c              
   C   s  |  d¡dkr|S zœt|ƒ}|  di ¡}t|tƒr>|  dd¡nd}|dkrx| j |¡ |j d|› d|  d	d
¡› ¡ n4t |¡}| j |¡ |j d|j	› d|j
› d¡ W nN tk
rü } z0|j dt|ƒ› ¡ |j dt|ƒ› ¡ W 5 d}~X Y nX |S )zGConvert item to Pydantic model or store raw dict for variable products.r   r   r   r   r   r   zAdded z product to export: r   zN/AzAdded product to export: z (rT   zError processing item: zItem data: N)r   r   r   r«   rV   r"   rZ   r	   r   r   rH   r!   r#   r$   )r%   r   r&   Z	item_dictr'   r(   r)   r,   r-   r-   r.   r/   t  s     "
"(zExportPipeline.process_itemrt   c                 C   sZ   t |ddƒ}|r|S t |ddƒ}|rVz$t|ƒ}|j dd¡ dd¡}|W S    Y nX dS ru   r°   ry   r-   r-   r.   ro     s    z&ExportPipeline._get_domain_from_spiderc                 C   sè   | j s|j d¡ dS |  |¡}t|ddƒp8t ¡  d¡}| jdkrT|  	|||¡ n| jdkrn|  
|||¡ nv| jdkrˆ|  |||¡ n\| jdkr¢|  |||¡ nB| jd	krä|  	|||¡ |  
|||¡ |  |||¡ |  |||¡ dS )
z#Export products when spider closes.zNo products to exportNrŽ   r   r´   ZshopifyZexcelrœ   Úall)r«   r"   rX   ro   rn   r   r™   rš   r¶   Ú_export_woocommerceÚ_export_shopifyÚ_export_excelÚ_export_json)r%   r&   rm   r    r-   r-   r.   r¤   Ÿ  s$    





zExportPipeline.close_spider)rm   r    c              )   C   sÜ  g }| j D ]n}t|tƒrž| di ¡}t|tƒr<| dd¡nd}| dd¡}| dd¡}	| d¡}
| d¡}| d	d¡}| d
g ¡}| dd¡}| dd¡}n~t|dƒr´|jr´|jni }t|tƒrÎ| dd¡nd}|j}|jpàd}	|j}
|j	}|j
pöd}|jpg }|jpd}|jpd}|dkrªt|tƒr>| dg ¡ng }|rRt |¡nd}d|	|ddd|rt|dd… nd|ddddddddddddddddd|dd|r°d |¡ndddddddddd|dœ'}tddƒD ]Æ}d|› d}d|› d}d|› d}||krZ|| |d|› d< || |d|› d< | |d ¡|d|› d!< d|d|› d"< n@d|d|› d< d|d|› d< d|d|› d!< d|d|› d"< qÖ| |¡ q
|d#krd#|	|ddddddddd|d$krÚdndddddt|tƒrü| d%d¡ndddddd|r|
nd|r|n|
p&ddddddddddddddd&œ&}tddƒD ]¶}d|› d}d|› d'}||krÄ|| |d|› d< | |d¡|d|› d< d|d|› d!< d|d|› d"< n@d|d|› d< d|d|› d< d|d|› d!< d|d|› d"< qP| |¡ q
t|tƒrà| dg ¡}|r<t |¡nd}d|	|ddd|r^|dd… nd|dddd|d$krxdnddddddddddd|r˜|
nd|r¤|n|
p¬d|dd|rÂd |¡ndddddddddd|dœ'}n>| ¡ }t|dƒr|jr|jng }|rt |¡nd|d(< tddƒD ]F}d|d|› d< d|d|› d< d|d|› d!< d|d|› d"< q(| |¡ q
t |¡}|› d)|› d*}| j| }|j|d+d,d- |j d.t|ƒ› d/t| j ƒ› d0|› ¡ dS )1z?Export to WooCommerce CSV format with variable product support.r   r   r   r   rI   rG   rO   Úoriginal_pricer   ÚimagesÚcategoryÚstock_statusZinstockr   Úreviewsr:   r   ZvisibleNéÈ   ZtaxablerN   )'ÚTypeÚSKUÚNameÚ	PublishedúIs featured?úVisibility in catalogúShort descriptionÚDescriptionúDate sale price startsúDate sale price endsú
Tax statusú	Tax classú	In stock?ÚStockúLow stock amountúBackorders allowed?úSold individually?úWeight (kg)úLength (cm)ú
Width (cm)úHeight (cm)úAllow customer reviews?úPurchase noteú
Sale priceúRegular priceÚ
CategoriesÚTagsúShipping classÚImagesúDownload limitúDownload expiry daysÚParentúGrouped productsÚUpsellsúCross-sellsúExternal URLúButton textÚPositionÚReviewsrJ   rK   Ú_nameZ_valuesZ_visiblez
Attribute z namez	 value(s)Ú1z visiblez defaultr   Zin_stockZweight_grams)&rÂ   rÃ   rÄ   rÅ   rÆ   rÇ   rÈ   rÉ   rÊ   rË   rÌ   rÍ   rÎ   rÏ   rÐ   rÑ   rÒ   rÓ   rÔ   rÕ   rÖ   r×   rØ   rÙ   rÚ   rÛ   rÜ   rÝ   rÞ   rß   rà   rá   râ   rã   rä   rå   ræ   rç   rL   rè   rk   z-wc.csvFú	utf-8-sig©Úindexr“   ú	Exported z rows (z" products) to WooCommerce format: )r«   r   r   r   Úhasattrr   r   rG   rO   r¼   r   r½   r¾   r¿   rœ   ÚdumpsrW   rU   rV   Zto_woocommerce_csv_rowrÀ   ÚpdÚ	DataFramer`   Úto_csvr"   rq   r²   )r%   r&   rm   r    Úrowsr)   r'   r(   r   rG   rO   r¼   r   r½   r¾   r¿   rÀ   Zreviews_jsonÚrowr\   Zattr_name_keyZattr_values_keyZattr_visible_keyZattr_value_keyÚdfr¡   r¢   r-   r-   r.   r¸   ¸  s|   





Ù+

Ú*
Ù+

z"ExportPipeline._export_woocommercec           
      C   s‚   g }| j D ]}| ¡ }| |¡ q
t |¡}|› d|› d}| j| }	|j|	ddd |j dt	| j ƒ› dt	|ƒ› d|	› ¡ d	S )
zLExport to Shopify CSV format with multiple rows per product (one per image).rk   z-shopify.csvFrë   rì   rî   z products (z rows) to Shopify format: N)
r«   Zto_shopify_csv_rowsÚextendrñ   rò   r`   ró   r"   rq   r²   )
r%   r&   rm   r    Zall_rowsr)   Zproduct_rowsrö   r¡   r¢   r-   r-   r.   r¹   ž  s    


ÿzExportPipeline._export_shopifyc                 C   sf   dd„ | j D ƒ}t |¡}|› d|› d}| j| }|j|ddd |j dt| j ƒ› d	|› ¡ d
S )zExport to Excel format.c                 S   s   g | ]}|  ¡ ‘qS r-   ©r   ©rP   r)   r-   r-   r.   Ú
<listcomp>²  s     z0ExportPipeline._export_excel.<locals>.<listcomp>rk   z.xlsxFZopenpyxl)rí   Zenginerî   z products to Excel format: N)r«   rñ   rò   r`   Zto_excelr"   rq   r²   )r%   r&   rm   r    rô   rö   r¡   r¢   r-   r-   r.   rº   °  s    

zExportPipeline._export_excelc              	   C   sz   dd„ | j D ƒ}|› d|› d}| j| }t|ddd}tj||dd	td
 W 5 Q R X |j dt| j ƒ› d|› ¡ dS )zExport to JSON format.c                 S   s   g | ]}|  ¡ ‘qS r-   rø   rù   r-   r-   r.   rú   ½  s     z/ExportPipeline._export_json.<locals>.<listcomp>rk   r±   r   r‘   r’   Fr”   r•   rî   z products to JSON format: N)	r«   r`   r›   rœ   r   r$   r"   rq   r²   )r%   r&   rm   r    r¨   r¡   r¢   r£   r-   r-   r.   r»   »  s    
zExportPipeline._export_jsonN)r^   r´   )r0   r1   r2   r3   r$   r8   r¥   ri   rs   r   r/   ro   r¤   r¸   r¹   rº   r»   r-   r-   r-   r.   r³   _  s   
 gr³   )r3   rœ   Zpandasrñ   Úpathlibr   r   Útypingr   r   Zscrapyr   Zscrapy.exceptionsr   Zscrapy_project.itemsr   r	   r
   r   Zurllib.parser   r   r4   r]   r§   r³   r-   r-   r-   r.   Ú<module>   s   l|I