U
    8è4i*ã  ã                   @   sð   d Z ddlZddlZddlZddlmZmZmZmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlZddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ G dd„ dƒZeedœdd„Z dee!e!edœdd„Z"dS )zFWebsite analyzer to extract categories, product count, and attributes.é    N)ÚDictÚListÚSetÚOptional)ÚurljoinÚurlparse)ÚCounter)ÚPath)Údatetime)ÚCrawlerProcess)ÚRequest)Úget_project_settings)ÚProductItem)ÚGenericSpider)ÚPlatformDetectorc                   @   s8   e Zd ZdZdeedœdd„Zedœdd„Zd	d
„ Z	dS )ÚWebsiteAnalyzerz9Analyze website structure and extract useful information.F)Úconfig_fileÚ	count_allc                 C   sr   || _ tƒ | _tƒ | _i | _i | _d| _d| _d| _i | _	g | _
i | _d| _d| _|| _|s`dnd| _tƒ | _dS )zÇInitialize analyzer with config file.
        
        Args:
            config_file: Path to config YAML file
            count_all: If True, count all products across all pages/collections
        r   FÚ é   i?B N)r   ÚsetÚ
categoriesÚcollectionsÚcollection_product_countÚcollection_visited_productsÚproduct_countÚtotal_product_countÚused_apiÚ
attributesÚproducts_sampleÚplatform_infoÚhtml_contentÚbase_urlr   Úmax_products_to_analyzeÚvisited_product_urls)Úselfr   r   © r&   ú"./scrapy_project/utils/analyzer.pyÚ__init__   s     zWebsiteAnalyzer.__init__)Úreturnc                 C   s`  t ƒ }| dd¡ | dd¡ G dd„ dtƒ}t|ƒ}|j|| | jd |jdd | jd	kr¶| jr¶| j	s¶t
| j ¡ ƒ}d	}|d
d… D ]}||7 }|dkrˆ q¢qˆ|d	kr¶t|dƒ| _| j| j| jd	krÎ| jn| jdd„ | j ¡ D ƒ| j| j	t| jƒdœ}| j di ¡ d¡r,tt
| jƒƒ|d< g |d< n0tt
| jƒƒ|d< | jrTtt
| jƒƒng |d< |S )z|
        Analyze website and return results.
        
        Returns:
            Dictionary with analysis results
        Z	LOG_LEVELZWARNINGZLOG_ENABLEDTc                       sˆ   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Zdd	„ Zd
d„ Zdd„ Z	dd„ Z
eedœdd„Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Z‡  ZS )z/WebsiteAnalyzer.analyze.<locals>.AnalysisSpiderÚanalysisc                    s:   t ƒ j||Ž || _d| _tƒ | _tƒ | _d| _d| _	d S )Nr   F)
Úsuperr(   ÚanalyzerÚproducts_foundr   Úplatform_detectorr   Úcollections_crawledÚ
is_shopifyÚsitemap_processed)r%   r,   ÚargsÚkwargs©Ú	__class__r&   r'   r(   @   s    z8WebsiteAnalyzer.analyze.<locals>.AnalysisSpider.__init__c                 s   sr   | j rP| j d }t|ƒ}|j› d|j› }t|dƒ}t|| jd| jd|idV  | j D ]}t|| jddV  qVdS )	a  Start by crawling sitemap.xml for Shopify sites to find all collections.
                
                Note: This method is deprecated in Scrapy 2.13+ but kept for backward compatibility.
                The start() method (async) is preferred for Scrapy 2.13+.
                r   ú://ú/sitemap.xmlTÚdomain©ÚurlÚcallbackÚdont_filterÚerrbackÚmeta©r:   r;   r<   N©	Ú
start_urlsr   ÚschemeÚnetlocr   r   Úparse_sitemap_indexÚsitemap_errbackÚparse©r%   r"   Úparsedr8   Úsitemap_urlr:   r&   r&   r'   Ústart_requestsI   s    

û	
z>WebsiteAnalyzer.analyze.<locals>.AnalysisSpider.start_requestsc                 S  sr   | j rP| j d }t|ƒ}|j› d|j› }t|dƒ}t|| jd| jd|idV  | j D ]}t|| jddV  qVdS )	a  Start by crawling sitemap.xml for Shopify sites (Scrapy 2.13+ async method).
                
                This method is preferred in Scrapy 2.13+ to avoid deprecation warnings.
                It reuses the same logic as start_requests() but in async context.
                r   r6   r7   Tr8   r9   r?   Nr@   rG   r&   r&   r'   Ústartc   s    

û	
z5WebsiteAnalyzer.analyze.<locals>.AnalysisSpider.startc                 S   s   dS )z'Handle sitemap request errors silently.Nr&   )r%   Zfailurer&   r&   r'   rE   ~   s    z?WebsiteAnalyzer.analyze.<locals>.AnalysisSpider.sitemap_errbackc           	   
   s   s¨   z€ddl m} |j dd¡}| |j¡}ddi}|jd|d}|rt|D ]*}d	|ksZd
|krFt|| jd| j	dV  qFn
|  |¡ W n" t
k
r¢ } zW 5 d}~X Y nX dS )z0Parse sitemap index to find collections sitemap.r   ©Úetreer8   r   Úsmú+http://www.sitemaps.org/schemas/sitemap/0.9z//sm:sitemap/sm:loc/text()©Ú
namespacesz/sitemap_collectionsz/collectionsT)r:   r;   r<   r=   N)ÚlxmlrM   r>   ÚgetÚ
fromstringÚbodyÚxpathr   Úparse_collections_sitemaprE   Ú	Exception)	r%   ÚresponserM   r8   ÚrootrQ   ZsitemapsrI   Úer&   r&   r'   rD   ‚   s$    ü	zCWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse_sitemap_indexc              
   s   sÔ   z¬ddl m} | |j¡}ddi}|jd|d}|D ]n}|r4d|kr4t d|¡}|r4| d	¡ ¡ }d
dh}	||	kr4| 	¡ s4|| j
kr4| j
 |¡ t|| jddV  q4d| _W n" tk
rÎ }
 zW 5 d}
~
X Y nX dS )z5Extract all collection URLs from collections sitemap.r   rL   rN   rO   z//sm:loc/text()rP   ú/collections/ú/collections/([^/?#]+)é   ÚsearchÚfilterFr?   TN)rR   rM   rT   rU   rV   Úrer_   ÚgroupÚlowerÚisdigitr/   Úaddr   Úparse_collection_pager1   rX   )r%   rY   rM   rZ   rQ   Zurlsr:   ÚmatchÚcollection_slugÚ
skip_slugsr[   r&   r&   r'   rW   ¡   s,    
ý

zIWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse_collections_sitemapc           !      3   s   | j j }|r¬|j| j _|j| j _| j |j|jt|jƒ¡| j _	| j j	 
d¡dk| _| jr¬|r¬ddlm} ||jƒ}|j› d|j› }|› d}t|| jdddid	V  d
S | j jr¸d
S | jr|  |¡}|D ],}|| jkrÎ| j |¡ t|| jddV  qÎ|rd|jkrd
S d
}	| j 
dd¡}
|
rZ|
 d¡D ]$}| ¡ }| |¡}	|	r4 qZq4|	sž| jržddddddddg}|D ]}| |¡}	|	r€ qžq€| j jr°| jsÀ|	 oÈ| joÈd|jk}|r~| d¡ ¡ }| j jrìd
nd}t|r|d
|… n|ƒD ]d‰ ˆ r
dˆ kr
t‡ fdd„dD ƒƒr6t|jˆ ƒ}|| jkr
| j |¡ t|| jddV  q
| j js~d
S | j jrJ|	rJ| d¡ 
¡ }|sð| d¡ ¡ }|j}|D ]4‰ ˆ rºˆ |krºdˆ ksâd ˆ krºˆ } qðqº|rJt|j|ƒ}|| jkrJ| j |¡ d|kr8t|| jddV  nt|| j ddV  |	rd|jkrd
}d|jkr¾t! "d!|j¡}|r¾| #d"¡ $¡ }d#d$d%d&h}| %¡ s¾||kr¾| &d'd(¡ '¡ }|	D ]V}| j(| j j)krÞ q|  *||j¡}|rÂ| 
d)¡rÂ| 
d*d+¡}| 
d)d+¡‰d,d-d.d/d0d1d2d3d4d5g
}t‡fd6d„|D ƒƒrJqÂ|rÂ| +d7¡s`qÂd8|krnqÂt! "d9|¡}|r°| #d"¡ $¡ }||ƒ}|j› d|j› d8|› } n&||ƒ}|j› d|j› |j,›  -d:¡} | | j j.kr| j j. | ¡ | j  j/d"7  _/|  j(d"7  _(| j  j0d"7  _0|rT|| j j1kr@d| j j1|< | j j1|  d"7  < | 
d;¡rž| j j	 
d<i ¡ 
d=¡rŒ| j j2 |d; ¡ n| j j3 |d; ¡ t4| j j5ƒdk rà| j j5 6| 
d)¡| 
d>¡| 
d;¡| 
d*¡d?œ¡ | 
d*¡rÂ| j(d@krÂt|d* | j7dAt|ƒiddBV  qÂd
S )CzParse and collect data.Zplatform_keyÚshopifyr   ©r   r6   ú/products.json?limit=250TZis_api)r:   r;   r<   r>   NFr?   r\   Úproduct_containerú.productú,ú.product-card-wrapperú".card-wrapper.product-card-wrapperú.grid__item--productú.product-itemú.product-cardú[class*="product-card"]ú[class*="product"]ú$a[href*="/collections/"]::attr(href)é   c                 3   s   | ]}|ˆ   ¡ kV  qd S ©N©rc   )Ú.0Úskip)Úlinkr&   r'   Ú	<genexpr>(  s     zHWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse.<locals>.<genexpr>)z/collections/allz/collections/all-productsúX.pagination a.next::attr(href), .pagination .next::attr(href), a[rel="next"]::attr(href)ú/.pagination a::attr(href), .pager a::attr(href)úpage=ú?page=r]   r^   Úallzall-productsr_   r`   ú-ú ÚnameÚproduct_urlr   u   giá» hÃ ngÚcartÚemptyu   trá»‘ngu   cÃ¡c sáº£n pháº©m khÃ¡czother productszview allzsee allzshop allzall productsc                 3   s   | ]}|ˆ   ¡ kV  qd S ry   rz   )r{   Úkeyword)Úproduct_namer&   r'   r~   o  s     Úhttpú
/products/ú/products/([^/?#]+)ú/ÚcategoryÚfeaturesr   Úprice)r†   r’   r   r:   é   Úproduct)r:   r;   r>   r<   )8r,   r!   Útextr:   r"   r.   ÚdetectÚdictÚheadersr    rS   r0   Úurllib.parser   rB   rC   r   Úparse_shopify_apir   Ú_extract_collection_urlsr/   re   rf   Ú	selectorsÚsplitÚstripÚcssr   Úgetallr   Úanyr   rF   ra   r_   rb   rc   rd   ÚreplaceÚtitler-   r#   Z_extract_productÚ
startswithÚpathÚrstripr$   r   r   r   r   r   Úlenr   ÚappendÚparse_product_detail)!r%   rY   Zis_homepager   rH   r"   Zapi_urlÚcollection_urlsZcollection_urlÚproduct_containersÚconfigured_selectorÚselÚshopify_selectorsÚselectorZshould_follow_collectionsÚcollection_linksZmax_collectionsÚfull_urlÚ	next_pageÚpagination_linksÚcurrent_urlÚfull_next_urlZcurrent_collectionrg   rh   ri   Ú	containerr”   r‡   Zcategory_keywordsÚproduct_slug_matchÚproduct_slugÚnormalized_urlr&   )r}   r‹   r'   rF   À   s@   


ý


ü

ý
	

ø


þýý
$

ý
ý

ü
üz5WebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse)r•   r)   c                 S   s@   |sdS t  dd|¡}d | ¡ ¡}| dd¡ dd¡}| ¡ S )z6Clean text by removing HTML tags and extra whitespace.r   z<[^>]+>r…   z&nbsp;z&amp;ú&)ra   ÚsubÚjoinr   r¢   rž   )r%   r•   r&   r&   r'   Ú_clean_text¬  s    z;WebsiteAnalyzer.analyze.<locals>.AnalysisSpider._clean_textc           	         sÚ   t ƒ }| d¡ ¡ }| d¡ ¡ }| dd„ |D ƒ¡ |D ]š}|r:| d¡rZt|j|ƒ‰ n | d¡rj|‰ nt|jd| ƒ‰ t dˆ ¡}|r:| 	d¡ 
¡ }d	d
ddddh}| ¡ r²q:t‡ fdd„|D ƒƒrÊq:| ˆ ¡ q:|S )zRExtract ALL Shopify collection URLs from ANY page - find ALL /collections/* links.rw   za::attr(href)c                 S   s   g | ]}|rd |kr|‘qS )r\   r&   )r{   r}   r&   r&   r'   Ú
<listcomp>Â  s       z\WebsiteAnalyzer.analyze.<locals>.AnalysisSpider._extract_collection_urls.<locals>.<listcomp>r   rŒ   r]   r^   z.atomz.oembedz.jsonz.xmlz.rssz.htmlc                 3   s   | ]}ˆ   ¡  |¡V  qd S ry   )rc   Úendswith)r{   Zext©r±   r&   r'   r~   Ý  s     z[WebsiteAnalyzer.analyze.<locals>.AnalysisSpider._extract_collection_urls.<locals>.<genexpr>)r   rŸ   r    Úextendr¤   r   r:   ra   r_   rb   rc   rd   r¡   re   )	r%   rY   rª   r°   Z	all_linksr}   rg   rh   Zskip_extensionsr&   rÀ   r'   r›   ¸  s*    

zHWebsiteAnalyzer.analyze.<locals>.AnalysisSpider._extract_collection_urlsc              
   s   sÀ  zZddl }| |j¡}| dg ¡}|rZt|ƒ| j_tt|ƒdƒ| j_t	ƒ }|D ]N}| dg ¡}|D ]}| dd¡}	|	rf| 
|	¡ qf| dd¡}
|
rR| 
|
¡ qRzNddl}dd	lm} ||jƒ}|j› d
|j› }|› d}|j|ddidd}|jdkrð|  ¡ }| dg ¡}|D ]Ö}| dd¡}	|	r| 
|	¡ zœ| dd¡}|rÒ|› d|› d}|j|ddidd}|jdkrÒ|  ¡ }| dg ¡}t	ƒ }|D ] }| d¡}|rš| 
|¡ qš|rÒt|ƒ| jj|	< W n tk
rê   Y nX qW n$ tk
r } zW 5 d}~X Y nX || j_d| j_ddl}| d¡}| dt|ƒ› dt|ƒ› d¡ W n^ tk
rº } z>ddl}| d¡}| d|› ¡ |  |¡D ]}|V  qœW 5 d}~X Y nX dS )z Parse Shopify API JSON response.r   NÚproductsr   r   r£   r   Úproduct_typerk   r6   z/collections.json?limit=250z
User-AgentzMozilla/5.0é
   )r˜   ZtimeoutéÈ   Úhandler\   rl   ÚidTr*   zUsed Shopify API: Found z products, z collectionszShopify API parsing failed: )ÚjsonÚloadsr•   rS   r§   r,   r   Úminr   r   re   Úrequestsr™   r   r:   rB   rC   Zstatus_coder   rX   r   r   ÚloggingÚ	getLoggerÚinfoÚdebugrF   )r%   rY   rÈ   Zproducts_datarÂ   Zcollections_setr”   Zproduct_collectionsÚcollZ
coll_titlerÃ   rË   r   rH   r"   Zcollections_api_urlZcollections_responseZcollections_dataZapi_collectionsZcoll_handleZcoll_products_urlZcoll_products_responseZcoll_products_dataZcoll_productsZunique_product_idsÚcpZ
product_idr[   rÌ   ÚloggerÚitemr&   r&   r'   rš   æ  sŠ    

ý
ý

$
zAWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse_shopify_apic                 3   s@  | j jrdS t d|j¡}|r<| d¡ ¡ }ddh}||krBdS | dd¡ ¡ }| j j	 
|¡ |  |¡}|D ]b}|| jkrnt d|¡}|rn| d¡ ¡ }	ddh}|	|krn|	 ¡ sn| j 
|¡ t|| jdd	V  qn|| j jkrêd
| j j|< || j jkrtƒ | j j|< d}
| j dd¡}|rN| d¡D ]$}| ¡ }| |¡}
|
r( qNq(|
sŠddddddddg}|D ]}| |¡}
|
rl qŠql|
r˜tƒ }tƒ }|
D ]}| d¡ ¡ ‰ ˆ r ˆ  d¡rÊq ddddddg}t‡ fdd„|D ƒƒröq ˆ  d¡rt|jˆ ƒ‰nˆ ‰t‡fd d„|D ƒƒr0q d!ˆkr>q t d"ˆ¡}|r€| d¡ ¡ }tˆƒ}|j› d#|j› d!|› }n&tˆƒ}|j› d#|j› |j›  d¡}| 
ˆ¡ | 
|¡ q |rZd
}|D ]†}|| j j| krü| j j|  
|¡ |d7 }|| j j krÌ| j j  
|¡ | j  j!d7  _!| j j"sÌ| j j#d$k rÌ| j  j#d7  _#qÌ|}n d
}| j j$ %d%|› d&|j› ¡ t&| j j |tƒ ¡ƒ| j j|< |
r<| d'¡ ¡ }|s | d(¡ '¡ }|j}|D ]4}|rÊ||krÊd)|ksòd*|krÊ|} q qÊ|r<t|j|ƒ}|| jkr<| j 
|¡ t|| jdd	V  dS )+zFParse a collection page to extract collection name and count products.Nr]   r^   r_   r`   r„   r…   Fr?   r   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   z!a[href*="/products/"]::attr(href))zjavascript:zdata:zmailto:ú#zfacebook.comztwitter.comzpinterest.comzinstagram.comz/sharez/sharerc                 3   s   | ]}|ˆ   ¡ kV  qd S ry   rz   ©r{   Úpattern)Úproduct_linkr&   r'   r~   ¤  s     zXWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse_collection_page.<locals>.<genexpr>r   c                 3   s   | ]}|ˆ   ¡ kV  qd S ry   rz   rÕ   )r‡   r&   r'   r~   ­  s     r   rŽ   r6   r   z3No product URLs found in containers for collection z on r   r€   r   r‚   )(r,   r   ra   r_   r:   rb   rc   r¢   r£   r   re   r›   r/   rd   r   rf   r   r   r   rœ   rS   r   rž   rŸ   r¤   r¡   r   r   rB   rC   r¥   r¦   r$   r   r   r   rÒ   Úwarningr§   r    )r%   rY   rg   rh   ri   Zcollection_nameZadditional_collectionsZcoll_urlZ
coll_matchZ	coll_slugr«   r¬   r­   r®   r¯   Zproduct_urls_on_pageZnormalized_urls_on_pager¶   Zskip_patternsr·   r¸   rH   r¹   Znew_products_in_collectionr   r²   r³   r´   r}   rµ   r&   )r×   r‡   r'   rf   H  sä    

ý


ø





ÿÿ$
ýzEWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse_collection_pagec           	   
      s$  dddddddg}|D ]}|  |¡}|dd	… D ]ê}|  d
¡ ¡ }|  d¡ ¡ }|r`d |¡nd‰ ˆ rr|  ˆ ¡nd‰ ˆ r2tˆ ƒdkr2tˆ ƒdk r2ddddddddg}t‡ fdd„|D ƒƒr¾q2|r2d| ¡ ksæd| ¡ ksæd| ¡ kr2| jj	 di ¡ d¡r| jj
 ˆ ¡ q2| jj ˆ ¡ q2qdS )z(Extract categories from navigation menu.znav az.navigation az.menu az.categories az.category-menu az[class*="category"] az[class*="nav"] aNr   z::attr(href)z::textr…   é   é2   ÚhomeZaboutZcontactZblogZloginzsign uprˆ   Zcheckoutc                 3   s   | ]}|ˆ   ¡ kV  qd S ry   rz   )r{   Úword©r•   r&   r'   r~     s     zaWebsiteAnalyzer.analyze.<locals>.AnalysisSpider._extract_navigation_categories.<locals>.<genexpr>z
/category/z/c/r\   r‘   r   )rŸ   rS   r    r¼   r½   r§   r¡   rc   r,   r    r   re   r   )	r%   rY   Znav_selectorsr¯   Z	nav_linksr}   ZhrefZ
text_partsZ
skip_wordsr&   rÝ   r'   Ú_extract_navigation_categoriesü  s.    ù


(zNWebsiteAnalyzer.analyze.<locals>.AnalysisSpider._extract_navigation_categoriesc                 S   sh   |j  di ¡}t | || j¡}|rd| ¡ D ]8\}}|| jjkrLtƒ | jj|< | jj|  	t
|ƒ¡ q*dS )z$Extract attributes from detail page.r”   N)r>   rS   r   Ú_extract_attributesrœ   Úitemsr,   r   r   re   Ústr)r%   rY   r”   r   ÚkeyÚvaluer&   r&   r'   r©     s    zDWebsiteAnalyzer.analyze.<locals>.AnalysisSpider.parse_product_detail)Ú__name__Ú
__module__Ú__qualname__r†   r(   rJ   rK   rE   rD   rW   rF   rá   r½   r›   rš   rf   rÞ   r©   Ú__classcell__r&   r&   r4   r'   ÚAnalysisSpider=   s    	 m.b 5"rè   )r,   r   )Zstop_after_crawlr   Nrx   r   c                 S   s   i | ]\}}|t t|ƒƒ“qS r&   )ÚsortedÚlist)r{   ÚkÚvr&   r&   r'   Ú
<dictcomp>B  s      z+WebsiteAnalyzer.analyze.<locals>.<dictcomp>)ÚplatformÚproduct_count_estimater   r   Úsample_productsÚcount_all_moder   r‘   r   r   )r   r   r   r   Úcrawlr   rK   r   r   r   rê   ÚvaluesrÊ   r    r   r   rà   r   r—   rS   ré   r   r   )r%   Úsettingsrè   ZprocessÚcollection_countsZ
sample_sumÚcountÚresultsr&   r&   r'   Úanalyze1   sF         sø
zWebsiteAnalyzer.analyzec           	      C   sf   i }|  dd¡}|rb| |› d¡}|D ]8}| d¡  ¡ }| d¡  ¡ }|r(|r(| ¡ || ¡ < q(|S )z,Extract attributes from product detail page.r   z .product-attributes, .attributesz trztd:first-child::textztd:last-child::text)rS   rŸ   rž   )	r%   rY   rœ   r   Zattr_selectorZ	attr_rowsÚrowrâ   rã   r&   r&   r'   rß   R  s    z#WebsiteAnalyzer._extract_attributesN)F)
rä   rå   ræ   Ú__doc__rá   Úboolr(   r   rø   rß   r&   r&   r&   r'   r      s         'r   )r*   r)   c                 C   sh  g }|  d¡ |  d¡ |  d¡ |  di ¡}|  d¡ |  d| dd¡› ¡ |  d| d	d¡› ¡ | d
¡r|  dd | d
g ¡¡› ¡ |  d| dd¡› ¡ |  d¡ | di ¡ d¡rÜ|  d¡ |  dg ¡}|  di ¡}|  dt|ƒ› ¡ |rZdd„ |D ƒ}i }|D ]L}t dd|¡ ¡  ¡ }||krF|||< ntdd„ |D ƒƒr|||< qt	| 
¡ ƒ}	|	rÐ|  d¡ |	D ]F}| |d¡}
|
dkrº|  d|› d|
› d¡ n|  d|› ¡ q†n
|  d¡ n~|  d ¡ |  d!g ¡}|  d"t|ƒ› ¡ |rZd#d„ |D ƒ}t	|ƒ}|rP|  d$¡ |D ]}|  d|› ¡ q6n
|  d%¡ |  d¡ |  d&¡ |  d'd¡}|  d(d¡}|  d)¡r¤|  d*|› ¡ nl|  di ¡}|rì||krì|  d+|› ¡ |  d,|› ¡ |  d-¡ n$|  d+|› ¡ ||kr|  d.¡ |  d¡ |  d/¡rÌ|  d0d | d/  ¡ ¡› ¡ t| d/  ¡ ƒd1d2… D ]d\}}d3d„ |d1d2… D ƒ}|  d4|› dd |¡› ¡ t|ƒd2kr\|  d5t|ƒd2 › d6¡ q\|  d¡ |  d7¡rT|  d8¡ t| d7 d1d2… d9ƒD ]P\}}| d:d;¡}| d<d;¡}| d=d;¡}|  d>|› d?|› d@|› d@|› ¡ qø|  d¡ |  d¡ dA |¡S )Bz¤
    Format analysis results as readable text.
    
    Args:
        analysis: Analysis results dictionary
        
    Returns:
        Formatted text string
    u   ðŸ“ˆ Analysis Results:z<============================================================r   rî   u   ðŸ¢ Platform Information:z   Platform: ZUnknownz   CMS: ZcmsZ
tech_stackz   Tech Stack: z, z   Confidence: Z
confidenceZlowr‘   r   u   ðŸ“¦ Collections (Shopify):r   z   Collections found: c                 S   s.   g | ]&}|rt | ¡ ƒd kr| d¡s|‘qS ©r^   ú<©r§   rž   r¤   ©r{   Úcr&   r&   r'   r¾   ƒ  s
       
 z+format_analysis_results.<locals>.<listcomp>z[^\w\s]c                 s   s   | ]}t |ƒd kV  qdS )é   N)Úord)r{   Úcharr&   r&   r'   r~   Ž  s     z*format_analysis_results.<locals>.<genexpr>z   Collections:r   z      - z: z	 productsz   Collections: None foundu   ðŸ“‚ Categories:r   z   Categories found: c                 S   s.   g | ]&}|rt | ¡ ƒd kr| d¡s|‘qS rü   rþ   rÿ   r&   r&   r'   r¾   ¢  s
       
 z   Categories:z   Categories: None foundu   ðŸ“Š Products:r   rï   rñ   z   Total products found: z   Products found (sample): z&   Total unique products (estimated): zA   Note: Run with --count-all for complete count across all pagesz0   Note: Run with --count-all to get total countr   u   ðŸ·ï¸  Attributes detected: Nr“   c                 S   s   g | ]}t |ƒ‘qS r&   )rá   )r{   rì   r&   r&   r'   r¾   Ã  s     z   - z     ... and z more valuesrð   u   ðŸ“¦ Sample products:r^   r†   zN/Ar’   r   ú   z. ú - Ú
)r¨   rS   r¼   r§   ra   r»   rž   rc   r¡   ré   ró   Úkeysrê   rà   Ú	enumerate)r*   Úlinesr    r   rõ   Zvalid_collectionsZseen_normalizedrÐ   Z
normalizedZunique_collectionsrö   r   Zvalid_categoriesÚcatZtotal_countZsample_countZ	attr_nameró   Zdisplay_valuesÚiÚprodr†   r’   r   r&   r&   r'   Úformat_analysis_resultsb  sž    



















 

&

r  TF)r   Úsave_to_filer   r)   c              
   C   s   t | |d}| ¡ }t ¡  ¡ |d< |rœt| ƒ}tdƒ}|jddd ||j› d }||j› d }zþg }	| ¡ rºz4t	|dd	d
}
t
 |
¡}| dg ¡}	W 5 Q R X W n   Y nX |d | dd¡| dd¡t| dg ¡ƒt| dg ¡ƒ|dœ}|	 d|¡ |	dd… }	|	|d< t|ƒ}|	rü|d7 }|d7 }|d7 }|	dd… D ]‚}| dd¡}|rjt |¡nd}|r~| d¡n|}| dd¡}| dd¡}| d¡rªdnd}|d |› d!|› d"|› d#7 }qJt|	ƒdkrô|d$t|	ƒd › d%7 }|d7 }t	|d&d	d
}
|
 |¡ W 5 Q R X t|ƒ|d'< t	|d&d	d
}
t
j||
d(d)td* W 5 Q R X t|ƒ|d+< W n2 tk
rš } ztd,|› ƒ W 5 d}~X Y nX |S )-a7  
    Analyze website and return statistics.
    
    Args:
        config_file: Path to config YAML file
        save_to_file: Whether to save analysis results to file
        count_all: If True, count all products across all pages/collections
        
    Returns:
        Dictionary with analysis results
    )r   Zanalysis_timestampzdata/analysisT)ÚparentsÚexist_okz_analysis.txtz_analysis.jsonÚrzutf-8)ÚencodingÚhistoryr   r   rï   r   r   )Ú	timestampÚtotal_productsrð   Úcollections_countZcategories_countrñ   NrÄ   r  u   ðŸ“œ History:
z=============================================================
rx   r  r   z%Y-%m-%d %H:%M:%Sr  rð   rñ   z	full scanÚsampler  r  z products (z)
z   ... and z more entries
ÚwÚanalysis_filerÙ   F)ÚindentÚensure_asciiÚdefaultÚanalysis_json_filez'Warning: Could not save analysis file: )r   rø   r
   ÚnowÚ	isoformatr	   ÚmkdirÚstemÚexistsÚopenrÈ   ÚloadrS   r§   Úinsertr  ÚfromisoformatÚstrftimeÚwriterá   ÚdumprX   Úprint)r   r  r   r,   r÷   Úconfig_pathZdata_dirr  Z	json_filer  ÚfZexisting_dataZhistory_entryZformatted_textÚentryr  ZdtZtime_strÚtotalr  Úmoder[   r&   r&   r'   Úanalyze_websiteØ  sj    


ú  r0  )TF)#rú   ra   rÈ   ÚyamlÚtypingr   r   r   r   r™   r   r   r   r   Úpathlibr	   r
   ZscrapyZscrapy.crawlerr   Zscrapy.httpr   Zscrapy.utils.projectr   Zscrapy_project.itemsr   Z%scrapy_project.spiders.generic_spiderr   Z&scrapy_project.utils.platform_detectorr   r   rá   r  rû   r0  r&   r&   r&   r'   Ú<module>   s0         Rv