U
    84iV                     @   s   d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ G dd	 d	ZG d
d de	ZG dd deZG dd dZG dd dZdS )z9Scrapy middlewares for rate limiting, retry, and headers.    N)OptionalDict)signals)RetryMiddleware)UserAgentMiddleware)response_status_messagec                   @   s6   e Zd ZdZdeedddZedd Zd	d
 Z	dS )RateLimitMiddlewarez5Middleware to enforce rate limiting between requests.      ?Tdelayrandomize_delayc                 C   s   || _ || _i | _dS )z
        Initialize rate limit middleware.
        
        Args:
            delay: Base delay in seconds between requests
            randomize_delay: Whether to add random jitter to delay
        N)r   r   last_request_time)selfr   r    r   scrapy_project/middlewares.py__init__   s    zRateLimitMiddleware.__init__c                 C   s(   |j dd}|j dd}| ||dS )1Create middleware instance from crawler settings.ZDOWNLOAD_DELAYr	   ZRANDOMIZE_DOWNLOAD_DELAYTr
   )settingsgetfloatgetbool)clscrawlerr   Z	randomizer   r   r   from_crawler   s    z RateLimitMiddleware.from_crawlerc                 C   s   d|j kr|j dd n|j }|| jkrrt | j|  }| j}| jrX|dt   }||k rr|| }t| t | j|< dS )z'Process request and enforce rate limit./         ?N)urlsplitr   timer   r   randomsleep)r   requestspiderdomainelapsedr   Z
sleep_timer   r   r   process_request"   s     

z#RateLimitMiddleware.process_requestN)r	   T)
__name__
__module____qualname____doc__floatboolr   classmethodr   r%   r   r   r   r   r      s
   
r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )CustomRetryMiddlewarez1Custom retry middleware with exponential backoff.c                    s8   t  | |dd| _tdd |dD | _d S )NZRETRY_TIMES   c                 s   s   | ]}t |V  qd S )N)int).0xr   r   r   	<genexpr>?   s     z1CustomRetryMiddleware.__init__.<locals>.<genexpr>ZRETRY_HTTP_CODES)superr   getintmax_retry_timessetgetlistretry_http_codes)r   r   	__class__r   r   r   <   s    zCustomRetryMiddleware.__init__c              
   C   sz  |j dkr(z"t|dr(|jr(|j nd}W n   d}Y nX |jdd}t|trh|jddd}nt	|}g }d	|ksd
|kr|
d d|kr|
d d|ksd|jkr|
d d|ksd|kr|
d |r|jd|j  d|j dd| d|pd  n|jd|j d |j dkrJ|jd|j d |j | jkrvt|j }| |||pt|S |S )z(Process response and retry if necessary.  text ZServer    zutf-8ignore)errorsZdatadomeZDataDomezDataDome protectionZcaptchazCAPTCHA challengeZ
cloudflarezcf-rayzCloudflare protectionzaccess deniedZblockedzAccess denied messageu   🚫 BLOCKED: z Forbidden detected on z. Blocking indicators: , z
. Server: ZUnknownz403 Forbidden on z( (no specific blocking indicators found)  u/   ⚠️  RATE LIMITED: 429 Too Many Requests on z-. Consider increasing delay between requests.)statushasattrr<   lowerheadersget
isinstancebytesdecodestrappendloggererrorr   joinwarningr8   r   _retry)r   r!   responser"   Zcontent_lowerZserver_headerZblocking_indicatorsreasonr   r   r   process_responseA   s>    "





*
z&CustomRetryMiddleware.process_responsec                 C   s   |j ddd }|| jkrd| tdd }|jd| d| d| j d|d	d
	 | }||j d< d|_|j	d |_	t
| |S |jd| d| d dS )z'Retry request with exponential backoff.retry_timesr      r   z	Retrying z
 (attempt r   z) after z.2fsTzGave up retrying z after z	 attemptsN)metarG   r5   r   ZuniformrM   debugcopydont_filterpriorityr   r    rN   )r   r!   rS   r"   rU   r   	retry_reqr   r   r   rQ   q   s    
*

zCustomRetryMiddleware._retry)r&   r'   r(   r)   r   rT   rQ   __classcell__r   r   r9   r   r-   9   s   0r-   c                   @   s4   e Zd ZdZd
edddZedd Zdd	 ZdS )CustomHeadersMiddlewarez-Middleware to set custom headers from config.N
user_agentc                 C   s   |pd| _ i | _dS )zInitialize headers middleware.zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36N)ra   Zcustom_headers)r   ra   r   r   r   r      s    
z CustomHeadersMiddleware.__init__c                 C   s   |j d}| |dS )r   Z
USER_AGENTr`   r   rG   )r   r   ra   r   r   r   r      s    z$CustomHeadersMiddleware.from_crawlerc                 C   s
  | j r| j |jd< t|drX|jrX|jdi }| D ]\}}| dkr8||j|< q8d|jkrd|jkrxd|jd< d|jkrd	|jd< d
|jkrd|jd
< d|jkrd|jd< d|jkrd|jd< d|jkrd|jd< d|jkrd|jd< d|jkrd|jd< dS )z Process request and set headers.z
User-AgentconfigrF   z
user-agentzetsy.comZRefererzhttps://www.etsy.com/zAccept-Languagezen-US,en;q=0.9zAccept-Encodingzgzip, deflate, brZ
Connectionz
keep-alivezUpgrade-Insecure-Requests1zSec-Fetch-DestZdocumentzSec-Fetch-ModeZnavigatezSec-Fetch-Sitezsame-originN)ra   rF   rD   rc   rG   itemsrE   r   )r   r!   r"   rF   keyvaluer   r   r   r%      s2    














z'CustomHeadersMiddleware.process_request)N)	r&   r'   r(   r)   rK   r   r,   r   r%   r   r   r   r   r_      s
   
r_   c                   @   s   e Zd ZdZd ee eeee	ddd	Z
ed
d Zdd Zdd ZedddZedddZedddZdd Zdd Zdd Zdd ZdS )!ProxyMiddlewarea=  
    Middleware to handle proxy rotation with health checking and auto-removal of bad proxies.
    
    Features:
    - Health tracking (success/failure counts, failure rate)
    - Auto-remove bad proxies (when failure rate exceeds threshold)
    - Retry with different proxy on failure
    - Statistics tracking
    Nrotater      Tproxies
proxy_modemax_failure_ratemin_requestsenable_health_checkc                 C   sn   |rt |ng | _t | j| _|| _d| _|| _|| _|| _i | _| jD ] }|| jkrHdddd| j|< qHdS )a  
        Initialize proxy middleware.
        
        Args:
            proxies: List of proxy URLs (e.g., ['http://proxy1:port', 'http://proxy2:port'])
            proxy_mode: 'rotate' (rotate through proxies) or 'random' (random selection)
            max_failure_rate: Maximum failure rate (0.0-1.0) before removing proxy (default: 0.5 = 50%)
            min_requests: Minimum requests before evaluating failure rate (default: 5)
            enable_health_check: Enable health checking and auto-removal (default: True)
        r   successfailurelast_failureN)	listrl   original_proxiesrm   current_proxy_indexrp   rn   ro   proxy_stats)r   rl   rm   rn   ro   rp   proxyr   r   r   r      s    

zProxyMiddleware.__init__c                 C   sX   |j dg }|j dd}|j dd}|j dd}|j dd	}| |||||d
S )r   ZPROXIESZ
PROXY_MODEri   ZPROXY_MAX_FAILURE_RATEr   ZPROXY_MIN_REQUESTSrj   ZPROXY_ENABLE_HEALTH_CHECKTrk   )r   r7   rG   r   r4   r   )r   r   rl   rm   rn   ro   rp   r   r   r   r      s    zProxyMiddleware.from_crawlerc                 C   s   | j s| jS g }| jD ]j}| j|ddd}|d |d  }|| jk rT|| q|dkrh|d | nd}|| jkr|| q|S )z,Get list of available (not removed) proxies.r   )rr   rs   rr   rs   )rp   rl   rx   rG   ro   rL   rn   )r   	availablery   statstotalfailure_rater   r   r   _get_available_proxies   s    



z&ProxyMiddleware._get_available_proxiesc                 C   sj   |   }|s | jr| jd S dS | jdkr4t|S |rb|| jt|  }| jd t| | _|S dS dS )z(Select a proxy based on mode and health.r   Nr   rV   )r~   rl   rm   r   choicerw   len)r   rz   ry   r   r   r   _select_proxy	  s    


zProxyMiddleware._select_proxyry   c                 C   s@   |r<| j r<|| jkr&dddd| j|< | j| d  d7  < dS )zMark proxy as successful.r   rq   rr   rV   N)rp   rx   )r   ry   r"   r   r   r   _mark_proxy_success  s    

z#ProxyMiddleware._mark_proxy_successc                 C   s   |r| j r|| jkr&dddd| j|< | j| }|d  d7  < t |d< |d |d  }|dkrp|d | nd}|| jkr|| jkr|jd| d|d	d
|d  d| d	 dS )z4Mark proxy as failed and check if should be removed.r   rq   rs   rV   rt   rr   u   ⚠️  Proxy z has high failure rate: .1%z (r   z"). Will be excluded from rotation.N)rp   rx   r   ro   rn   rM   rP   )r   ry   r"   r{   r|   r}   r   r   r   _mark_proxy_failure%  s    


"z#ProxyMiddleware._mark_proxy_failure)returnc                 C   s   | j s
dS g }| j  D ]Z\}}|d |d  }|dkr|d | }|d| d|d  d|d  d|d	d
	 q|rd|S dS )z Get summary of proxy statistics.zNo proxy statistics availablerr   rs   r   z  z: z
 success, z
 failure (r   z success rate)
zNo requests processed yet)rx   re   rL   rO   )r   linesry   r{   r|   Zsuccess_rater   r   r   _get_proxy_stats_summary9  s    &z(ProxyMiddleware._get_proxy_stats_summaryc                 C   s   d|j krdS t|dr|jr|jdi dg }|rt|}|| jkr|| _t|| _|D ] }|| jkr^dddd| j|< q^|jdi dd	| _| jsdS | 	 }|s|j
d
 dS ||j d< |j dd|j d< |j
d| d|j d  d dS )zSet proxy for request.ry   Nrc   rl   ru   r   rq   moderi   z/No available proxies, skipping proxy assignmentproxy_retry_countzUsing proxy: z	 (retry: ))rX   rD   rc   rG   ru   rv   rl   rx   rm   r   rM   rP   rY   )r   r!   r"   Zconfig_proxiesZnew_proxiesry   r   r   r   r%   J  s.    




 zProxyMiddleware.process_requestc              	   C   sv   |j d}|rrd|j  kr&dk r8n n| || n:|jdkrr| || |jd| d|j d|j d |S )	z.Track proxy success/failure based on response.ry      i  )r;   rB   zProxy z
 returned z for z. Marking as failure.)rX   rG   rC   r   r   rM   rP   r   )r   r!   rR   r"   ry   r   r   r   rT   r  s    
z ProxyMiddleware.process_responsec           	         s  |j d  r |  | |j dd}d}||k r|  } fdd|D }|r| jdkrjt|n|d }|jd| d	  d
|d  d| d	 |	 }||j d< |d |j d< d|_
|S |jd|  n&|jd| d|j d  d|  dS )z3Handle proxy errors and retry with different proxy.ry   r   r   r.   c                    s   g | ]}| kr|qS r   r   r0   pr   r   r   
<listcomp>  s      z5ProxyMiddleware.process_exception.<locals>.<listcomp>r   u$   🔄 Retrying with different proxy: z (previous: z
, attempt rV   r   r   TuH   ❌ No available proxies left. All proxies have failed. Original error: u   ❌ Max proxy retries (z) reached for z. Last proxy: z	, Error: N)rX   rG   r   r~   rm   r   r   rM   inforZ   r[   rN   r   )	r   r!   Z	exceptionr"   r   Zmax_proxy_retriesrz   Z	new_proxyr]   r   r   r   process_exception  s2     
z!ProxyMiddleware.process_exceptionc                    sh   | j rd|jd |j|   |    fdd| jD }|rd|jdt| dd|  dS )z(Log proxy statistics when spider closes.u   📊 Proxy Statistics Summary:c                    s   g | ]}| kr|qS r   r   r   rz   r   r   r     s      z1ProxyMiddleware.spider_closed.<locals>.<listcomp>u   ⚠️  Removed z bad proxy(ies): rA   N)	rx   rM   r   r   r~   rv   rP   r   rO   )r   r"   Zremovedr   r   r   spider_closed  s    zProxyMiddleware.spider_closed)Nri   r   rj   T)r&   r'   r(   r)   r   ru   rK   r*   r/   r+   r   r,   r   r~   r   r   r   r   r%   rT   r   r   r   r   r   r   rh      s*   
     
 
(/rh   c                   @   sP   e Zd ZdZdee dddZedd Zdd	 Z	d
d Z
dd Zdd ZdS )StateMiddlewarez5Middleware to save crawl state for resume capability.N
state_filec                 C   s   || _ t | _dS )zInitialize state middleware.N)r   r6   visited_urls)r   r   r   r   r   r     s    zStateMiddleware.__init__c                 C   s   |j d}| |dS )r   Z
STATE_FILEr   rb   )r   r   r   r   r   r   r     s    zStateMiddleware.from_crawlerc              
   C   s   | j rddl}ddlm} || j }| rzPt|d<}||}t|dg | _	|j
dt| j	 d W 5 Q R X W n4 tk
r } z|j
d|  W 5 d}~X Y nX dS )	z3Called when spider is opened. Load state if exists.r   NPathrr   zLoaded state: z visited URLszCould not load state: )r   jsonpathlibr   existsopenloadr6   rG   r   rM   r   r   	ExceptionrP   )r   r"   r   r   
state_pathfstateer   r   r   spider_opened  s    

(zStateMiddleware.spider_openedc                 C   s\   | j rX|j| jkrXt|dg }|j|krX|jd|j  ddlm} |d|j dS )z!Check if URL was already visited.
start_urlszSkipping already visited URL: r   )IgnoreRequestzURL already visited: N)r   r   r   getattrrM   rY   Zscrapy.exceptionsr   )r   r!   r"   r   r   r   r   r   r%     s    
zStateMiddleware.process_requestc                 C   s   | j |j |S )z.Mark URL as visited after successful response.)r   addr   )r   r!   rR   r"   r   r   r   rT     s    z StateMiddleware.process_responsec              
   C   s   | j rddl}ddlm} || j }|jjddd t| jt| jd}zHt	|d}|j
||dd	 W 5 Q R X |jd
t| j d|  W n4 tk
r } z|jd|  W 5 d}~X Y nX dS )z)Called when spider is closed. Save state.r   Nr   T)parentsexist_ok)r   Ztotal_visitedwr   )indentzSaved state: z visited URLs to zCould not save state: )r   r   r   r   parentmkdirru   r   r   r   dumprM   r   r   rN   )r   r"   r   r   r   r   r   r   r   r   r   r     s    
"zStateMiddleware.spider_closed)N)r&   r'   r(   r)   r   rK   r   r,   r   r   r%   rT   r   r   r   r   r   r     s   
r   )r)   r   r   typingr   r   Zscrapyr   Z"scrapy.downloadermiddlewares.retryr   Z&scrapy.downloadermiddlewares.useragentr   Zscrapy.utils.responser   r   r-   r_   rh   r   r   r   r   r   <module>   s   -O1  