@@ -102,6 +102,13 @@ pub struct DomainFronter {
102102 inflight : Arc < Mutex < HashMap < String , broadcast:: Sender < Vec < u8 > > > > > ,
103103 coalesced : AtomicU64 ,
104104 blacklist : Arc < std:: sync:: Mutex < HashMap < String , Instant > > > ,
105+ /// Per-deployment rolling timeout counter. Maps `script_id` →
106+ /// `(window_start, strike_count)`. Reset when the window expires
107+ /// or when a batch succeeds. Triggers a short-cooldown blacklist
108+ /// at `TIMEOUT_STRIKE_LIMIT`. Distinct from `blacklist` because
109+ /// strike state is per-deployment health bookkeeping, not the
110+ /// permanent ban list.
111+ script_timeouts : Arc < std:: sync:: Mutex < HashMap < String , ( Instant , u32 ) > > > ,
105112 relay_calls : AtomicU64 ,
106113 relay_failures : AtomicU64 ,
107114 bytes_relayed : AtomicU64 ,
@@ -146,6 +153,21 @@ impl HostStat {
146153
147154const BLACKLIST_COOLDOWN_SECS : u64 = 600 ;
148155
156+ /// Sliding window for the timeout-strike blacklist heuristic. Three
157+ /// timeouts within this window on a single deployment trip the
158+ /// blacklist. Tuned so a single cold-start stall plus one transient
159+ /// network blip won't false-trigger, but a deployment that's actually
160+ /// dead (stale `TUNNEL_SERVER_URL`, paused project, dropped script)
161+ /// fails fast instead of poisoning round-robin until the user notices.
162+ const TIMEOUT_STRIKE_WINDOW : Duration = Duration :: from_secs ( 30 ) ;
163+ const TIMEOUT_STRIKE_LIMIT : u32 = 3 ;
164+
165+ /// Cooldown for a deployment blacklisted via the timeout-strike path.
166+ /// Distinct from `BLACKLIST_COOLDOWN_SECS` (10 min) because timeouts
167+ /// are a much noisier signal than quota errors — if the deployment
168+ /// recovers, we want to rejoin in minutes, not after a 10-min penalty.
169+ const TIMEOUT_BLACKLIST_COOLDOWN_SECS : u64 = 120 ;
170+
149171/// Request payload sent to Apps Script (single, non-batch).
150172#[ derive( Serialize ) ]
151173struct RelayRequest < ' a > {
@@ -258,6 +280,7 @@ impl DomainFronter {
258280 inflight : Arc :: new ( Mutex :: new ( HashMap :: new ( ) ) ) ,
259281 coalesced : AtomicU64 :: new ( 0 ) ,
260282 blacklist : Arc :: new ( std:: sync:: Mutex :: new ( HashMap :: new ( ) ) ) ,
283+ script_timeouts : Arc :: new ( std:: sync:: Mutex :: new ( HashMap :: new ( ) ) ) ,
261284 relay_calls : AtomicU64 :: new ( 0 ) ,
262285 relay_failures : AtomicU64 :: new ( 0 ) ,
263286 bytes_relayed : AtomicU64 :: new ( 0 ) ,
@@ -414,17 +437,67 @@ impl DomainFronter {
414437 }
415438
416439 fn blacklist_script ( & self , script_id : & str , reason : & str ) {
417- let until = Instant :: now ( ) + Duration :: from_secs ( BLACKLIST_COOLDOWN_SECS ) ;
440+ self . blacklist_script_for (
441+ script_id,
442+ Duration :: from_secs ( BLACKLIST_COOLDOWN_SECS ) ,
443+ reason,
444+ ) ;
445+ }
446+
447+ fn blacklist_script_for ( & self , script_id : & str , cooldown : Duration , reason : & str ) {
448+ let until = Instant :: now ( ) + cooldown;
418449 let mut bl = self . blacklist . lock ( ) . unwrap ( ) ;
419450 bl. insert ( script_id. to_string ( ) , until) ;
420451 tracing:: warn!(
421452 "blacklisted script {} for {}s: {}" ,
422453 mask_script_id( script_id) ,
423- BLACKLIST_COOLDOWN_SECS ,
454+ cooldown . as_secs ( ) ,
424455 reason
425456 ) ;
426457 }
427458
459+ /// Record a batch timeout against `script_id`. After
460+ /// `TIMEOUT_STRIKE_LIMIT` timeouts inside `TIMEOUT_STRIKE_WINDOW`
461+ /// the deployment is blacklisted with a short cooldown so the
462+ /// round-robin stops sending real traffic to a deployment that's
463+ /// hung (most commonly: stale `TUNNEL_SERVER_URL` after the
464+ /// tunnel-node moved hosts).
465+ pub ( crate ) fn record_timeout_strike ( & self , script_id : & str ) {
466+ let now = Instant :: now ( ) ;
467+ let mut counts = self . script_timeouts . lock ( ) . unwrap ( ) ;
468+ let entry = counts
469+ . entry ( script_id. to_string ( ) )
470+ . or_insert ( ( now, 0 ) ) ;
471+ if now. duration_since ( entry. 0 ) > TIMEOUT_STRIKE_WINDOW {
472+ * entry = ( now, 1 ) ;
473+ } else {
474+ entry. 1 += 1 ;
475+ }
476+ let strikes = entry. 1 ;
477+ if strikes >= TIMEOUT_STRIKE_LIMIT {
478+ counts. remove ( script_id) ;
479+ drop ( counts) ;
480+ self . blacklist_script_for (
481+ script_id,
482+ Duration :: from_secs ( TIMEOUT_BLACKLIST_COOLDOWN_SECS ) ,
483+ & format ! (
484+ "{} timeouts in {}s" ,
485+ strikes,
486+ TIMEOUT_STRIKE_WINDOW . as_secs( )
487+ ) ,
488+ ) ;
489+ }
490+ }
491+
492+ /// Clear the timeout strike counter for `script_id`. Called after
493+ /// a batch succeeds so a recovered deployment doesn't keep stale
494+ /// strikes from hours ago — three strikes must occur within one
495+ /// real failure burst, not accumulate across unrelated incidents.
496+ pub ( crate ) fn record_batch_success ( & self , script_id : & str ) {
497+ let mut counts = self . script_timeouts . lock ( ) . unwrap ( ) ;
498+ counts. remove ( script_id) ;
499+ }
500+
428501 /// Log a relay failure with extra guidance on cert-validation cases.
429502 /// Rate-limited so a flood of identical "UnknownIssuer" errors doesn't
430503 /// fill the log.
0 commit comments