Skip to content

Commit d6d1006

Browse files
feat(tunnel-client): blacklist deployments after sustained timeouts (#319)
1 parent 8758a75 commit d6d1006

2 files changed

Lines changed: 90 additions & 3 deletions

File tree

src/domain_fronter.rs

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,13 @@ pub struct DomainFronter {
102102
inflight: Arc<Mutex<HashMap<String, broadcast::Sender<Vec<u8>>>>>,
103103
coalesced: AtomicU64,
104104
blacklist: Arc<std::sync::Mutex<HashMap<String, Instant>>>,
105+
/// Per-deployment rolling timeout counter. Maps `script_id` →
106+
/// `(window_start, strike_count)`. Reset when the window expires
107+
/// or when a batch succeeds. Triggers a short-cooldown blacklist
108+
/// at `TIMEOUT_STRIKE_LIMIT`. Distinct from `blacklist` because
109+
/// strike state is per-deployment health bookkeeping, not the
110+
/// permanent ban list.
111+
script_timeouts: Arc<std::sync::Mutex<HashMap<String, (Instant, u32)>>>,
105112
relay_calls: AtomicU64,
106113
relay_failures: AtomicU64,
107114
bytes_relayed: AtomicU64,
@@ -146,6 +153,21 @@ impl HostStat {
146153

147154
/// Default blacklist cooldown (10 min) used by `blacklist_script` for
/// non-timeout failures. See `TIMEOUT_BLACKLIST_COOLDOWN_SECS` for the
/// shorter cooldown used by the timeout-strike path.
const BLACKLIST_COOLDOWN_SECS: u64 = 600;

/// Strike window for the timeout-strike blacklist heuristic. (A fixed
/// window anchored at the first strike — reset when it expires — not a
/// truly sliding one.) Three timeouts within this window on a single
/// deployment trip the blacklist. Tuned so a single cold-start stall
/// plus one transient network blip won't false-trigger, but a
/// deployment that's actually dead (stale `TUNNEL_SERVER_URL`, paused
/// project, dropped script) fails fast instead of poisoning
/// round-robin until the user notices.
const TIMEOUT_STRIKE_WINDOW: Duration = Duration::from_secs(30);

/// Number of timeouts inside `TIMEOUT_STRIKE_WINDOW` that triggers the
/// short-cooldown blacklist in `record_timeout_strike`.
const TIMEOUT_STRIKE_LIMIT: u32 = 3;

/// Cooldown for a deployment blacklisted via the timeout-strike path.
/// Distinct from `BLACKLIST_COOLDOWN_SECS` (10 min) because timeouts
/// are a much noisier signal than quota errors — if the deployment
/// recovers, we want to rejoin in minutes, not after a 10-min penalty.
const TIMEOUT_BLACKLIST_COOLDOWN_SECS: u64 = 120;
170+
149171
/// Request payload sent to Apps Script (single, non-batch).
150172
#[derive(Serialize)]
151173
struct RelayRequest<'a> {
@@ -258,6 +280,7 @@ impl DomainFronter {
258280
inflight: Arc::new(Mutex::new(HashMap::new())),
259281
coalesced: AtomicU64::new(0),
260282
blacklist: Arc::new(std::sync::Mutex::new(HashMap::new())),
283+
script_timeouts: Arc::new(std::sync::Mutex::new(HashMap::new())),
261284
relay_calls: AtomicU64::new(0),
262285
relay_failures: AtomicU64::new(0),
263286
bytes_relayed: AtomicU64::new(0),
@@ -414,17 +437,67 @@ impl DomainFronter {
414437
}
415438

416439
fn blacklist_script(&self, script_id: &str, reason: &str) {
417-
let until = Instant::now() + Duration::from_secs(BLACKLIST_COOLDOWN_SECS);
440+
self.blacklist_script_for(
441+
script_id,
442+
Duration::from_secs(BLACKLIST_COOLDOWN_SECS),
443+
reason,
444+
);
445+
}
446+
447+
fn blacklist_script_for(&self, script_id: &str, cooldown: Duration, reason: &str) {
448+
let until = Instant::now() + cooldown;
418449
let mut bl = self.blacklist.lock().unwrap();
419450
bl.insert(script_id.to_string(), until);
420451
tracing::warn!(
421452
"blacklisted script {} for {}s: {}",
422453
mask_script_id(script_id),
423-
BLACKLIST_COOLDOWN_SECS,
454+
cooldown.as_secs(),
424455
reason
425456
);
426457
}
427458

459+
/// Record a batch timeout against `script_id`. After
460+
/// `TIMEOUT_STRIKE_LIMIT` timeouts inside `TIMEOUT_STRIKE_WINDOW`
461+
/// the deployment is blacklisted with a short cooldown so the
462+
/// round-robin stops sending real traffic to a deployment that's
463+
/// hung (most commonly: stale `TUNNEL_SERVER_URL` after the
464+
/// tunnel-node moved hosts).
465+
pub(crate) fn record_timeout_strike(&self, script_id: &str) {
466+
let now = Instant::now();
467+
let mut counts = self.script_timeouts.lock().unwrap();
468+
let entry = counts
469+
.entry(script_id.to_string())
470+
.or_insert((now, 0));
471+
if now.duration_since(entry.0) > TIMEOUT_STRIKE_WINDOW {
472+
*entry = (now, 1);
473+
} else {
474+
entry.1 += 1;
475+
}
476+
let strikes = entry.1;
477+
if strikes >= TIMEOUT_STRIKE_LIMIT {
478+
counts.remove(script_id);
479+
drop(counts);
480+
self.blacklist_script_for(
481+
script_id,
482+
Duration::from_secs(TIMEOUT_BLACKLIST_COOLDOWN_SECS),
483+
&format!(
484+
"{} timeouts in {}s",
485+
strikes,
486+
TIMEOUT_STRIKE_WINDOW.as_secs()
487+
),
488+
);
489+
}
490+
}
491+
492+
/// Clear the timeout strike counter for `script_id`. Called after
493+
/// a batch succeeds so a recovered deployment doesn't keep stale
494+
/// strikes from hours ago — three strikes must occur within one
495+
/// real failure burst, not accumulate across unrelated incidents.
496+
pub(crate) fn record_batch_success(&self, script_id: &str) {
497+
let mut counts = self.script_timeouts.lock().unwrap();
498+
counts.remove(script_id);
499+
}
500+
428501
/// Log a relay failure with extra guidance on cert-validation cases.
429502
/// Rate-limited so a flood of identical "UnknownIssuer" errors doesn't
430503
/// fill the log.

src/tunnel_client.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt};
2323
use tokio::net::TcpStream;
2424
use tokio::sync::{mpsc, oneshot, Semaphore};
2525

26-
use crate::domain_fronter::{BatchOp, DomainFronter, TunnelResponse};
26+
use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse};
2727

2828
/// Apps Script allows 30 concurrent executions per account / deployment.
2929
const CONCURRENCY_PER_DEPLOYMENT: usize = 30;
@@ -827,6 +827,7 @@ async fn fire_batch(
827827

828828
match result {
829829
Ok(Ok(batch_resp)) => {
830+
f.record_batch_success(&script_id);
830831
for (idx, reply) in data_replies {
831832
if let Some(resp) = batch_resp.r.get(idx) {
832833
let _ = reply.send(Ok((resp.clone(), script_id.clone())));
@@ -836,13 +837,26 @@ async fn fire_batch(
836837
}
837838
}
838839
Ok(Err(e)) => {
840+
// Read-side timeout from `domain_fronter`: Apps Script didn't
841+
// start streaming response bytes within the per-read deadline.
842+
// Common cause: deployment's `TUNNEL_SERVER_URL` points at a
843+
// dead host, so UrlFetchApp inside Apps Script hangs until its
844+
// own internal connect timeout. Strike-counter blacklists the
845+
// deployment after a sustained pattern.
846+
if matches!(e, FronterError::Timeout) {
847+
f.record_timeout_strike(&script_id);
848+
}
839849
let err_msg = format!("{}", e);
840850
tracing::warn!("batch failed: {}", err_msg);
841851
for (_, reply) in data_replies {
842852
let _ = reply.send(Err(err_msg.clone()));
843853
}
844854
}
845855
Err(_) => {
856+
// Whole-batch budget (`BATCH_TIMEOUT`, 30 s) elapsed. Even
857+
// stronger signal than a per-read timeout — count it the same
858+
// way so a truly-stuck deployment exits round-robin fast.
859+
f.record_timeout_strike(&script_id);
846860
tracing::warn!("batch timed out after {:?} ({} ops)", BATCH_TIMEOUT, n_ops);
847861
for (_, reply) in data_replies {
848862
let _ = reply.send(Err("batch timed out".into()));

0 commit comments

Comments (0)