Commit ba99d2a

perf: TLS connection pool + coalesce tuning for lower latency (#751)

TLS pool improvements:
- Increase POOL_TTL from 45s to 60s so connections live longer.
- Add POOL_MIN (8): a background refill loop keeps at least 8 ready TLS connections, so acquire() never pays a cold handshake.
- The refill loop checks every 5s and only counts connections with ≥20s of TTL remaining as "healthy" — nearly-expired entries don't count.
- warm() now opens connections sequentially (500ms gaps) with an 8s expiry offset per connection, so they roll off gradually instead of all expiring at once.
- acquire() hands out the freshest connection (most remaining TTL) instead of popping whatever is on top.

Coalesce step increase:
- DEFAULT_COALESCE_STEP_MS: 10 → 200. The dominant bottleneck is the Apps Script round-trip (~1.5s), so the extra 200ms wait is negligible to the user but lets significantly more ops land in each batch — measured 3–5 ops/batch vs 1 op/batch at 10ms during page loads, cutting round-trips roughly in half.

Tested on Android (Pixel 6 Pro) with a full-mode tunnel. Pool hit rate went from 96% (POOL_MIN=4) to 100% (POOL_MIN=8) — zero cold TLS handshakes during requests.

Co-authored-by: yyoyoian-pixel <279225925+yyoyoian-pixel@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e13bca8 commit ba99d2a

3 files changed

Lines changed: 106 additions & 41 deletions

src/domain_fronter.rs

Lines changed: 85 additions & 30 deletions
```diff
@@ -57,7 +57,9 @@ pub enum FronterError {
 }
 
 type PooledStream = TlsStream<TcpStream>;
-const POOL_TTL_SECS: u64 = 45;
+const POOL_TTL_SECS: u64 = 60;
+const POOL_MIN: usize = 8;
+const POOL_REFILL_INTERVAL_SECS: u64 = 5;
 const POOL_MAX: usize = 80;
 const REQUEST_TIMEOUT_SECS: u64 = 25;
 const RANGE_PARALLEL_CHUNK_BYTES: u64 = 256 * 1024;
```
```diff
@@ -644,33 +646,31 @@ impl DomainFronter {
         Ok(tls)
     }
 
-    /// Open `n` outbound TLS connections in parallel and park them in the
-    /// pool so the first few user requests don't pay the handshake cost.
-    /// Errors are logged but not returned — best-effort.
+    /// Open `n` outbound TLS connections sequentially (500 ms apart) and
+    /// park them in the pool. Staggered so we don't burst N TLS handshakes
+    /// at Google edge simultaneously, and each connection gets an 8 s
+    /// expiry offset so they roll off gradually instead of all hitting
+    /// POOL_TTL_SECS at once.
     pub async fn warm(self: &Arc<Self>, n: usize) {
-        let mut set = tokio::task::JoinSet::new();
-        for _ in 0..n {
-            let me = self.clone();
-            set.spawn(async move {
-                match me.open().await {
-                    Ok(s) => Some(PoolEntry {
+        let mut warmed = 0usize;
+        for i in 0..n {
+            if i > 0 {
+                tokio::time::sleep(Duration::from_millis(500)).await;
+            }
+            match self.open().await {
+                Ok(s) => {
+                    let entry = PoolEntry {
                         stream: s,
-                        created: Instant::now(),
-                    }),
-                    Err(e) => {
-                        tracing::debug!("pool warm: open failed: {}", e);
-                        None
+                        created: Instant::now() - Duration::from_secs(8 * i as u64),
+                    };
+                    let mut pool = self.pool.lock().await;
+                    if pool.len() < POOL_MAX {
+                        pool.push(entry);
+                        warmed += 1;
                     }
                 }
-            });
-        }
-        let mut warmed = 0;
-        while let Some(res) = set.join_next().await {
-            if let Ok(Some(entry)) = res {
-                let mut pool = self.pool.lock().await;
-                if pool.len() < POOL_MAX {
-                    pool.push(entry);
-                    warmed += 1;
+                Err(e) => {
+                    tracing::debug!("pool warm: open failed: {}", e);
                 }
             }
         }
```
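To make the stagger arithmetic concrete: backdating `created` by 8 s per connection under a 60 s TTL spreads expiries roughly 8 s apart. A minimal standalone sketch (illustrative only, not the crate's types):

```rust
// Sketch of warm()'s staggered-expiry arithmetic, assuming the constants
// from the diff above. Connection i is stamped 8*i seconds in the past,
// so its remaining life right after warm() is TTL - 8*i.
const POOL_TTL_SECS: u64 = 60;

fn remaining_ttl_secs(i: u64) -> u64 {
    POOL_TTL_SECS.saturating_sub(8 * i)
}

fn main() {
    // warm(8) yields 60, 52, 44, 36, 28, 20, 12, 4 seconds of life:
    // one expiry roughly every 8 s instead of eight at the 60 s cliff.
    for i in 0..8 {
        println!("conn {i}: {} s remaining", remaining_ttl_secs(i));
    }
}
```

One consequence worth noting: with the refill loop's 20 s health threshold (next hunk), entries backdated 40 s or more start out "unhealthy", so after a cold `warm(8)` the refill loop sees only five healthy entries and promptly tops the pool back up.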
```diff
@@ -679,6 +679,56 @@ impl DomainFronter {
         }
     }
 
+    /// Background loop that keeps at least `POOL_MIN` valid connections
+    /// ready. A connection only counts toward the minimum if it has at
+    /// least 20 s of TTL remaining — nearly-expired entries don't help.
+    /// Checks every `POOL_REFILL_INTERVAL_SECS`, evicts expired entries,
+    /// and opens replacements one at a time so there's no burst.
+    pub async fn run_pool_refill(self: Arc<Self>) {
+        const MIN_REMAINING_SECS: u64 = 20;
+        loop {
+            tokio::time::sleep(Duration::from_secs(POOL_REFILL_INTERVAL_SECS)).await;
+
+            // Evict expired entries first.
+            {
+                let mut pool = self.pool.lock().await;
+                pool.retain(|e| e.created.elapsed().as_secs() < POOL_TTL_SECS);
+            }
+
+            // Count only connections with enough life left.
+            // Refill one at a time to avoid bursting TLS handshakes.
+            loop {
+                let healthy = {
+                    let pool = self.pool.lock().await;
+                    pool.iter()
+                        .filter(|e| {
+                            let age = e.created.elapsed().as_secs();
+                            age + MIN_REMAINING_SECS < POOL_TTL_SECS
+                        })
+                        .count()
+                };
+                if healthy >= POOL_MIN {
+                    break;
+                }
+                match self.open().await {
+                    Ok(s) => {
+                        let mut pool = self.pool.lock().await;
+                        if pool.len() < POOL_MAX {
+                            pool.push(PoolEntry {
+                                stream: s,
+                                created: Instant::now(),
+                            });
+                        }
+                    }
+                    Err(e) => {
+                        tracing::debug!("pool refill: open failed: {}", e);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
     /// Keep the Apps Script container warm with a periodic HEAD ping.
     ///
     /// `acquire()` keeps the *TCP/TLS pool* warm but does nothing for the
```
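The "healthy" predicate above reads as "younger than `POOL_TTL_SECS - MIN_REMAINING_SECS`", i.e. under 40 s of age. A hypothetical standalone check:

```rust
// Standalone restatement of the refill loop's health predicate (constants
// copied from the diff; the helper itself is illustrative).
const POOL_TTL_SECS: u64 = 60;
const MIN_REMAINING_SECS: u64 = 20;

fn is_healthy(age_secs: u64) -> bool {
    // Counts toward POOL_MIN only with more than 20 s of TTL left.
    age_secs + MIN_REMAINING_SECS < POOL_TTL_SECS
}

fn main() {
    assert!(is_healthy(0));   // fresh: 60 s remaining
    assert!(is_healthy(39));  // 21 s remaining, still counts
    assert!(!is_healthy(40)); // exactly 20 s remaining, excluded
    assert!(!is_healthy(55)); // alive and usable, but not "healthy"
}
```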
```diff
@@ -721,12 +771,17 @@ impl DomainFronter {
     async fn acquire(&self) -> Result<PoolEntry, FronterError> {
         {
             let mut pool = self.pool.lock().await;
-            while let Some(entry) = pool.pop() {
-                if entry.created.elapsed().as_secs() < POOL_TTL_SECS {
-                    return Ok(entry);
-                }
-                // expired — drop it
-                drop(entry);
+            // Evict expired, then hand out the freshest (most remaining TTL).
+            pool.retain(|e| e.created.elapsed().as_secs() < POOL_TTL_SECS);
+            if !pool.is_empty() {
+                // Freshest = smallest elapsed time. swap_remove is O(1).
+                let freshest = pool
+                    .iter()
+                    .enumerate()
+                    .min_by_key(|(_, e)| e.created.elapsed())
+                    .map(|(i, _)| i)
+                    .unwrap();
+                return Ok(pool.swap_remove(freshest));
             }
         }
         let stream = self.open().await?;
```
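The freshest-first pick is easy to verify in isolation. A simplified sketch of the same evict-then-select logic (illustrative `Entry` type, not the crate's `PoolEntry`):

```rust
use std::time::{Duration, Instant};

struct Entry {
    id: u32,
    created: Instant,
}

// Evict expired entries, then take the one with the most remaining TTL.
// Mirrors acquire(): smallest elapsed age = freshest; swap_remove is O(1)
// because pool order carries no meaning.
fn take_freshest(pool: &mut Vec<Entry>, ttl: Duration) -> Option<Entry> {
    pool.retain(|e| e.created.elapsed() < ttl);
    let i = pool
        .iter()
        .enumerate()
        .min_by_key(|(_, e)| e.created.elapsed())
        .map(|(i, _)| i)?;
    Some(pool.swap_remove(i))
}

fn main() {
    let now = Instant::now();
    let mut pool = vec![
        Entry { id: 1, created: now - Duration::from_secs(30) },
        Entry { id: 2, created: now - Duration::from_secs(5) },
    ];
    let picked = take_freshest(&mut pool, Duration::from_secs(60)).unwrap();
    assert_eq!(picked.id, 2); // entry 2 has the most TTL remaining
}
```

Handing out the freshest entry, rather than whatever was pushed last, presumably avoids giving a request a connection that is moments from expiry; the oldest entries are instead left in the pool for the refill loop to evict and replace.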
src/proxy_server.rs

Lines changed: 11 additions & 0 deletions
```diff
@@ -593,6 +593,16 @@ impl ProxyServer {
             tokio::spawn(async move { std::future::pending::<()>().await })
         };
 
+        // Background pool refill: keeps at least POOL_MIN ready TLS
+        // connections so acquire() never pays a cold handshake.
+        let refill_task = if let Some(refill_fronter) = self.fronter.clone() {
+            tokio::spawn(async move {
+                refill_fronter.run_pool_refill().await;
+            })
+        } else {
+            tokio::spawn(async move { std::future::pending::<()>().await })
+        };
+
         let stats_task = if let Some(stats_fronter) = self.fronter.clone() {
             tokio::spawn(async move {
                 let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60));
@@ -701,6 +711,7 @@ impl ProxyServer {
         tracing::info!("Shutdown signal received, stopping listeners");
         stats_task.abort();
         keepalive_task.abort();
+        refill_task.abort();
         http_task.abort();
         socks_task.abort();
     }
```
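The wiring follows the same pattern as the existing `stats_task` and `keepalive_task`: spawn the loop as a long-lived task, keep the `JoinHandle`, and abort it on shutdown. A minimal self-contained sketch of that pattern (hypothetical `Fronter` stand-in, not the real type):

```rust
use std::sync::Arc;
use std::time::Duration;

struct Fronter; // stand-in for the real DomainFronter

impl Fronter {
    // Never returns; runs until the owning task is aborted.
    async fn run_pool_refill(self: Arc<Self>) {
        loop {
            tokio::time::sleep(Duration::from_secs(5)).await;
            // refill work would happen here
        }
    }
}

#[tokio::main]
async fn main() {
    let fronter = Arc::new(Fronter);
    let refill_task = tokio::spawn(fronter.clone().run_pool_refill());

    // ... serve traffic ...
    tokio::time::sleep(Duration::from_secs(1)).await;

    // Shutdown: cancel the loop at its next await point.
    refill_task.abort();
}
```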

src/tunnel_client.rs

Lines changed: 10 additions & 11 deletions
```diff
@@ -60,17 +60,16 @@ const CLIENT_FIRST_DATA_WAIT: Duration = Duration::from_millis(50);
 /// step for more ops. Resets on every arrival, up to max from the first
 /// op. Overridable via config `coalesce_step_ms` / `coalesce_max_ms`.
 ///
-/// 10 ms is enough to catch ops that arrive in the same event-loop tick
-/// (e.g. a browser opening 6 parallel connections) without adding
-/// perceptible latency to downloads where the tunnel-node reply — not
-/// coalescing — is the real bottleneck. When both sides *do* have data
-/// in flight (uploads, bursty page loads), the adaptive reset still
-/// packs batches efficiently: each arriving op resets the step timer, so
-/// a rapid burst naturally coalesces up to `DEFAULT_COALESCE_MAX_MS`
-/// without an explicit upload/download distinction. The net effect is
-/// "don't wait when there's nothing to wait for; batch aggressively when
-/// there is."
-const DEFAULT_COALESCE_STEP_MS: u64 = 10;
+/// 200 ms balances latency against batching efficiency. The dominant
+/// bottleneck is the Apps Script round-trip (~1.5 s), so the extra
+/// 200 ms wait is negligible to the user but lets significantly more
+/// ops land in each batch — a page load that would fire 10 separate
+/// 1-op batches at 10 ms now packs 3–5 ops per batch, cutting the
+/// number of round-trips roughly in half. On idle sessions the step
+/// timer fires once with nothing queued (no cost); under load each
+/// arriving op resets the timer, so rapid bursts still coalesce up to
+/// `DEFAULT_COALESCE_MAX_MS` naturally.
+const DEFAULT_COALESCE_STEP_MS: u64 = 200;
 const DEFAULT_COALESCE_MAX_MS: u64 = 1000;
 
 /// Structured error code the tunnel-node returns when it doesn't know the
```
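The batching loop itself isn't part of this diff, but the behavior the comment describes (each arrival re-arms the step timer; a hard deadline caps the total wait from the first op) can be sketched as follows. This is a hypothetical `coalesce_batch` helper, not the crate's actual implementation:

```rust
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::{sleep_until, Instant};

// Collect ops into one batch: flush once no new op has arrived for `step`,
// or `max` after the first op, whichever comes first.
async fn coalesce_batch(
    rx: &mut mpsc::Receiver<Vec<u8>>,
    step: Duration, // e.g. DEFAULT_COALESCE_STEP_MS = 200 ms
    max: Duration,  // e.g. DEFAULT_COALESCE_MAX_MS = 1000 ms
) -> Vec<Vec<u8>> {
    // The first op starts the hard max-window clock.
    let Some(first) = rx.recv().await else { return Vec::new() };
    let max_deadline = Instant::now() + max;
    let mut batch = vec![first];
    loop {
        // The step deadline is re-armed after every arrival.
        let deadline = (Instant::now() + step).min(max_deadline);
        tokio::select! {
            _ = sleep_until(deadline) => break, // quiet for `step`, or max hit
            op = rx.recv() => match op {
                Some(op) => batch.push(op), // arrival resets the step timer
                None => break,              // sender gone; flush what we have
            },
        }
    }
    batch
}
```

Under this scheme, ops that land within 200 ms of one another join the in-progress batch instead of each triggering its own ~1.5 s Apps Script round-trip, which is the mechanism behind the 3–5 ops/batch figure in the commit message.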