forked from therealaleph/MasterHttpRelayVPN-RUST
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproxy_server.rs
More file actions
2916 lines (2709 loc) · 110 KB
/
proxy_server.rs
File metadata and controls
2916 lines (2709 loc) · 110 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
use std::collections::{HashMap, VecDeque};
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::{TcpListener, TcpStream, UdpSocket};
use tokio::sync::{mpsc, Mutex};
use tokio::task::JoinSet;
use tokio_rustls::rustls::client::danger::{
HandshakeSignatureValid, ServerCertVerified, ServerCertVerifier,
};
use tokio_rustls::rustls::pki_types::{CertificateDer, ServerName, UnixTime};
use tokio_rustls::rustls::server::Acceptor;
use tokio_rustls::rustls::{ClientConfig, DigitallySignedStruct, SignatureScheme};
use tokio_rustls::{LazyConfigAcceptor, TlsAcceptor, TlsConnector};
use crate::config::{Config, Mode};
use crate::domain_fronter::DomainFronter;
use crate::mitm::MitmCertManager;
use crate::tunnel_client::{decode_udp_packets, TunnelMux};
// Domains that are served from Google's core frontend IP pool and therefore
// respond correctly when we connect to `google_ip` with SNI=`front_domain`
// and Host=<the real domain>. Routing these via the tunnel instead of the
// Apps Script relay also avoids Apps Script's fixed "Google-Apps-Script"
// User-Agent, which makes Google serve the bot/no-JS fallback for search.
// Kept conservative: anything on a separate CDN (googlevideo, ytimg,
// doubleclick, etc.) is DROPPED because routing to the wrong backend breaks
// rather than helps. Those fall through to MITM+relay (slower but works).
// Domains that are hosted on the Google Front End and therefore reachable via
// the same SNI-rewrite tunnel used for www.google.com itself. Adding a suffix
// here means "TLS CONNECT to google_ip, SNI = front_domain, Host = real name"
// for requests to it — bypassing the Apps Script relay entirely, so there's no
// User-Agent locking and no Apps Script quota.
// When in doubt leave it out: sites that aren't actually on GFE will 404 or
// return a wrong-cert error instead of loading.
const SNI_REWRITE_SUFFIXES: &[&str] = &[
    // Core Google
    "google.com",
    "gstatic.com",
    "googleusercontent.com",
    "googleapis.com",
    "ggpht.com",
    // YouTube family
    "youtube.com",
    "youtu.be",
    "youtube-nocookie.com",
    "ytimg.com",
    // NOTE on `googlevideo.com`: v1.7.4 (#275) added this here on the
    // theory that video chunks should bypass the Apps Script relay.
    // **Reverted in v1.7.6** — multiple users (#275 amirabbas117, #281
    // mrerf) reported total YouTube breakage after v1.7.4. Root cause
    // is that googlevideo.com is served by Google's separate "EVA"
    // edge IPs, not the regular GFE IPs that the user's `google_ip`
    // typically points at. SNI-rewriting `googlevideo.com:443` to a
    // GFE IP got TLS handshake / wrong-cert errors for those users.
    // Pre-v1.7.4 behaviour (chunks via the Apps Script relay path —
    // slow but reliable on every GFE IP) is restored. If we ever want
    // direct googlevideo.com routing, it needs a separate config knob
    // that lets users specify their EVA edge IP independently.
    // Google Video Transport CDN — YouTube video chunks, Chrome
    // auto-updates, Google Play Store downloads. The single biggest
    // gap vs the upstream Python port: without these in the list
    // YouTube video playback stalls because every chunk tries to
    // traverse Apps Script instead of the direct GFE tunnel.
    "gvt1.com",
    "gvt2.com",
    // Ad + analytics infra. All on GFE, all previously broken the
    // same way YouTube was: SNI-blocked on Iranian DPI, but reachable
    // via `google_ip` with SNI rewritten.
    "doubleclick.net",
    "googlesyndication.com",
    "googleadservices.com",
    "google-analytics.com",
    "googletagmanager.com",
    "googletagservices.com",
    // fonts.googleapis.com is technically covered by the googleapis.com
    // suffix above, but mirroring Python's explicit listing makes the
    // intent obvious at a glance.
    "fonts.googleapis.com",
    // Blogger / Blog.google
    "blogspot.com",
    "blogger.com",
];
/// YouTube hosts that should be routed through the Apps Script relay
/// when `youtube_via_relay` is enabled — the API + HTML surfaces where
/// Restricted Mode is actually enforced (via the SNI=www.google.com
/// edge looking at the request). Issue #102 / #275.
///
/// Deliberately narrower than the YouTube section of
/// `SNI_REWRITE_SUFFIXES`:
/// - `youtube.com` / `youtu.be` / `youtube-nocookie.com`: HTML pages
///   and player frames. These trigger Restricted Mode if served via
///   the SNI rewrite, so when the flag is on we relay them.
/// - `youtubei.googleapis.com`: the YouTube data API the player
///   queries for video metadata + manifest. Restricted Mode also
///   gates video availability here. Without this entry, the JSON
///   RPC layer would still hit the SNI-rewrite tunnel via the
///   broader `googleapis.com` suffix — the user-visible symptom of
///   that miss is "youtube_via_relay flips on but Restricted Mode
///   stays sticky on some videos."
///
/// **NOT** in this list (intentional, was a regression in #275):
/// - `ytimg.com`: thumbnails. No Restricted Mode logic on a static
///   image CDN; routing through Apps Script makes thumbnails slow
///   for zero gain.
/// - `googlevideo.com`: video chunk CDN. Routing through Apps Script
///   means every chunk eats Apps Script quota *and* risks the 6-min
///   execution cap aborting long videos mid-playback.
/// - `ggpht.com`: channel/profile images, same reasoning as ytimg.
const YOUTUBE_RELAY_HOSTS: &[&str] = &[
    "youtube.com",
    "youtu.be",
    "youtube-nocookie.com",
    "youtubei.googleapis.com",
];
/// True if `host` should be routed via the SNI-rewrite tunnel
/// (connect to `google_ip`, SNI = `front_domain`) rather than the
/// Apps Script relay. Case-insensitive; a trailing root dot on
/// `host` is ignored.
///
/// When `youtube_via_relay` is set, hosts in `YOUTUBE_RELAY_HOSTS`
/// are excluded even if a broader `SNI_REWRITE_SUFFIXES` entry
/// (e.g. `googleapis.com`) would match them.
fn matches_sni_rewrite(host: &str, youtube_via_relay: bool) -> bool {
    let h = host.to_ascii_lowercase();
    let h = h.trim_end_matches('.');
    // Allocation-free "exact match or dot-separated suffix" test. The
    // previous code built `format!(".{}", s)` for every candidate suffix
    // on every call; this function sits on the per-CONNECT hot path, so
    // avoid the per-comparison String allocation.
    fn is_host_or_subdomain_of(h: &str, s: &str) -> bool {
        h == s || h.strip_suffix(s).map_or(false, |rest| rest.ends_with('.'))
    }
    // YouTube relay carve-out runs FIRST so it wins over the broad
    // `googleapis.com` suffix that would otherwise pull
    // `youtubei.googleapis.com` into the SNI-rewrite path. The earlier
    // implementation iterated SNI_REWRITE_SUFFIXES with a filter, which
    // works for sibling entries (e.g. `youtube.com` in both lists) but
    // not for nested ones (`youtubei.googleapis.com` matches the broad
    // `googleapis.com` even when its specific entry is filtered out).
    // The short-circuit here is unconditional — we don't need to check
    // SNI rewrite once we've decided this host goes to the relay.
    if youtube_via_relay
        && YOUTUBE_RELAY_HOSTS
            .iter()
            .any(|s| is_host_or_subdomain_of(h, s))
    {
        return false;
    }
    SNI_REWRITE_SUFFIXES
        .iter()
        .any(|s| is_host_or_subdomain_of(h, s))
}
/// Look up `host` in the user-supplied hosts-override map, returning the
/// mapped value for the most specific match. Checks the exact hostname
/// first, then each parent domain in order ("a.b.c" → "b.c" → "c").
/// Case-insensitive; a trailing root dot on `host` is ignored.
fn hosts_override<'a>(
    hosts: &'a std::collections::HashMap<String, String>,
    host: &str,
) -> Option<&'a str> {
    let h = host.to_ascii_lowercase();
    let h = h.trim_end_matches('.');
    if let Some(ip) = hosts.get(h) {
        return Some(ip.as_str());
    }
    // Walk parent domains by slicing past each '.' instead of collecting
    // labels into a Vec and re-`join`ing a fresh String per ancestor — the
    // same sequence of lookups with zero allocations.
    let mut rest = h;
    while let Some(dot) = rest.find('.') {
        rest = &rest[dot + 1..];
        if let Some(ip) = hosts.get(rest) {
            return Some(ip.as_str());
        }
    }
    None
}
/// Errors surfaced by [`ProxyServer`] construction and its listener loops.
#[derive(Debug, thiserror::Error)]
pub enum ProxyError {
    /// Underlying socket / listener I/O failure (also used to wrap config
    /// validation errors in `ProxyServer::new`).
    #[error("io: {0}")]
    Io(#[from] std::io::Error),
}
/// The local proxy: one HTTP(S) listener and one SOCKS5 listener that
/// both feed into the same relay / SNI-rewrite dispatch machinery.
pub struct ProxyServer {
    /// Bind address for both listeners (from `Config::listen_host`).
    host: String,
    /// HTTP proxy listen port.
    port: u16,
    /// SOCKS5 listen port (defaults to `port + 1` when unset in config).
    socks5_port: u16,
    /// `None` in `google_only` (bootstrap) mode: no Apps Script relay is
    /// wired up, only the SNI-rewrite tunnel path is live.
    fronter: Option<Arc<DomainFronter>>,
    /// Shared MITM certificate manager for on-the-fly TLS interception.
    mitm: Arc<Mutex<MitmCertManager>>,
    /// Immutable routing/rewrite configuration shared with every client task.
    rewrite_ctx: Arc<RewriteCtx>,
    /// UDP/TCP tunnel multiplexer; `None` until `run()` starts it
    /// (and only in `Mode::Full`).
    tunnel_mux: Option<Arc<TunnelMux>>,
}
/// Immutable per-run routing context, shared (via `Arc`) with every
/// client-handling task. Built once in `ProxyServer::new` from config.
pub struct RewriteCtx {
    /// Google edge IP used as the TCP destination for the SNI-rewrite path.
    pub google_ip: String,
    /// Domain presented as SNI when connecting to `google_ip`.
    pub front_domain: String,
    /// User hosts overrides: hostname (or parent-domain) → address,
    /// consulted via `hosts_override`.
    pub hosts: std::collections::HashMap<String, String>,
    /// Outbound TLS client (WebPKI-verified, or accept-anything when
    /// `verify_ssl` is off).
    pub tls_connector: TlsConnector,
    /// Optional upstream SOCKS5 address for passthrough connections.
    pub upstream_socks5: Option<String>,
    /// Operating mode (apps_script / full / google_only).
    pub mode: Mode,
    /// If true, YouTube traffic bypasses the SNI-rewrite tunnel and
    /// goes through the Apps Script relay instead. See config.rs for
    /// the trade-off. Issue #102.
    pub youtube_via_relay: bool,
    /// User-configured hostnames that should skip the relay entirely
    /// and pass through as plain TCP (optionally via upstream_socks5).
    /// See config.rs `passthrough_hosts` for matching rules. Issues #39, #127.
    pub passthrough_hosts: Vec<String>,
    /// If true, drop SOCKS5 UDP datagrams destined for port 443 so
    /// callers fall back to TCP/HTTPS. See config.rs `block_quic` for
    /// the trade-off. Issue #213.
    pub block_quic: bool,
}
/// True if `host` matches any entry in the user's passthrough list.
/// Match is case-insensitive. Entries match either exactly, or as a
/// suffix if they start with "." (e.g. ".internal.example" matches
/// "a.b.internal.example" and the bare "internal.example"). Bare
/// entries like "example.com" only match the exact hostname — users
/// who want subdomains included should use ".example.com".
pub fn matches_passthrough(host: &str, list: &[String]) -> bool {
    if list.is_empty() {
        return false;
    }
    let h = host.to_ascii_lowercase();
    let h = h.trim_end_matches('.');
    list.iter().any(|entry| {
        // Entries are normalized lazily per call; empty entries (after
        // trimming whitespace and trailing dots) never match anything.
        let e = entry.trim().trim_end_matches('.').to_ascii_lowercase();
        if e.is_empty() {
            return false;
        }
        if let Some(suffix) = e.strip_prefix('.') {
            // ".suffix" entry: match the bare suffix, or any subdomain —
            // i.e. `h` ends with suffix and the preceding char is '.'.
            // `strip_suffix` replaces the old `format!(".{}", suffix)`,
            // dropping a String allocation per entry on this hot,
            // per-connection path.
            h == suffix || h.strip_suffix(suffix).map_or(false, |rest| rest.ends_with('.'))
        } else {
            h == e
        }
    })
}
impl ProxyServer {
    /// Build the proxy from config: resolve the operating mode, construct
    /// the Apps Script fronter (skipped in `google_only`), build the
    /// outbound TLS client config (honoring `verify_ssl`), and assemble
    /// the shared `RewriteCtx`. The tunnel mux is deliberately deferred
    /// to `run()` — starting it spawns tasks, which needs a live runtime.
    pub fn new(config: &Config, mitm: Arc<Mutex<MitmCertManager>>) -> Result<Self, ProxyError> {
        let mode = config
            .mode_kind()
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, format!("{e}")))?;
        // `google_only` mode skips the Apps Script relay entirely, so we must
        // not try to construct the DomainFronter — it errors on a missing
        // `script_id`, which is exactly the state a bootstrapping user is in.
        let fronter = match mode {
            Mode::AppsScript | Mode::Full => {
                let f = DomainFronter::new(config)
                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")))?;
                Some(Arc::new(f))
            }
            Mode::GoogleOnly => None,
        };
        // Outbound TLS: real WebPKI roots by default; `verify_ssl = false`
        // swaps in the accept-anything `NoVerify` verifier.
        let tls_config = if config.verify_ssl {
            let mut roots = tokio_rustls::rustls::RootCertStore::empty();
            roots.extend(webpki_roots::TLS_SERVER_ROOTS.iter().cloned());
            ClientConfig::builder()
                .with_root_certificates(roots)
                .with_no_client_auth()
        } else {
            ClientConfig::builder()
                .dangerous()
                .with_custom_certificate_verifier(Arc::new(NoVerify))
                .with_no_client_auth()
        };
        let tls_connector = TlsConnector::from(Arc::new(tls_config));
        let rewrite_ctx = Arc::new(RewriteCtx {
            google_ip: config.google_ip.clone(),
            front_domain: config.front_domain.clone(),
            hosts: config.hosts.clone(),
            tls_connector,
            upstream_socks5: config.upstream_socks5.clone(),
            mode,
            youtube_via_relay: config.youtube_via_relay,
            passthrough_hosts: config.passthrough_hosts.clone(),
            block_quic: config.block_quic,
        });
        // SOCKS5 defaults to the port right after the HTTP one.
        let socks5_port = config.socks5_port.unwrap_or(config.listen_port + 1);
        Ok(Self {
            host: config.listen_host.clone(),
            port: config.listen_port,
            socks5_port,
            fronter,
            mitm,
            rewrite_ctx,
            tunnel_mux: None, // initialized in run() inside the tokio runtime
        })
    }
    /// Clone of the optional Apps Script fronter handle (`None` in
    /// `google_only` mode).
    pub fn fronter(&self) -> Option<Arc<DomainFronter>> {
        self.fronter.clone()
    }
    /// Bind both listeners and serve until `shutdown_rx` fires or a
    /// listener task exits. Spawns: pool pre-warm (one-shot), Apps Script
    /// keepalive, stats ticker, and one accept loop per listener; on
    /// shutdown all four are aborted.
    pub async fn run(
        mut self,
        mut shutdown_rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), ProxyError> {
        // Initialize TunnelMux inside the runtime (tokio::spawn requires it).
        if self.rewrite_ctx.mode == Mode::Full {
            if let Some(f) = self.fronter.as_ref() {
                self.tunnel_mux = Some(TunnelMux::start(f.clone()));
            }
        }
        let http_addr = format!("{}:{}", self.host, self.port);
        let socks_addr = format!("{}:{}", self.host, self.socks5_port);
        let http_listener = TcpListener::bind(&http_addr).await?;
        let socks_listener = TcpListener::bind(&socks_addr).await?;
        tracing::warn!(
            "Listening HTTP on {} — set your browser HTTP proxy to this address.",
            http_addr
        );
        tracing::warn!(
            "Listening SOCKS5 on {} — xray / Telegram / app-level SOCKS5 clients use this.",
            socks_addr
        );
        // Pre-warm the outbound connection pool so the user's first request
        // doesn't pay a fresh TLS handshake to Google edge. Best-effort;
        // failures are logged and ignored. Skipped in `google_only` — there
        // is no fronter to warm.
        //
        // Sized to roughly match a browser's parallel-connection burst at
        // startup. The previous fixed `3` was fine for a single deployment
        // but left requests 4-10 of the opening burst paying a cold TLS
        // handshake each (~300ms). Scaling with deployment count gives
        // multi-account configs a proportionally warmer pool, capped so
        // single-deployment users don't hammer Google edge unnecessarily.
        if let Some(warm_fronter) = self.fronter.clone() {
            let n = warm_fronter.num_scripts().clamp(6, 16);
            tokio::spawn(async move {
                warm_fronter.warm(n).await;
            });
        }
        // Apps Script container keepalive. `warm()` above keeps the TCP
        // pool warm at startup, but the V8 container behind UrlFetchApp
        // goes cold after ~5min idle and costs 1-3s to wake. A periodic
        // HEAD ping prevents the cold-start lag on the first request
        // after a quiet pause (most visible as YouTube player stalls).
        // Skipped in google_only mode for the same reason as warm —
        // there's no fronter to ping.
        //
        // The handle is captured (not fire-and-forget) so the shutdown
        // arm of the select! below can abort it. Without that, hitting
        // Stop in the UI would leave the keepalive holding an
        // Arc<DomainFronter> on stale config and pinging Apps Script
        // every 240s — same class of bug that issue #99 hit for the
        // accept loops.
        let keepalive_task = if let Some(keepalive_fronter) = self.fronter.clone() {
            tokio::spawn(async move {
                keepalive_fronter.run_h1_keepalive().await;
            })
        } else {
            // Placeholder task so the abort() call below is unconditional.
            tokio::spawn(async move { std::future::pending::<()>().await })
        };
        // Once a minute, log relay stats — but only when there was traffic,
        // so an idle proxy stays quiet in the logs.
        let stats_task = if let Some(stats_fronter) = self.fronter.clone() {
            tokio::spawn(async move {
                let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60));
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
                // First tick completes immediately; consume it so the first
                // logged snapshot is a full minute in.
                ticker.tick().await;
                loop {
                    ticker.tick().await;
                    let s = stats_fronter.snapshot_stats();
                    if s.relay_calls > 0 || s.cache_hits > 0 {
                        tracing::info!("{}", s.fmt_line());
                    }
                }
            })
        } else {
            tokio::spawn(async move { std::future::pending::<()>().await })
        };
        let http_fronter = self.fronter.clone();
        let http_mitm = self.mitm.clone();
        let http_ctx = self.rewrite_ctx.clone();
        let http_mux = self.tunnel_mux.clone();
        let mut http_task = tokio::spawn(async move {
            let mut fd_exhaust_count: u64 = 0;
            // Track every per-client child task in a JoinSet so that when
            // this accept task is aborted on shutdown, dropping the JoinSet
            // aborts the children too. Previously children were bare
            // `tokio::spawn(...)` handles with no ownership — aborting the
            // parent accept loop stopped taking new connections but left
            // in-flight ones running with the OLD config. That manifested
            // as "hitting Stop in the UI doesn't actually stop anything
            // already running" (issue #99) and as "changing auth_key and
            // Start doesn't take effect for domains with a live
            // keep-alive" because the old DomainFronter stayed alive
            // inside those child tasks.
            let mut children: tokio::task::JoinSet<()> = tokio::task::JoinSet::new();
            loop {
                // Opportunistic reap so completed children don't pile up
                // memory on long-running proxies.
                while children.try_join_next().is_some() {}
                let (sock, peer) = match http_listener.accept().await {
                    Ok(x) => {
                        fd_exhaust_count = 0;
                        x
                    }
                    Err(e) => {
                        accept_backoff("http", &e, &mut fd_exhaust_count).await;
                        continue;
                    }
                };
                let _ = sock.set_nodelay(true);
                let fronter = http_fronter.clone();
                let mitm = http_mitm.clone();
                let rewrite_ctx = http_ctx.clone();
                let mux = http_mux.clone();
                children.spawn(async move {
                    if let Err(e) = handle_http_client(sock, fronter, mitm, rewrite_ctx, mux).await
                    {
                        tracing::debug!("http client {} closed: {}", peer, e);
                    }
                });
            }
        });
        let socks_fronter = self.fronter.clone();
        let socks_mitm = self.mitm.clone();
        let socks_ctx = self.rewrite_ctx.clone();
        let socks_mux = self.tunnel_mux.clone();
        let mut socks_task = tokio::spawn(async move {
            let mut fd_exhaust_count: u64 = 0;
            // Same pattern as http_task above — JoinSet so shutdown
            // drops in-flight SOCKS5 clients instead of leaving them to
            // keep running on the stale config.
            let mut children: tokio::task::JoinSet<()> = tokio::task::JoinSet::new();
            loop {
                while children.try_join_next().is_some() {}
                let (sock, peer) = match socks_listener.accept().await {
                    Ok(x) => {
                        fd_exhaust_count = 0;
                        x
                    }
                    Err(e) => {
                        accept_backoff("socks", &e, &mut fd_exhaust_count).await;
                        continue;
                    }
                };
                let _ = sock.set_nodelay(true);
                let fronter = socks_fronter.clone();
                let mitm = socks_mitm.clone();
                let rewrite_ctx = socks_ctx.clone();
                let mux = socks_mux.clone();
                children.spawn(async move {
                    if let Err(e) =
                        handle_socks5_client(sock, fronter, mitm, rewrite_ctx, mux).await
                    {
                        tracing::debug!("socks client {} closed: {}", peer, e);
                    }
                });
            }
        });
        // `biased` so the shutdown arm is always polled first — a busy
        // listener can't starve the stop signal.
        tokio::select! {
            biased;
            _ = &mut shutdown_rx => {
                tracing::info!("Shutdown signal received, stopping listeners");
                stats_task.abort();
                keepalive_task.abort();
                http_task.abort();
                socks_task.abort();
            }
            _ = &mut http_task => {}
            _ = &mut socks_task => {}
        }
        Ok(())
    }
}
/// Back-off helper for the accept() loop.
///
/// Motivated by issue #18: when the process hits its file-descriptor limit
/// (EMFILE — `No file descriptors available`), `accept()` returns that
/// error synchronously and is immediately ready to fire again. The old
/// loop just `continue`'d, producing a wall of identical ERROR lines
/// thousands per second and starving the tokio runtime of CPU that
/// existing connections would have used to drain and close.
///
/// Behavior:
/// 1. On `EMFILE` / `ENFILE`, sleeps proportionally to how long the
///    problem has persisted (linear ramp: 50ms on the first hit, capped
///    at 2s from the 40th hit onward), giving existing connections a
///    chance to finish and free fds.
/// 2. Rate-limits the log line: the first occurrence logs a full warning
///    with fix instructions; after that, one line per 100 errors so the
///    log doesn't fill up.
///
/// Any other accept error (e.g. ECONNABORTED from a client that vanished
/// mid-handshake) gets a one-line error log and a tiny 5ms sleep to avoid
/// a tight loop if it repeats.
async fn accept_backoff(kind: &str, err: &std::io::Error, count: &mut u64) {
    // POSIX errno values: 24 = EMFILE (per-process fd limit),
    // 23 = ENFILE (system-wide fd table full).
    const EMFILE: i32 = 24;
    const ENFILE: i32 = 23;
    *count = count.saturating_add(1);
    let fd_limited = matches!(err.raw_os_error(), Some(EMFILE | ENFILE));
    if !fd_limited {
        tracing::error!("accept ({}): {}", kind, err);
        tokio::time::sleep(std::time::Duration::from_millis(5)).await;
        return;
    }
    match *count {
        1 => tracing::warn!(
            "accept ({}) hit RLIMIT_NOFILE: {}. Backing off. Raise the fd limit: \
            `ulimit -n 65536` before starting, or (OpenWRT) use the shipped procd \
            init which sets nofile=16384. The listener will keep retrying.",
            kind,
            err
        ),
        n if n % 100 == 0 => tracing::warn!(
            "accept ({}) still fd-limited after {} retries. Current connections \
            need to finish before we can accept new ones.",
            kind,
            n
        ),
        _ => {}
    }
    // Linear ramp capped at 2s: 50ms on the 1st hit, 500ms on the 10th,
    // 2s from the 40th onward.
    let backoff_ms = ((*count).min(40) * 50u64).min(2000);
    tokio::time::sleep(std::time::Duration::from_millis(backoff_ms)).await;
}
/// Serve one connection on the HTTP proxy listener.
///
/// Reads the request head, then branches:
/// - `CONNECT host:port`: consult the tunnel-mux negative cache (known-dead
///   targets get an immediate 502 instead of burning tunnel quota), then
///   acknowledge with 200 and hand the raw socket to `dispatch_tunnel`.
/// - any other method (absolute-form `GET http://…`): relay through the
///   Apps Script fronter, or — in `google_only` bootstrap mode, where no
///   fronter exists — pass through as direct TCP.
///
/// Oversized request heads are answered with 431; a cleanly-closed socket
/// before a full head returns `Ok(())` silently.
async fn handle_http_client(
    mut sock: TcpStream,
    fronter: Option<Arc<DomainFronter>>,
    mitm: Arc<Mutex<MitmCertManager>>,
    rewrite_ctx: Arc<RewriteCtx>,
    tunnel_mux: Option<Arc<TunnelMux>>,
) -> std::io::Result<()> {
    let (head, leftover) = match read_http_head(&mut sock).await? {
        HeadReadResult::Got { head, leftover } => (head, leftover),
        HeadReadResult::Closed => return Ok(()),
        HeadReadResult::Oversized => {
            // Reply with 431 instead of just dropping the socket so the
            // browser shows a real error rather than retrying the same
            // oversized request in a loop.
            tracing::warn!(
                "request head exceeds {} bytes — refusing with 431",
                MAX_HEADER_BYTES
            );
            let _ = sock
                .write_all(
                    b"HTTP/1.1 431 Request Header Fields Too Large\r\n\
                    Connection: close\r\n\
                    Content-Length: 0\r\n\r\n",
                )
                .await;
            let _ = sock.flush().await;
            return Ok(());
        }
    };
    let (method, target, _version, _headers) = parse_request_head(&head)
        .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidData, "bad request"))?;
    if method.eq_ignore_ascii_case("CONNECT") {
        let (host, port) = parse_host_port(&target);
        // Mirror the SOCKS5 short-circuit: if the tunnel-node just failed
        // this (host, port) with unreachable, return 502 immediately rather
        // than acknowledging the CONNECT and blowing tunnel quota on a
        // guaranteed retry. See `TunnelMux::is_unreachable` for context.
        if let Some(ref mux) = tunnel_mux {
            if mux.is_unreachable(&host, port) {
                tracing::info!("CONNECT {}:{} (negative-cached, refusing)", host, port);
                let _ = sock
                    .write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\nConnection: close\r\n\r\n")
                    .await;
                let _ = sock.flush().await;
                return Ok(());
            }
        }
        // Acknowledge the CONNECT before dispatching: from here on the
        // socket carries the client's opaque (usually TLS) byte stream.
        sock.write_all(b"HTTP/1.1 200 Connection Established\r\n\r\n")
            .await?;
        sock.flush().await?;
        dispatch_tunnel(sock, host, port, fronter, mitm, rewrite_ctx, tunnel_mux).await
    } else {
        // Plain HTTP proxy request (e.g. `GET http://…`).
        //
        // apps_script mode: relay through the Apps Script fronter (which
        // is the whole point of the relay).
        //
        // google_only bootstrap mode: no fronter exists, so passthrough as
        // direct TCP. Same contract as `dispatch_tunnel` honors for CONNECT
        // in google_only — anything not on the Google edge is forwarded
        // direct (or via `upstream_socks5`) so the user's browser still
        // works while they finish setting up Apps Script. Issue: typing a
        // bare `http://example.com` URL used to return a 502 here even
        // though `https://example.com` (CONNECT) worked fine.
        match fronter {
            Some(f) => do_plain_http(sock, &head, &leftover, f).await,
            None => do_plain_http_passthrough(sock, &head, &leftover, &rewrite_ctx).await,
        }
    }
}
// ---------- SOCKS5 ----------
/// Serve one connection on the SOCKS5 listener (RFC 1928).
///
/// Performs the method-negotiation handshake (no-auth only), parses the
/// request (CONNECT or UDP ASSOCIATE; BIND is rejected), decodes the
/// target address (IPv4 / domain / IPv6), then either hands UDP off to
/// `handle_socks5_udp_associate` or replies success and dispatches the
/// TCP stream via `dispatch_tunnel`. Malformed or unsupported requests
/// return `Ok(())` after the appropriate SOCKS5 error reply — they are
/// protocol-level refusals, not I/O errors.
async fn handle_socks5_client(
    mut sock: TcpStream,
    fronter: Option<Arc<DomainFronter>>,
    mitm: Arc<Mutex<MitmCertManager>>,
    rewrite_ctx: Arc<RewriteCtx>,
    tunnel_mux: Option<Arc<TunnelMux>>,
) -> std::io::Result<()> {
    // RFC 1928 handshake: VER=5, NMETHODS, METHODS...
    let mut hdr = [0u8; 2];
    sock.read_exact(&mut hdr).await?;
    if hdr[0] != 0x05 {
        // Not SOCKS5 — drop silently.
        return Ok(());
    }
    let nmethods = hdr[1] as usize;
    let mut methods = vec![0u8; nmethods];
    sock.read_exact(&mut methods).await?;
    // Only "no auth" (0x00) is supported.
    if !methods.contains(&0x00) {
        // 0xff = "no acceptable methods".
        sock.write_all(&[0x05, 0xff]).await?;
        return Ok(());
    }
    sock.write_all(&[0x05, 0x00]).await?;
    // Request: VER=5, CMD, RSV=0, ATYP, DST.ADDR, DST.PORT
    let mut req = [0u8; 4];
    sock.read_exact(&mut req).await?;
    if req[0] != 0x05 {
        return Ok(());
    }
    let cmd = req[1];
    if cmd != 0x01 && cmd != 0x03 {
        // CONNECT and UDP ASSOCIATE only. 0x07 = "command not supported".
        sock.write_all(&[0x05, 0x07, 0x00, 0x01, 0, 0, 0, 0, 0, 0])
            .await?;
        return Ok(());
    }
    // Decode DST.ADDR by address type: IPv4 (0x01), domain name (0x03),
    // IPv6 (0x04).
    let atyp = req[3];
    let host: String = match atyp {
        0x01 => {
            let mut ip = [0u8; 4];
            sock.read_exact(&mut ip).await?;
            format!("{}.{}.{}.{}", ip[0], ip[1], ip[2], ip[3])
        }
        0x03 => {
            // Domain: 1-byte length prefix, then the name.
            let mut len = [0u8; 1];
            sock.read_exact(&mut len).await?;
            let mut name = vec![0u8; len[0] as usize];
            sock.read_exact(&mut name).await?;
            String::from_utf8_lossy(&name).into_owned()
        }
        0x04 => {
            let mut ip = [0u8; 16];
            sock.read_exact(&mut ip).await?;
            let addr = std::net::Ipv6Addr::from(ip);
            addr.to_string()
        }
        _ => {
            // 0x08 = "address type not supported".
            sock.write_all(&[0x05, 0x08, 0x00, 0x01, 0, 0, 0, 0, 0, 0])
                .await?;
            return Ok(());
        }
    };
    let mut port_buf = [0u8; 2];
    sock.read_exact(&mut port_buf).await?;
    let port = u16::from_be_bytes(port_buf);
    if cmd == 0x03 {
        tracing::info!("SOCKS5 UDP ASSOCIATE requested for {}:{}", host, port);
        return handle_socks5_udp_associate(sock, rewrite_ctx, tunnel_mux).await;
    }
    // Negative-cache short-circuit: if the tunnel-node just failed to reach
    // this exact (host, port) with `Network is unreachable` / `No route to
    // host`, reply 0x04 (Host unreachable) immediately. Saves a 1.5–2s tunnel
    // round-trip on guaranteed-failing targets — the IPv6 probe retry loop
    // is the main offender on devices without IPv6.
    if let Some(ref mux) = tunnel_mux {
        if mux.is_unreachable(&host, port) {
            tracing::info!("SOCKS5 CONNECT -> {}:{} (negative-cached, refusing)", host, port);
            sock.write_all(&[0x05, 0x04, 0x00, 0x01, 0, 0, 0, 0, 0, 0])
                .await?;
            sock.flush().await?;
            return Ok(());
        }
    }
    tracing::info!("SOCKS5 CONNECT -> {}:{}", host, port);
    // Success reply with zeroed BND.
    sock.write_all(&[0x05, 0x00, 0x00, 0x01, 0, 0, 0, 0, 0, 0])
        .await?;
    sock.flush().await?;
    dispatch_tunnel(sock, host, port, fronter, mitm, rewrite_ctx, tunnel_mux).await
}
/// Key identifying one UDP relay destination within an ASSOCIATE.
/// Used as the `HashMap` key for per-target sessions, so it derives
/// `Eq + Hash`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
struct SocksUdpTarget {
    /// Destination host as decoded from the SOCKS5 UDP header.
    host: String,
    /// Destination port.
    port: u16,
    /// SOCKS5 address type byte (0x01 IPv4 / 0x03 domain / 0x04 IPv6),
    /// kept so downlink frames can echo the client's original encoding.
    atyp: u8,
    /// Raw address bytes as they appeared on the wire.
    addr: Vec<u8>,
}
/// Per-target relay session state shared between the dispatch loop and
/// the per-session task. The dispatch loop pushes uplink datagrams via
/// `uplink`; the task drains the upstream and serializes both directions
/// onto a single tunnel-mux call at a time. `sid` is held here so the
/// dispatch teardown path can issue close_session for any task it has
/// to abort mid-await.
struct UdpRelaySession {
    /// Tunnel-node session id, needed for the teardown close_session sweep.
    sid: String,
    /// Bounded queue of uplink datagrams feeding the per-session task.
    uplink: mpsc::Sender<Vec<u8>>,
}
/// All per-ASSOCIATE UDP relay state behind a single mutex so insertion
/// order, the live-session map, and per-task self-removal can all stay
/// consistent. Wrapping each separately invited a slow leak: the
/// previous design's `insertion_order` deque was only pruned on
/// overflow eviction, so a long-lived ASSOCIATE that opened many
/// short-lived sessions accumulated dead `SocksUdpTarget` entries.
struct UdpRelayState {
    /// Live sessions keyed by target.
    sessions: HashMap<SocksUdpTarget, UdpRelaySession>,
    /// Insertion-order log for FIFO eviction. NOT a real LRU — repeated
    /// uplinks to a hot session do not move it to the back. We keep it
    /// in lockstep with `sessions` (insert appends; remove scans and
    /// erases the matching entry — O(N) but N ≤ 256).
    order: VecDeque<SocksUdpTarget>,
}
impl UdpRelayState {
    /// Empty state: no live sessions, empty eviction log.
    fn new() -> Self {
        Self {
            sessions: HashMap::new(),
            order: VecDeque::new(),
        }
    }
    /// Clone of the uplink sender for `target`, if a session is live.
    fn get_uplink(&self, target: &SocksUdpTarget) -> Option<mpsc::Sender<Vec<u8>>> {
        self.sessions.get(target).map(|s| s.uplink.clone())
    }
    /// Register a session and append it to the FIFO eviction log.
    ///
    /// If `target` already had a live session (a re-insert racing the old
    /// task's self-removal), the replaced session's stale `order` entry is
    /// pruned before appending. The previous version unconditionally
    /// appended, so a replaced target held TWO `order` entries; the stale
    /// front one would later pop in `evict_until_under` and evict the
    /// *new* session long before its FIFO turn — exactly the
    /// order/sessions desync this struct exists to prevent.
    fn insert(&mut self, target: SocksUdpTarget, session: UdpRelaySession) {
        if self.sessions.insert(target.clone(), session).is_some() {
            if let Some(pos) = self.order.iter().position(|t| t == &target) {
                self.order.remove(pos);
            }
        }
        self.order.push_back(target);
    }
    /// Drop a session and its eviction-log entry (O(N) scan, N ≤ 256).
    fn remove(&mut self, target: &SocksUdpTarget) {
        if let Some(pos) = self.order.iter().position(|t| t == target) {
            self.order.remove(pos);
        }
        self.sessions.remove(target);
    }
    /// Pop the oldest session entries until `sessions.len() < cap`.
    /// Stale `order` entries (already removed by self-cleanup on a
    /// task's natural exit) are quietly skipped.
    fn evict_until_under(&mut self, cap: usize) -> Vec<SocksUdpTarget> {
        let mut evicted = Vec::new();
        while self.sessions.len() >= cap {
            let Some(victim) = self.order.pop_front() else {
                break;
            };
            if self.sessions.remove(&victim).is_some() {
                evicted.push(victim);
            }
        }
        evicted
    }
    /// Snapshot live sids for the teardown close_session sweep. We
    /// take a copy (not a drain) so the caller can decide whether to
    /// also clear the map.
    fn live_sids(&self) -> Vec<String> {
        self.sessions.values().map(|s| s.sid.clone()).collect()
    }
    /// Drop everything — map and eviction log together.
    fn clear(&mut self) {
        self.sessions.clear();
        self.order.clear();
    }
}
/// SOCKS5 UDP request frame: 4-byte header + atyp-specific address + 2-byte
/// port + payload. DOMAIN atyp uses a 1-byte length prefix + up to 255
/// bytes, so the largest header is `4 + 1 + 255 + 2 = 262`. Round to 300
/// for safety; payload itself can be a full 64 KB datagram (65535 = max
/// UDP payload length).
const SOCKS5_UDP_RECV_BUF_BYTES: usize = 65535 + 300;
/// Bound on per-session uplink queue depth. UDP is lossy by design — if
/// the per-session task can't keep up, drop the newest datagram (caller
/// uses `try_send`) instead of stalling the whole UDP relay loop.
const UDP_UPLINK_QUEUE: usize = 64;
/// Initial poll spacing when a session is idle. Tunnel-node already
/// long-polls each empty `udp_data` for up to 5 s, so this is a
/// client-side floor — bursts of upstream packets reset back to this.
const UDP_INITIAL_POLL_DELAY: Duration = Duration::from_millis(500);
/// Cap on the exponential backoff for an idle session. After this many
/// seconds of zero traffic in either direction, polls happen at most
/// once per `UDP_MAX_POLL_DELAY` plus the tunnel-node long-poll window —
/// so an idle UDP destination costs roughly one batch slot every 35 s
/// (30 s cap + 5 s long-poll).
const UDP_MAX_POLL_DELAY: Duration = Duration::from_secs(30);
/// Cap on simultaneous UDP relay sessions per SOCKS5 ASSOCIATE. STUN
/// candidate gathering and DNS fanout produce dozens of distinct
/// targets; an abusive or runaway client could produce thousands.
/// 256 is generous for legitimate use and bounds tunnel-node UDP
/// sessions a single ASSOCIATE can hold open.
///
/// Eviction policy is FIFO by insertion time, not true LRU — repeated
/// uplinks to a hot session do not move it to the back. Real LRU
/// would need a touch on every uplink (extra lock acquisition per
/// datagram); the long-tail of dead targets gets cleaned up here just
/// fine without that cost, and live targets are typically also recently
/// inserted.
const MAX_UDP_SESSIONS_PER_ASSOCIATE: usize = 256;
/// Drop UDP datagrams larger than this (pre-base64). Standard MTU is
/// 1500B, jumbo frames are ~9000B; anything above that is either a
/// pathologically fragmented IP datagram or abusive traffic. Each
/// datagram carries ~33% base64 + JSON envelope overhead and consumes
/// Apps Script per-account quota, so a permissive ceiling here matters.
const MAX_UDP_PAYLOAD_BYTES: usize = 9 * 1024;
async fn handle_socks5_udp_associate(
mut control: TcpStream,
rewrite_ctx: Arc<RewriteCtx>,
tunnel_mux: Option<Arc<TunnelMux>>,
) -> std::io::Result<()> {
if rewrite_ctx.mode != Mode::Full {
tracing::debug!("UDP ASSOCIATE rejected: only full mode supports UDP tunneling");
write_socks5_reply(&mut control, 0x07, None).await?;
return Ok(());
}
let Some(mux) = tunnel_mux else {
tracing::debug!("UDP ASSOCIATE rejected: full mode has no tunnel mux");
write_socks5_reply(&mut control, 0x01, None).await?;
return Ok(());
};
// Per RFC 1928 §6 the UDP relay only accepts datagrams from the
// SOCKS5 client. We pin the source IP to the control TCP peer up
// front so a third party on the bind interface can't hijack the
// session by sending the first datagram. THIS — not the bind IP
// below — is what actually keeps unauthenticated traffic out.
let client_peer_ip = control.peer_addr()?.ip();
// Bind the UDP relay to the same local IP the SOCKS5 client used
// to reach the control TCP socket. `TcpStream::local_addr()` on an
// accepted socket returns the concrete terminating address (e.g.
// 127.0.0.1 for a loopback client, 192.168.1.5 for a LAN client),
// not the listener's bind specifier — so this naturally tracks the
// path the client took. Source-IP filtering above is the security
// boundary; the bind choice is just about reachability.
let bind_ip = control.local_addr()?.ip();
let udp = Arc::new(UdpSocket::bind(SocketAddr::new(bind_ip, 0)).await?);
write_socks5_reply(&mut control, 0x00, Some(udp.local_addr()?)).await?;
tracing::info!(
"SOCKS5 UDP relay bound on {} for client {}",
udp.local_addr()?,
client_peer_ip
);
let mut buf = vec![0u8; SOCKS5_UDP_RECV_BUF_BYTES];
let mut control_buf = [0u8; 1];
let mut client_addr: Option<SocketAddr> = None;
let state: Arc<Mutex<UdpRelayState>> = Arc::new(Mutex::new(UdpRelayState::new()));
// Tracking per-target tasks here — instead of bare `tokio::spawn`
// — lets the teardown path call `abort_all()`, cancelling any
// in-flight `mux.udp_data` await. Without it, a task mid-poll
// could keep paying tunnel-node round trips for up to 5 s after
// the SOCKS5 client went away.
let mut tasks: JoinSet<()> = JoinSet::new();
let mut oversized_dropped: u64 = 0;
let mut sessions_evicted: u64 = 0;
let mut foreign_ip_drops: u64 = 0;
loop {
tokio::select! {
recv = udp.recv_from(&mut buf) => {
let (n, peer) = match recv {
Ok(v) => v,
Err(e) => {
tracing::debug!("udp associate recv failed: {}", e);
break;
}
};
// Source-IP check: anything not from the SOCKS5 client's
// host is dropped silently.
if peer.ip() != client_peer_ip {
foreign_ip_drops += 1;
if foreign_ip_drops == 1 || foreign_ip_drops.is_multiple_of(100) {
tracing::debug!(
"udp dropped from unauthorized source {}: count={}",
peer.ip(),
foreign_ip_drops,
);
}
continue;
}
// Parse BEFORE port-locking. A malformed datagram from
// the right IP must not pin client_addr to its source
// port — otherwise a co-tenant on the bind interface
// can race one bad packet to DoS the legitimate client
// (whose real datagram, sent from a different ephemeral
// port, would then be silently rejected).
let Some((target, payload)) = parse_socks5_udp_packet(&buf[..n]) else {
continue;
};
// Issue #213: client-side QUIC block. UDP/443 is
// HTTP/3 — drop the datagram silently so the client
// stack retries a couple of times and then falls back
// to TCP/HTTPS, which goes through the regular CONNECT
// path. Skipping this at the SOCKS5 layer (rather than
// letting it hit the tunnel-node) avoids paying the
// 200–500 ms tunnel-node round-trip per dropped QUIC
// datagram, which would otherwise compound during the
// 1–3 retries before the browser falls back.
//
// Silent drop instead of an explicit error reply: the
// SOCKS5 UDP wire has no "destination unreachable"
// datagram — `0x04` only exists in TCP CONNECT replies
// (RFC 1928 §6). The browser's QUIC stack already has
// a "no response → fall back" timeout, so silent drop
// is the contractually correct shape.
if rewrite_ctx.block_quic && target.port == 443 {
tracing::debug!(
"udp dropped: block_quic=true, target {}:443",
target.host
);
continue;
}
// RFC 1928 §6: lock to the first VALID datagram's source
// port. Subsequent datagrams must come from the same
// (ip, port) pair.
if let Some(existing) = client_addr {
if existing != peer {
continue;
}
} else {
tracing::info!("UDP relay locked to client {}", peer);
client_addr = Some(peer);
}
// Size guard: drop oversize datagrams before they reach
// the mux. Each datagram costs ~payload * 1.33 in the
// batched JSON envelope plus tunnel-node CPU; uncapped,
// a runaway client can exhaust Apps Script quota.
if payload.len() > MAX_UDP_PAYLOAD_BYTES {
oversized_dropped += 1;
if oversized_dropped == 1 || oversized_dropped.is_multiple_of(100) {
tracing::debug!(
"udp datagram dropped: {} B > {} B (count={})",
payload.len(),
MAX_UDP_PAYLOAD_BYTES,
oversized_dropped,
);
}
continue;
}
let payload = payload.to_vec();
// Fast path: existing session — push payload onto its
// bounded uplink queue, drop on overflow (UDP semantics).
{
let st = state.lock().await;
if let Some(uplink) = st.get_uplink(&target) {
let _ = uplink.try_send(payload);
continue;
}
}
// Cap reached → evict oldest sessions before opening a
// new one. Each evicted entry drops its uplink Sender,
// which causes the per-session task to exit its select
// and tell tunnel-node to close. Any uplink already in
// that channel is delivered before the task exits.
{
let mut st = state.lock().await;
let evicted = st.evict_until_under(MAX_UDP_SESSIONS_PER_ASSOCIATE);
for victim in evicted {
sessions_evicted += 1;
if sessions_evicted == 1 || sessions_evicted.is_multiple_of(50) {
tracing::debug!(
"udp session cap {} reached; evicted {}:{} (total evicted={})",
MAX_UDP_SESSIONS_PER_ASSOCIATE,
victim.host,
victim.port,
sessions_evicted,
);
}
}
}
// New target: open via tunnel-node and spawn the per-session
// task. The first datagram rides the udp_open op so we