|
2 | 2 | "author": "Benjamin Hadad", |
3 | 3 | "github_id": "codemath3000", |
4 | 4 | "name": "PR #1797 base + n-gram tilt + AsymLogit + #2060 levers + NUM_PHASES=1 + GPTQ_CALIBRATION_BATCHES=32", |
5 | | - "blurb": "Single-knob refinement on top of PR #2130: GPTQ_CALIBRATION_BATCHES=32. Every other hyperparameter, env var, and code path is byte-for-byte the PR #2130 reproduction command. 3-seed mean val_bpb 1.05651 (sample std 3.58e-04), beats PR #2130's 1.05670 by 0.00019 BPB. Full validation target coverage preserved (val_tokens == target_tokens == 47851520).", |
| 5 | + "blurb": "11L 512d SP8192 CaseOps stack with LQER asymmetric rank-4 correction, SparseAttnGate, BOS-fixed SmearGate, AsymLogit Rescale, token-only n-gram tilt (per merged PR #1514), per-group compression, single-phase quantized phased score-first TTT (LoRA RANK=80, K-off), with GPTQ calibration batches doubled from 16 to 32. Full validation target coverage preserved: val_tokens == target_tokens == 47,851,520 in all included logs.", |
6 | 6 | "date": "2026-05-01", |
7 | 7 | "track": "10min_16mb", |
8 | 8 | "val_loss": 2.31203043, |
9 | 9 | "val_bpb": 1.05650768, |
10 | 10 | "val_loss_std": 0.00077803, |
11 | 11 | "val_bpb_std": 0.00035573, |
12 | | - "seeds": [ |
13 | | - 314, |
14 | | - 42, |
15 | | - 0 |
16 | | - ], |
| 12 | + "seeds": [314, 42, 0], |
17 | 13 | "seed_results": { |
18 | 14 | "314": { |
19 | 15 | "val_loss": 2.31227656, |
20 | 16 | "val_bpb": 1.05662016, |
21 | 17 | "artifact_bytes": 15945305, |
| 18 | + "steps": 4983, |
22 | 19 | "step_avg_ms": 120.32, |
23 | 20 | "train_wallclock_ms": 599593, |
24 | 21 | "eval_time_s": 567.1, |
|
31 | 28 | "val_loss": 2.31115900, |
32 | 29 | "val_bpb": 1.05610947, |
33 | 30 | "artifact_bytes": 15947490, |
| 31 | + "steps": 5001, |
34 | 32 | "step_avg_ms": 119.91, |
35 | 33 | "train_wallclock_ms": 599672, |
36 | 34 | "eval_time_s": 540.1, |
|
43 | 41 | "val_loss": 2.31265572, |
44 | 42 | "val_bpb": 1.05679341, |
45 | 43 | "artifact_bytes": 15942822, |
| 44 | + "steps": 4997, |
46 | 45 | "step_avg_ms": 119.99, |
47 | 46 | "train_wallclock_ms": 599608, |
48 | 47 | "eval_time_s": 567.1, |
|
52 | 51 | "quantized_val_bpb": 1.06939370 |
53 | 52 | } |
54 | 53 | }, |
55 | | - "comparison_baseline": "PR #2130", |
56 | | - "comparison_baseline_bpb": 1.0566953, |
57 | | - "delta_vs_baseline_bpb": -0.00018762, |
58 | | - "merged_leaderboard_baseline": "PR #1855", |
59 | | - "merged_leaderboard_baseline_bpb": 1.06107587, |
| 54 | + "comparison_baseline": "PR #1855 merged leaderboard record", |
| 55 | + "comparison_baseline_bpb": 1.06107587, |
| 56 | + "comparison_baseline_loss": 2.32202732, |
60 | 57 | "delta_vs_leaderboard_bpb": -0.00456819, |
| 58 | + "delta_vs_leaderboard_nats": -0.00999689, |
| 59 | + "artifact_bytes_mean": 15945206, |
61 | 60 | "artifact_bytes_max": 15947490, |
| 61 | + "bytes_total": 15947490, |
| 62 | + "train_steps_mean": 4993.67, |
| 63 | + "train_wallclock_ms_mean": 599624, |
62 | 64 | "train_wallclock_ms_max": 599672, |
63 | 65 | "step_avg_ms_mean": 120.07, |
64 | 66 | "eval_time_s_mean": 558.1, |
65 | 67 | "eval_time_s_max": 567.1, |
66 | 68 | "hardware": "8xH100 80GB SXM", |
67 | 69 | "pytorch_version": "2.11.0+cu130", |
68 | | - "lever_changes": [ |
69 | | - { |
70 | | - "name": "GPTQ_CALIBRATION_BATCHES", |
71 | | - "from": 16, |
72 | | - "to": 32 |
73 | | - } |
74 | | - ], |
| 70 | + "technique_summary": "PR #2130 stack (token-only n-gram tilt + AsymLogit Rescale + #2060 hp levers + NUM_PHASES=1 on PR #1797 / PR #1855 lineage) with a single hyperparameter change: GPTQ_CALIBRATION_BATCHES doubled from 16 to 32. All other code paths and env vars are byte-for-byte identical to the PR #2130 reproduction.", |
75 | 71 | "compliance": { |
76 | 72 | "artifact_under_16mb": true, |
77 | | - "train_under_600s": true, |
| 73 | + "train_under_600s_strict": true, |
78 | 74 | "eval_under_600s": true, |
79 | | - "n_seeds": 3, |
| 75 | + "three_seeds": true, |
80 | 76 | "full_validation_targets": true, |
| 77 | + "score_before_update": "Quantized phased TTT scores each validation chunk before the LoRA update; inherited unchanged from PR #2130. Within-word and word-start n-gram channels are explicitly disabled (logs confirm within_gate=0 word_gate=0 agree2plus=0).", |
| 78 | + "no_validation_training_leak": true, |
81 | 79 | "val_tokens_equal_target_tokens": true |
| 80 | + }, |
| 81 | + "logs": { |
| 82 | + "seed314": "train_seed314.log", |
| 83 | + "seed42": "train_seed42.log", |
| 84 | + "seed0": "train_seed0.log" |
82 | 85 | } |
83 | 86 | } |
0 commit comments