Commit 2895db3

leon2k2k2k and claude committed
spec 019: extend constant-α wiring to TTT forward path
018c (aabfbea) applied constant-α to forward_logits but not to forward_ttt / _block_with_lora. For a full-pipeline run we need TTT to also exercise the hardcoded α values (same lerp with a literal weight, same compile-specialization benefit).

Mirror the encoder/decoder pattern from forward_logits: the precomputed _encoder_alpha_info and _decoder_alpha_info lists store Python floats; forward_ttt reads them via Python indexing and calls torch.lerp(x_before, x, alpha) after each _block_with_lora at recur-alpha sites.

Closes the TTT-path gap from spec 015's original patch AND maintains the compile-time-constant α optimization validated at proxy scale in 018c (92% of the blend overhead recovered).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent aabfbea commit 2895db3
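
A minimal sketch (not part of this commit) of the compile-specialization point the message leans on: when the lerp weight is a Python float, torch.compile bakes it into the traced graph as a constant, whereas a tensor-valued alpha stays a runtime input. The names ALPHA, blend_constant, and blend_tensor below are illustrative, not from train_gpt.py.

    import torch

    ALPHA = 0.8  # hypothetical constant; the real values come from the 017 endpoint table

    @torch.compile
    def blend_constant(x_before, x):
        # Python-float weight: specialized as a literal in the compiled graph.
        return torch.lerp(x_before, x, ALPHA)

    @torch.compile
    def blend_tensor(x_before, x, alpha):
        # Tensor-valued weight: stays a runtime graph input, no constant folding.
        return torch.lerp(x_before, x, alpha)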

1 file changed

Lines changed: 25 additions & 3 deletions


records/track_10min_16mb/2026-04-19_SP8192_CaseOps_GatedAttn_QuantGate_Loop45_PhasedTTT/train_gpt.py

@@ -1310,10 +1310,28 @@ def forward_ttt(self, input_ids, target_ids, lora):
                 )
             )
         )
+        # Spec 019: apply constant-α blend in the TTT forward path too.
+        # alpha_info lists contain Python floats (set at __init__ from the 017
+        # endpoint table). The torch.lerp call sees the literal in the weight
+        # position just like forward_logits, so compile specialization applies.
+        enc_alpha_info = (
+            self._encoder_alpha_info
+            if (self.recur_alpha_enabled and self.looping_active)
+            else None
+        )
+        dec_alpha_info = (
+            self._decoder_alpha_info
+            if (self.recur_alpha_enabled and self.looping_active)
+            else None
+        )
         slot = 0
-        for i in enc_iter:
+        for step_idx, i in enumerate(enc_iter):
             q_w, k_w, v_w, out_w, up_w, down_w = self._bank_weights(i)
-            x = self._block_with_lora(self.blocks[i], x, x0, lora, slot, q_w, k_w, v_w, out_w, up_w, down_w)
+            x_before = x
+            x = self._block_with_lora(self.blocks[i], x_before, x0, lora, slot, q_w, k_w, v_w, out_w, up_w, down_w)
+            if enc_alpha_info is not None and enc_alpha_info[step_idx] is not None:
+                alpha = enc_alpha_info[step_idx]  # Python float constant
+                x = torch.lerp(x_before, x, alpha)
             slot += 1
             skips.append(x)
         psl = self.parallel_start_layer
@@ -1348,7 +1366,11 @@ def forward_ttt(self, input_ids, target_ids, lora):
                 x = torch.lerp(scaled_skip, x, g)
             else:
                 x = x + scaled_skip
-            x = self._block_with_lora(self.blocks[i], x, x0, lora, slot, q_w, k_w, v_w, out_w, up_w, down_w)
+            x_before = x
+            x = self._block_with_lora(self.blocks[i], x_before, x0, lora, slot, q_w, k_w, v_w, out_w, up_w, down_w)
+            if dec_alpha_info is not None and dec_alpha_info[skip_idx] is not None:
+                alpha = dec_alpha_info[skip_idx]  # Python float constant
+                x = torch.lerp(x_before, x, alpha)
             slot += 1
         if lane0 is not None:
             x = self._final_parallel_hidden(lane0, lane1)
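
For orientation only: the diff assumes _encoder_alpha_info and _decoder_alpha_info were already built at __init__ (per 018c and the 017 endpoint table), which this commit does not show. A hypothetical sketch of that shape, with _build_alpha_info, alpha_by_step, and the example values all invented names and placeholders:

    def _build_alpha_info(num_steps, alpha_by_step):
        # One entry per loop step: a Python float at recur-alpha sites,
        # None elsewhere so forward_ttt skips the blend for that step.
        return [float(alpha_by_step[i]) if i in alpha_by_step else None
                for i in range(num_steps)]

    # e.g. self._encoder_alpha_info = _build_alpha_info(num_encoder_steps, {2: 0.75, 5: 0.9})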
