Restore faithful CaseOps replay for W99

resouer · resouer · commit e73c4883a4e6 · 2026-04-18T21:22:14.000-07:00
Round35 W99 was replaying standard sp8192 because the worker evaluator hard-coded the generic SP8192 downloader and tokenizer path. The worker now detects the CaseOps spec, downloads the romeerp HF export, enables validation byte sidecars, and points train_gpt at the lossless CaseOps dataset/tokenizer surface.

Constraint: W99 must match PR openai#1729's public CaseOps dataset/tokenizer path closely enough to make the replay meaningful
Rejected: Keep relaunching the generic SP8192 surface | it cannot validate the CaseOps claim
Confidence: high
Scope-risk: narrow
Directive: If a future tokenizer lane ships a spec file plus custom downloader surface, patch evaluator data setup before trusting any replay
Tested: python3 -m py_compile evaluate.py train_gpt.py data/cached_challenge_fineweb.py
Not-tested: End-to-end HF CaseOps download on Lepton
diff --git a/data/cached_challenge_fineweb.py b/data/cached_challenge_fineweb.py
@@ -7,24 +7,41 @@
 from huggingface_hub import hf_hub_download
 
 
-REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
-REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
+DEFAULT_REPO_ID = "willdepueoai/parameter-golf"
+DEFAULT_REMOTE_ROOT_PREFIX = "datasets"
+CASEOPS_REPO_ID = "romeerp/parameter-golf-caseops-v1"
+CASEOPS_VARIANT = "sp8192_lossless_caps_caseops_v1_reserved"
+CASEOPS_DATASET_DIR = "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved"
 ROOT = Path(__file__).resolve().parent
 DATASETS_DIR = ROOT / "datasets"
 TOKENIZERS_DIR = ROOT / "tokenizers"
+CASEOPS_SPEC = ROOT.parent / "tokenizer_specs_export_caseops_v1_reserved_only.json"
+
+ACTIVE_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", DEFAULT_REPO_ID)
+ACTIVE_REMOTE_ROOT_PREFIX = os.environ.get(
+    "MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", DEFAULT_REMOTE_ROOT_PREFIX
+)
+
+
+def caseops_enabled_for_variant(name: str) -> bool:
+    return name == "sp8192" and CASEOPS_SPEC.is_file()
 
 def dataset_dir_for_variant(name: str) -> str:
+    if caseops_enabled_for_variant(name):
+        return CASEOPS_DATASET_DIR
     if name == "byte260":
         return "fineweb10B_byte260"
     if name.startswith("sp") and name[2:].isdigit():
         return f"fineweb10B_{name}"
+    if name.startswith("sp"):
+        return f"fineweb10B_{name}"
     raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")
 
 
 def local_path_for_remote(relative_path: str) -> Path:
     remote_path = Path(relative_path)
-    if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
-        remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
+    if ACTIVE_REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (ACTIVE_REMOTE_ROOT_PREFIX,):
+        remote_path = remote_path.relative_to(ACTIVE_REMOTE_ROOT_PREFIX)
     if remote_path.parts[:1] == ("datasets",):
         return DATASETS_DIR.joinpath(*remote_path.parts[1:])
     if remote_path.parts[:1] == ("tokenizers",):
@@ -42,7 +59,7 @@ def get(relative_path: str) -> None:
     remote_path = Path(relative_path)
     cached_path = Path(
         hf_hub_download(
-            repo_id=REPO_ID,
+            repo_id=ACTIVE_REPO_ID,
             filename=remote_path.name,
             subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
             repo_type="dataset",
@@ -59,7 +76,7 @@ def get(relative_path: str) -> None:
 
 
 def manifest_path() -> Path:
-    return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")
+    return local_path_for_remote(f"{ACTIVE_REMOTE_ROOT_PREFIX}/manifest.json")
 
 
 def load_manifest(*, skip_manifest_download: bool) -> dict:
@@ -118,7 +135,13 @@ def build_parser() -> argparse.ArgumentParser:
 
 
 def main() -> None:
+    global ACTIVE_REPO_ID, ACTIVE_REMOTE_ROOT_PREFIX
     args = build_parser().parse_args()
+    if caseops_enabled_for_variant(args.variant):
+        ACTIVE_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", CASEOPS_REPO_ID)
+        ACTIVE_REMOTE_ROOT_PREFIX = os.environ.get(
+            "MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", DEFAULT_REMOTE_ROOT_PREFIX
+        )
     dataset_dir = dataset_dir_for_variant(args.variant)
     train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
     if train_shards < 0:
@@ -132,25 +155,29 @@ def main() -> None:
     val_shards = int((dataset_entry.get("stats") or {}).get("files_val"))
     if train_shards > max_train_shards:
         raise ValueError(
-            f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
+            f"{args.variant} only has {max_train_shards} training shards on {ACTIVE_REPO_ID}, requested {train_shards}"
         )
     tokenizer_name = dataset_entry.get("tokenizer_name")
     tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
     if tokenizer_entry is None:
         raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
 
     if args.with_docs:
-        get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
-        get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
+        get(f"{ACTIVE_REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
+        get(f"{ACTIVE_REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
 
-    dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
+    dataset_prefix = f"{ACTIVE_REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
     for i in range(val_shards):
         get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
+    val_bytes_glob = dataset_entry.get("val_bytes_glob")
+    if val_bytes_glob:
+        for i in range(val_shards):
+            get(f"{dataset_prefix}/fineweb_val_bytes_{i:06d}.bin")
     for i in range(train_shards):
         get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")
 
     for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
-        get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")
+        get(f"{ACTIVE_REMOTE_ROOT_PREFIX}/{artifact_path}")
 
 
 if __name__ == "__main__":
diff --git a/evaluate.py b/evaluate.py
@@ -196,14 +196,29 @@ def _make_job_command(commit_sha, branch=None):
     export TOKENIZER_META_PATH="${TOKENIZER_META_PATH:-./candidate.meta.npz}"
 else
     # Non-default vocab: use kevclark's HF repo + delete stale manifest (SP8192 not in default manifest)
-    [ "$VOCAB" -gt 1024 ] && export MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf
-    [ "$VOCAB" -gt 1024 ] && rm -f data/manifest.json
-    if [ ! -f "data/datasets/.download_complete_sp${VOCAB}" ]; then
-        python data/cached_challenge_fineweb.py --variant sp${VOCAB} --train-shards $SHARDS
-        touch "data/datasets/.download_complete_sp${VOCAB}"
+    if [ "$VOCAB" = "8192" ] && [ -f "tokenizer_specs_export_caseops_v1_reserved_only.json" ]; then
+        CASEOPS_VARIANT=sp8192_lossless_caps_caseops_v1_reserved
+        export MATCHED_FINEWEB_REPO_ID=${MATCHED_FINEWEB_REPO_ID:-romeerp/parameter-golf-caseops-v1}
+        export MATCHED_FINEWEB_REMOTE_ROOT_PREFIX=${MATCHED_FINEWEB_REMOTE_ROOT_PREFIX:-datasets}
+        rm -f data/manifest.json data/datasets/manifest.json
+        if [ ! -f "data/datasets/.download_complete_${CASEOPS_VARIANT}" ]; then
+            python data/cached_challenge_fineweb.py --variant ${CASEOPS_VARIANT} --train-shards 80
+            touch "data/datasets/.download_complete_${CASEOPS_VARIANT}"
+        fi
+        export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved}
+        export DATASETS_DIR=${DATASETS_DIR:-./data/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved}
+        export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model}
+    else
+        [ "$VOCAB" -gt 1024 ] && export MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf
+        [ "$VOCAB" -gt 1024 ] && rm -f data/manifest.json data/datasets/manifest.json
+        if [ ! -f "data/datasets/.download_complete_sp${VOCAB}" ]; then
+            python data/cached_challenge_fineweb.py --variant sp${VOCAB} --train-shards $SHARDS
+            touch "data/datasets/.download_complete_sp${VOCAB}"
+        fi
+        export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp${VOCAB}}
+        export DATASETS_DIR=${DATASETS_DIR:-./data/datasets/fineweb10B_sp${VOCAB}}
+        export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_${VOCAB}_bpe.model}
     fi
-    export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp${VOCAB}}
-    export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_${VOCAB}_bpe.model}
 fi
 """
 
diff --git a/tokenizer_specs_export_caseops_v1_reserved_only.json b/tokenizer_specs_export_caseops_v1_reserved_only.json
@@ -0,0 +1,12 @@
+{
+  "tokenizers": [
+    {
+      "name": "sp_bpe_8192_lossless_caps_caseops_v1_reserved",
+      "dataset_suffix": "sp8192_lossless_caps_caseops_v1_reserved",
+      "vocab_size": 8192,
+      "text_transform": "lossless_caps_caseops_v1",
+      "reserve_text_transform_controls": true,
+      "model_prefix": "fineweb_8192_bpe_lossless_caps_caseops_v1_reserved"
+    }
+  ]
+}
diff --git a/train_gpt.py b/train_gpt.py
@@ -118,12 +118,29 @@ class Hyperparameters:
     local_rank = int(os.environ.get("LOCAL_RANK", "0"))
     is_main_process = rank == 0
     grad_accum_steps = 8 // world_size
-    default_datasets_dir = os.path.join(data_dir, "datasets", f"fineweb10B_sp{vocab_size}")
+    caseops_spec_path = Path(__file__).with_name(
+        "tokenizer_specs_export_caseops_v1_reserved_only.json"
+    )
+    caseops_defaults_enabled = int(vocab_size) == 8192 and caseops_spec_path.is_file()
+    default_datasets_dir = (
+        os.path.join(data_dir, "datasets", "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved")
+        if caseops_defaults_enabled
+        else os.path.join(data_dir, "datasets", f"fineweb10B_sp{vocab_size}")
+    )
     datasets_dir = os.environ.get("DATASETS_DIR", default_datasets_dir)
     train_files = os.environ.get("TRAIN_FILES", os.path.join(datasets_dir, "fineweb_train_*.bin"))
     val_files = os.environ.get("VAL_FILES", os.path.join(datasets_dir, "fineweb_val_[0-9][0-9][0-9][0-9][0-9][0-9].bin"))
+    default_tokenizer_path = (
+        os.path.join(
+            data_dir,
+            "tokenizers",
+            "fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model",
+        )
+        if caseops_defaults_enabled
+        else os.path.join(data_dir, "tokenizers", f"fineweb_{vocab_size}_bpe.model")
+    )
     tokenizer_path = os.environ.get(
-        "TOKENIZER_PATH", os.path.join(data_dir, "tokenizers", f"fineweb_{vocab_size}_bpe.model")
+        "TOKENIZER_PATH", default_tokenizer_path
     )
     artifact_dir = os.environ.get("ARTIFACT_DIR", "")
     logfile = (