Skip to content

Commit e73c488

Browse files
committed
Restore faithful CaseOps replay for W99
Round35 W99 was replaying standard sp8192 because the worker evaluator hard-coded the generic SP8192 downloader and tokenizer path. The worker now detects the CaseOps spec, downloads the romeerp HF export, enables validation byte sidecars, and points train_gpt at the lossless CaseOps dataset/tokenizer surface. Constraint: W99 must match PR openai#1729's public CaseOps dataset/tokenizer path closely enough to make the replay meaningful. Rejected: keep relaunching the generic SP8192 surface — it cannot validate the CaseOps claim. Confidence: high. Scope-risk: narrow. Directive: if a future tokenizer lane ships a spec file plus a custom downloader surface, patch the evaluator data setup before trusting any replay. Tested: python3 -m py_compile evaluate.py train_gpt.py data/cached_challenge_fineweb.py. Not tested: end-to-end HF CaseOps download on Lepton.
1 parent cc99eeb commit e73c488

4 files changed

Lines changed: 91 additions & 20 deletions

File tree

data/cached_challenge_fineweb.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,41 @@
77
from huggingface_hub import hf_hub_download
88

99

10-
REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
11-
REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
10+
DEFAULT_REPO_ID = "willdepueoai/parameter-golf"
11+
DEFAULT_REMOTE_ROOT_PREFIX = "datasets"
12+
CASEOPS_REPO_ID = "romeerp/parameter-golf-caseops-v1"
13+
CASEOPS_VARIANT = "sp8192_lossless_caps_caseops_v1_reserved"
14+
CASEOPS_DATASET_DIR = "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved"
1215
ROOT = Path(__file__).resolve().parent
1316
DATASETS_DIR = ROOT / "datasets"
1417
TOKENIZERS_DIR = ROOT / "tokenizers"
18+
CASEOPS_SPEC = ROOT.parent / "tokenizer_specs_export_caseops_v1_reserved_only.json"
19+
20+
ACTIVE_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", DEFAULT_REPO_ID)
21+
ACTIVE_REMOTE_ROOT_PREFIX = os.environ.get(
22+
"MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", DEFAULT_REMOTE_ROOT_PREFIX
23+
)
24+
25+
26+
def caseops_enabled_for_variant(name: str) -> bool:
27+
return name == "sp8192" and CASEOPS_SPEC.is_file()
1528

1629
def dataset_dir_for_variant(name: str) -> str:
30+
if caseops_enabled_for_variant(name):
31+
return CASEOPS_DATASET_DIR
1732
if name == "byte260":
1833
return "fineweb10B_byte260"
1934
if name.startswith("sp") and name[2:].isdigit():
2035
return f"fineweb10B_{name}"
36+
if name.startswith("sp"):
37+
return f"fineweb10B_{name}"
2138
raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")
2239

2340

2441
def local_path_for_remote(relative_path: str) -> Path:
2542
remote_path = Path(relative_path)
26-
if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
27-
remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
43+
if ACTIVE_REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (ACTIVE_REMOTE_ROOT_PREFIX,):
44+
remote_path = remote_path.relative_to(ACTIVE_REMOTE_ROOT_PREFIX)
2845
if remote_path.parts[:1] == ("datasets",):
2946
return DATASETS_DIR.joinpath(*remote_path.parts[1:])
3047
if remote_path.parts[:1] == ("tokenizers",):
@@ -42,7 +59,7 @@ def get(relative_path: str) -> None:
4259
remote_path = Path(relative_path)
4360
cached_path = Path(
4461
hf_hub_download(
45-
repo_id=REPO_ID,
62+
repo_id=ACTIVE_REPO_ID,
4663
filename=remote_path.name,
4764
subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
4865
repo_type="dataset",
@@ -59,7 +76,7 @@ def get(relative_path: str) -> None:
5976

6077

6178
def manifest_path() -> Path:
62-
return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")
79+
return local_path_for_remote(f"{ACTIVE_REMOTE_ROOT_PREFIX}/manifest.json")
6380

6481

6582
def load_manifest(*, skip_manifest_download: bool) -> dict:
@@ -118,7 +135,13 @@ def build_parser() -> argparse.ArgumentParser:
118135

119136

120137
def main() -> None:
138+
global ACTIVE_REPO_ID, ACTIVE_REMOTE_ROOT_PREFIX
121139
args = build_parser().parse_args()
140+
if caseops_enabled_for_variant(args.variant):
141+
ACTIVE_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", CASEOPS_REPO_ID)
142+
ACTIVE_REMOTE_ROOT_PREFIX = os.environ.get(
143+
"MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", DEFAULT_REMOTE_ROOT_PREFIX
144+
)
122145
dataset_dir = dataset_dir_for_variant(args.variant)
123146
train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
124147
if train_shards < 0:
@@ -132,25 +155,29 @@ def main() -> None:
132155
val_shards = int((dataset_entry.get("stats") or {}).get("files_val"))
133156
if train_shards > max_train_shards:
134157
raise ValueError(
135-
f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
158+
f"{args.variant} only has {max_train_shards} training shards on {ACTIVE_REPO_ID}, requested {train_shards}"
136159
)
137160
tokenizer_name = dataset_entry.get("tokenizer_name")
138161
tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
139162
if tokenizer_entry is None:
140163
raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
141164

142165
if args.with_docs:
143-
get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
144-
get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
166+
get(f"{ACTIVE_REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
167+
get(f"{ACTIVE_REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
145168

146-
dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
169+
dataset_prefix = f"{ACTIVE_REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
147170
for i in range(val_shards):
148171
get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
172+
val_bytes_glob = dataset_entry.get("val_bytes_glob")
173+
if val_bytes_glob:
174+
for i in range(val_shards):
175+
get(f"{dataset_prefix}/fineweb_val_bytes_{i:06d}.bin")
149176
for i in range(train_shards):
150177
get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")
151178

152179
for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
153-
get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")
180+
get(f"{ACTIVE_REMOTE_ROOT_PREFIX}/{artifact_path}")
154181

155182

156183
if __name__ == "__main__":

evaluate.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -196,14 +196,29 @@ def _make_job_command(commit_sha, branch=None):
196196
export TOKENIZER_META_PATH="${TOKENIZER_META_PATH:-./candidate.meta.npz}"
197197
else
198198
# Non-default vocab: use kevclark's HF repo + delete stale manifest (SP8192 not in default manifest)
199-
[ "$VOCAB" -gt 1024 ] && export MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf
200-
[ "$VOCAB" -gt 1024 ] && rm -f data/manifest.json
201-
if [ ! -f "data/datasets/.download_complete_sp${VOCAB}" ]; then
202-
python data/cached_challenge_fineweb.py --variant sp${VOCAB} --train-shards $SHARDS
203-
touch "data/datasets/.download_complete_sp${VOCAB}"
199+
if [ "$VOCAB" = "8192" ] && [ -f "tokenizer_specs_export_caseops_v1_reserved_only.json" ]; then
200+
CASEOPS_VARIANT=sp8192_lossless_caps_caseops_v1_reserved
201+
export MATCHED_FINEWEB_REPO_ID=${MATCHED_FINEWEB_REPO_ID:-romeerp/parameter-golf-caseops-v1}
202+
export MATCHED_FINEWEB_REMOTE_ROOT_PREFIX=${MATCHED_FINEWEB_REMOTE_ROOT_PREFIX:-datasets}
203+
rm -f data/manifest.json data/datasets/manifest.json
204+
if [ ! -f "data/datasets/.download_complete_${CASEOPS_VARIANT}" ]; then
205+
python data/cached_challenge_fineweb.py --variant ${CASEOPS_VARIANT} --train-shards 80
206+
touch "data/datasets/.download_complete_${CASEOPS_VARIANT}"
207+
fi
208+
export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved}
209+
export DATASETS_DIR=${DATASETS_DIR:-./data/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved}
210+
export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model}
211+
else
212+
[ "$VOCAB" -gt 1024 ] && export MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf
213+
[ "$VOCAB" -gt 1024 ] && rm -f data/manifest.json data/datasets/manifest.json
214+
if [ ! -f "data/datasets/.download_complete_sp${VOCAB}" ]; then
215+
python data/cached_challenge_fineweb.py --variant sp${VOCAB} --train-shards $SHARDS
216+
touch "data/datasets/.download_complete_sp${VOCAB}"
217+
fi
218+
export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp${VOCAB}}
219+
export DATASETS_DIR=${DATASETS_DIR:-./data/datasets/fineweb10B_sp${VOCAB}}
220+
export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_${VOCAB}_bpe.model}
204221
fi
205-
export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp${VOCAB}}
206-
export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_${VOCAB}_bpe.model}
207222
fi
208223
"""
209224

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"tokenizers": [
3+
{
4+
"name": "sp_bpe_8192_lossless_caps_caseops_v1_reserved",
5+
"dataset_suffix": "sp8192_lossless_caps_caseops_v1_reserved",
6+
"vocab_size": 8192,
7+
"text_transform": "lossless_caps_caseops_v1",
8+
"reserve_text_transform_controls": true,
9+
"model_prefix": "fineweb_8192_bpe_lossless_caps_caseops_v1_reserved"
10+
}
11+
]
12+
}

train_gpt.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,29 @@ class Hyperparameters:
118118
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
119119
is_main_process = rank == 0
120120
grad_accum_steps = 8 // world_size
121-
default_datasets_dir = os.path.join(data_dir, "datasets", f"fineweb10B_sp{vocab_size}")
121+
caseops_spec_path = Path(__file__).with_name(
122+
"tokenizer_specs_export_caseops_v1_reserved_only.json"
123+
)
124+
caseops_defaults_enabled = caseops_spec_path.is_file()
125+
default_datasets_dir = (
126+
os.path.join(data_dir, "datasets", "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved")
127+
if caseops_defaults_enabled
128+
else os.path.join(data_dir, "datasets", f"fineweb10B_sp{vocab_size}")
129+
)
122130
datasets_dir = os.environ.get("DATASETS_DIR", default_datasets_dir)
123131
train_files = os.environ.get("TRAIN_FILES", os.path.join(datasets_dir, "fineweb_train_*.bin"))
124132
val_files = os.environ.get("VAL_FILES", os.path.join(datasets_dir, "fineweb_val_[0-9][0-9][0-9][0-9][0-9][0-9].bin"))
133+
default_tokenizer_path = (
134+
os.path.join(
135+
data_dir,
136+
"tokenizers",
137+
"fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model",
138+
)
139+
if caseops_defaults_enabled
140+
else os.path.join(data_dir, "tokenizers", f"fineweb_{vocab_size}_bpe.model")
141+
)
125142
tokenizer_path = os.environ.get(
126-
"TOKENIZER_PATH", os.path.join(data_dir, "tokenizers", f"fineweb_{vocab_size}_bpe.model")
143+
"TOKENIZER_PATH", default_tokenizer_path
127144
)
128145
artifact_dir = os.environ.get("ARTIFACT_DIR", "")
129146
logfile = (

0 commit comments

Comments (0)