77from huggingface_hub import hf_hub_download
88
99
# Default Hugging Face dataset repo and the root folder inside that repo.
DEFAULT_REPO_ID = "willdepueoai/parameter-golf"
DEFAULT_REMOTE_ROOT_PREFIX = "datasets"

# Repo, variant name and dataset directory used for the CaseOps data
# (see caseops_enabled_for_variant for when it is selected).
CASEOPS_REPO_ID = "romeerp/parameter-golf-caseops-v1"
CASEOPS_VARIANT = "sp8192_lossless_caps_caseops_v1_reserved"
CASEOPS_DATASET_DIR = "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved"

# Local layout: everything is placed next to this script.
ROOT = Path(__file__).resolve().parent
DATASETS_DIR = ROOT / "datasets"
TOKENIZERS_DIR = ROOT / "tokenizers"
# The spec file lives one directory above this script; its presence is what
# caseops_enabled_for_variant checks.
CASEOPS_SPEC = ROOT.parent / "tokenizer_specs_export_caseops_v1_reserved_only.json"

# Effective repo / prefix. The environment variables win over the defaults;
# main() may rebind these when the CaseOps variant is selected.
ACTIVE_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", DEFAULT_REPO_ID)
ACTIVE_REMOTE_ROOT_PREFIX = os.environ.get(
    "MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", DEFAULT_REMOTE_ROOT_PREFIX
)

# Backward-compatible aliases for the pre-rename names. main() still formats
# REMOTE_ROOT_PREFIX into its "tokenizer ... not found" error, so dropping the
# name would turn that ValueError into a NameError. Both aliases are snapshots
# taken at import time: REPO_ID does not follow a later rebinding of
# ACTIVE_REPO_ID inside main().
REPO_ID = ACTIVE_REPO_ID
REMOTE_ROOT_PREFIX = ACTIVE_REMOTE_ROOT_PREFIX
24+
25+
def caseops_enabled_for_variant(name: str) -> bool:
    """Report whether variant *name* should use the CaseOps dataset.

    Returns True only when *name* is exactly ``"sp8192"`` and the file at
    ``CASEOPS_SPEC`` exists. The filesystem is not consulted for any other
    variant name.
    """
    if name != "sp8192":
        return False
    return CASEOPS_SPEC.is_file()
1528
def dataset_dir_for_variant(name: str) -> str:
    """Return the dataset directory name for tokenizer variant *name*.

    Resolution order:
      1. If ``caseops_enabled_for_variant(name)`` is true, the CaseOps
         directory ``CASEOPS_DATASET_DIR`` is used.
      2. ``"byte260"`` maps to ``"fineweb10B_byte260"``.
      3. Any name starting with ``"sp"`` maps to ``"fineweb10B_<name>"``.

    Raises:
        ValueError: if *name* matches none of the above.
    """
    if caseops_enabled_for_variant(name):
        return CASEOPS_DATASET_DIR
    if name == "byte260":
        return "fineweb10B_byte260"
    # A single prefix check is enough: the former "sp<digits>" test returned
    # the same value and was fully covered by this broader condition.
    if name.startswith("sp"):
        return f"fineweb10B_{name}"
    raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")
2239
2340
2441def local_path_for_remote (relative_path : str ) -> Path :
2542 remote_path = Path (relative_path )
26- if REMOTE_ROOT_PREFIX and remote_path .parts [:1 ] == (REMOTE_ROOT_PREFIX ,):
27- remote_path = remote_path .relative_to (REMOTE_ROOT_PREFIX )
43+ if ACTIVE_REMOTE_ROOT_PREFIX and remote_path .parts [:1 ] == (ACTIVE_REMOTE_ROOT_PREFIX ,):
44+ remote_path = remote_path .relative_to (ACTIVE_REMOTE_ROOT_PREFIX )
2845 if remote_path .parts [:1 ] == ("datasets" ,):
2946 return DATASETS_DIR .joinpath (* remote_path .parts [1 :])
3047 if remote_path .parts [:1 ] == ("tokenizers" ,):
@@ -42,7 +59,7 @@ def get(relative_path: str) -> None:
4259 remote_path = Path (relative_path )
4360 cached_path = Path (
4461 hf_hub_download (
45- repo_id = REPO_ID ,
62+ repo_id = ACTIVE_REPO_ID ,
4663 filename = remote_path .name ,
4764 subfolder = remote_path .parent .as_posix () if remote_path .parent != Path ("." ) else None ,
4865 repo_type = "dataset" ,
@@ -59,7 +76,7 @@ def get(relative_path: str) -> None:
5976
6077
def manifest_path() -> Path:
    """Return the local path that corresponds to the remote manifest.

    The remote-relative location is ``<ACTIVE_REMOTE_ROOT_PREFIX>/manifest.json``;
    ``local_path_for_remote`` translates it to a path on the local filesystem.
    """
    remote_manifest = f"{ACTIVE_REMOTE_ROOT_PREFIX}/manifest.json"
    return local_path_for_remote(remote_manifest)
6380
6481
6582def load_manifest (* , skip_manifest_download : bool ) -> dict :
@@ -118,7 +135,13 @@ def build_parser() -> argparse.ArgumentParser:
118135
119136
120137def main () -> None :
138+ global ACTIVE_REPO_ID , ACTIVE_REMOTE_ROOT_PREFIX
121139 args = build_parser ().parse_args ()
140+ if caseops_enabled_for_variant (args .variant ):
141+ ACTIVE_REPO_ID = os .environ .get ("MATCHED_FINEWEB_REPO_ID" , CASEOPS_REPO_ID )
142+ ACTIVE_REMOTE_ROOT_PREFIX = os .environ .get (
143+ "MATCHED_FINEWEB_REMOTE_ROOT_PREFIX" , DEFAULT_REMOTE_ROOT_PREFIX
144+ )
122145 dataset_dir = dataset_dir_for_variant (args .variant )
123146 train_shards = args .train_shards_positional if args .train_shards_positional is not None else args .train_shards
124147 if train_shards < 0 :
@@ -132,25 +155,29 @@ def main() -> None:
132155 val_shards = int ((dataset_entry .get ("stats" ) or {}).get ("files_val" ))
133156 if train_shards > max_train_shards :
134157 raise ValueError (
135- f"{ args .variant } only has { max_train_shards } training shards on { REPO_ID } , requested { train_shards } "
158+ f"{ args .variant } only has { max_train_shards } training shards on { ACTIVE_REPO_ID } , requested { train_shards } "
136159 )
137160 tokenizer_name = dataset_entry .get ("tokenizer_name" )
138161 tokenizer_entry = next ((x for x in manifest .get ("tokenizers" , []) if x .get ("name" ) == tokenizer_name ), None )
139162 if tokenizer_entry is None :
140163 raise ValueError (f"tokenizer { tokenizer_name } not found in { REMOTE_ROOT_PREFIX } /manifest.json" )
141164
142165 if args .with_docs :
143- get (f"{ REMOTE_ROOT_PREFIX } /docs_selected.jsonl" )
144- get (f"{ REMOTE_ROOT_PREFIX } /docs_selected.source_manifest.json" )
166+ get (f"{ ACTIVE_REMOTE_ROOT_PREFIX } /docs_selected.jsonl" )
167+ get (f"{ ACTIVE_REMOTE_ROOT_PREFIX } /docs_selected.source_manifest.json" )
145168
146- dataset_prefix = f"{ REMOTE_ROOT_PREFIX } /datasets/{ dataset_dir } "
169+ dataset_prefix = f"{ ACTIVE_REMOTE_ROOT_PREFIX } /datasets/{ dataset_dir } "
147170 for i in range (val_shards ):
148171 get (f"{ dataset_prefix } /fineweb_val_{ i :06d} .bin" )
172+ val_bytes_glob = dataset_entry .get ("val_bytes_glob" )
173+ if val_bytes_glob :
174+ for i in range (val_shards ):
175+ get (f"{ dataset_prefix } /fineweb_val_bytes_{ i :06d} .bin" )
149176 for i in range (train_shards ):
150177 get (f"{ dataset_prefix } /fineweb_train_{ i :06d} .bin" )
151178
152179 for artifact_path in artifact_paths_for_tokenizer (tokenizer_entry ):
153- get (f"{ REMOTE_ROOT_PREFIX } /{ artifact_path } " )
180+ get (f"{ ACTIVE_REMOTE_ROOT_PREFIX } /{ artifact_path } " )
154181
155182
156183if __name__ == "__main__" :
0 commit comments