diff --git a/configs/refine/olmo-1b-refine-mixed-2ep.yaml b/configs/refine/olmo-1b-refine-mixed-2ep.yaml new file mode 100644 index 000000000..9435236a1 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-2ep.yaml @@ -0,0 +1,321 @@ +run_name: olmo-1b-refine-mixed-2ep-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1024 
+device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande 
+ type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b REWRITE sample set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b SOURCE sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml new file mode 100644 index 000000000..2cf46a1d6 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml @@ -0,0 +1,480 @@ +run_name: olmo-1b-refine-mixed-50pctl-dclm-Cx5-20240925 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: 
true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: 
mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-67-00000.npy + diff --git a/configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml new file mode 100644 index 000000000..78b5636ef --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml @@ -0,0 +1,893 @@ +run_name: olmo-1b-refine-mixed-50pctl-fw-Cx5-20240920 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 
0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: 
downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0007/part-0-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0007/part-1-00000.npy diff --git a/configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml new file mode 100644 index 000000000..57afd456f --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml @@ -0,0 +1,492 @@ +run_name: olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5-20241008 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 
+save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + 
wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-80-00000.npy diff --git a/configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml new file mode 100644 index 000000000..45935e442 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml @@ -0,0 +1,893 @@ +run_name: olmo-1b-refine-mixed-80pctl-fw-Cx5-20240924 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token 
count +stop_at: 61989 # ceil(130e9 / (2048 * 1024)) = 61_989 (exact quotient is ~61_988.8) +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label:
piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0007/part-0-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0007/part-1-00000.npy diff --git a/configs/refine/olmo-1b-refine-mixed-Cx2.yaml b/configs/refine/olmo-1b-refine-mixed-Cx2.yaml new file mode 100644 index 000000000..095f2e663 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-Cx2.yaml @@ -0,0 +1,492 @@ +run_name: olmo-1b-refine-mixed-Cx2-20240823 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + 
attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 tokens +# Cx2: t_max = 1.3B params * 40 = 52e9 tokens (the value this config sets in scheduler.t_max) +# Cx3: t_max = 1.3B params * 60 = 78e9 tokens + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + t_max: 52e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Total token budget; matches scheduler.t_max (Cx2 = 52e9 tokens) +stop_at: 24795 # floor(52e9 tokens / (2048 max_sequence_length * 1024 global_train_batch_size)) = 24_795 batches +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: 
mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # -------------------------------------------- Source docs 26G -------------------------------------------- + # 19.76G sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # 7.1G from sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + + # -------------------------------------------- Rewrite docs 26G -------------------------------------------- + # 9.5G sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-95-00000.npy + + # 17.6G sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + diff --git a/configs/refine/olmo-1b-refine-mixed-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-Cx5.yaml new file mode 100644 index 000000000..8e0338aae --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-Cx5.yaml @@ -0,0 +1,779 @@ +run_name: olmo-1b-refine-mixed-Cx5-20240822 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 # 784 * 128; larger than vocab_size (100278), presumably padding, TODO confirm + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 # expressed in tokens, per units above + t_max: 130e9 # Cx5 budget: 1.3B params * 100 = 130e9 tokens + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 
+sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # 130e9 tokens in total, the same value as scheduler.t_max +stop_at: 61989 # ceil(130e9 / (2048 * 1024)) = 61_989 (exact quotient is about 61_988.8); 2048 = model.max_sequence_length, 1024 = global_train_batch_size +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} # reuses device_train_microbatch_size (4) +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 # vocab_size 100278 does not fit in uint16 (max 65535); presumably the .npy files hold token ids, TODO confirm + instance_filter: # NOTE(review): looks like a filter on repeated-token runs (periods 1 to 13, up to 32 repeats); confirm against the data loader + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ------------------------------------------------------------ rewrite docs ------------------------------------------------------------ + # 100B (~78G) sample set (~70G included) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-55-00000.npy + + # --------------------------------------------------------- source docs (~70G) --------------------------------------------------------- + + # Cx1 ~20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # 100b (~50G) sample set 001 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + 
- s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml new file mode 100644 index 000000000..5334e85ae --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml @@ -0,0 +1,4251 @@ +run_name: olmo-1b-refine-mixed-length-filter-dclm-Cx5-20241008 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 
+  n_heads: 16
+  n_layers: 16
+  mlp_ratio: 8
+  weight_tying: false
+  alibi: false
+  rope: true
+  flash_attention: true
+  attention_dropout: 0.0
+  include_bias: false
+  block_type: sequential
+  layer_norm_type: rms
+  layer_norm_with_affine: true
+  layer_norm_eps: 1.0e-6  # mantissa dot keeps this a float under YAML 1.1 loaders (same form as optimizer.eps)
+  attention_layer_norm: true
+  bias_for_layer_norm: false
+  attention_layer_norm_with_affine: false
+  activation_type: swiglu
+  residual_dropout: 0.0
+  embedding_dropout: 0.0
+  max_sequence_length: 2048
+  vocab_size: 100278
+  embedding_size: 100352  # NOTE(review): presumably vocab_size padded up to a rounder size - confirm
+  eos_token_id: 100257
+  pad_token_id: 100277
+  init_device: cuda
+  init_fn: normal
+  init_std: 0.02
+  init_cutoff_factor: 3
+
+compile: null
+
+optimizer:
+  name: adamw
+  learning_rate: 0.002
+  eps: 1.0e-8
+  weight_decay: 0.05
+  decay_norm_and_bias: true
+  decay_embeddings: true
+  betas:
+    - 0.9
+    - 0.95
+  metrics_log_interval: 10
+
+scheduler:
+  name: cosine_with_warmup
+  units: tokens  # t_warmup and t_max below are token counts, not steps
+  t_warmup: 5.0e+9  # written with dot and signed exponent so every YAML loader yields a number
+  t_max: 130.0e+9  # same 130e9-token budget as max_duration
+  alpha_f: 0.01
+
+tokenizer:
+  identifier: allenai/dolma2-tokenizer
+  truncate_direction: right
+
+save_folder: runs/${run_name}
+remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name}
+save_overwrite: false
+
+save_interval: 5000
+save_num_checkpoints_to_keep: 12
+sharded_checkpointer: olmo_core
+
+save_interval_unsharded: null
+save_num_unsharded_checkpoints_to_keep: -1
+
+load_path: null
+
+max_duration: 130e9T  # Match total token count (scheduler.t_max)
+stop_at: 61989  # ceil(130e9 / (2048 * 1024)) = 61_989 steps; 2048 = max_sequence_length, 1024 = global_train_batch_size
+global_train_batch_size: 1024
+device_train_microbatch_size: 4
+
+fused_loss: true
+
+ddp:
+  grad_sync_mode: batch
+  find_unused_params: false
+
+precision: amp_bf16
+
+distributed_strategy: ddp
+
+max_grad_norm: 1.0
+max_grad_norm_ratio: null
+
+speed_monitor:
+  window_size: 1
+
+eval_interval: 2500
+eval_subset_num_batches: -1
+device_eval_batch_size: ${device_train_microbatch_size}  # interpolated from the key defined above
+evaluators:
+  - label: all-small-ppl-validation
+    data:
+      num_workers: 0
+      drop_last: true
+      memmap_dtype: uint32
+      datasets:
+        c4_en-validation:
+          - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + 
+ - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-95-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml b/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml new file mode 100644 index 000000000..581783f19 --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml @@ -0,0 +1,321 @@ +run_name: olmo-1b-refine-rewrite-only-2ep-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + 
residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + 
dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: 
mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 <20b sample set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml b/configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml new file mode 100644 index 000000000..c5a98e26b --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml @@ -0,0 +1,523 @@ +run_name: olmo-1b-refine-rewrite-only-Cx2-20240823 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 
60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + t_max: 52e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + 
repetition_min_period: 1 + repetition_max_count: 32 + paths: + # >52G tokens + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-09-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml new file mode 100644 index 000000000..0292aee47 --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml @@ -0,0 +1,991 @@ +run_name: olmo-1b-refine-rewrite-only-Cx5-20240822 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1.0e-6 + attention_layer_norm: true + bias_for_layer_norm: false +
attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5.0e+9 + t_max: 130.0e+9 # Cx5: t_max = 1.3B params * 100 = 130e9 tokens + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 # NOTE(review): 12 interval saves fit in 61989 steps; confirm the final save does not evict the first +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match scheduler.t_max (total token count) +stop_at: 61989 # ceil(130e9 / (2048 * 1024)) = 61_989 steps +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + -
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label:
mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ------------------------------------------------------------ rewrite docs ------------------------------------------------------------ + # 20b rewrites-unfiltered set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + -
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # <20b rewrites-unfiltered set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # ~85B rewrites-unfiltered + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + 
- s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-95-00000.npy + + # extra rewrites to get more data (15b tokens) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-000-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-001-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-002-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-003-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-004-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-005-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-006-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-007-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-008-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-009-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-010-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-011-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-012-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-013-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-014-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-015-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-016-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-017-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-018-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-019-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-020-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-021-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-022-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-023-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-024-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-025-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-026-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-027-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-028-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-029-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-030-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-031-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-032-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-033-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-034-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-035-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-036-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-037-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-038-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-039-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-040-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-041-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-042-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-043-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-044-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-045-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-046-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-047-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-048-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-049-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-050-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-051-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-052-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-053-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-054-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-055-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-056-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-057-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-058-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-059-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-060-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-061-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-062-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-063-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-064-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-065-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-066-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-067-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-068-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-069-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-070-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-071-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-072-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-073-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-074-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-075-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-076-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-077-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-078-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-079-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-080-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-081-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-082-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-083-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-084-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-085-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-086-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-087-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-088-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-089-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-090-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-091-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-092-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-093-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-094-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-095-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-096-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-097-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-098-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-099-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-100-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-101-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-102-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-103-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-104-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-105-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-106-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-107-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-108-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-109-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-110-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-111-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-112-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-113-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-114-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-115-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-116-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-117-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-118-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-119-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-120-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-121-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-122-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-123-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-124-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-125-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-126-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-127-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-128-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-129-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-130-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-131-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-132-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-133-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-134-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-135-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-136-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-137-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-138-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-139-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-140-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-141-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-142-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-143-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-144-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-145-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-146-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-147-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-148-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-149-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-150-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-151-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-152-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-153-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-154-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-155-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-156-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-157-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-158-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-159-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-160-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-161-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-162-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-163-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-164-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-165-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-166-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-167-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-168-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-169-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-170-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-171-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-172-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-173-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-174-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-175-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-176-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-177-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-178-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-179-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-180-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-181-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-182-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-183-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-184-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-185-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-186-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-187-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml b/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml new file mode 100644 index 000000000..0db7b47f7 --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml @@ -0,0 +1,417 @@ +run_name: olmo-1b-refine-rewrite-only-filtered-Cx2-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + 
weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 # Double our Cx1 warmup + t_max: 52e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 24795 # 52e9 tokens / (2048 seq len * 1024 batch size) = 24_795.5, floored to 24_795 steps +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + 
+ - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # 19.7G Tokens (Group 1) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-63-00000.npy + + # 
21.9G Tokens (Group 2) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-63-00000.npy + + # 19.0G Tokens (100b group 1) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml b/configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml new file mode 100644 index 000000000..39dbc3a0e --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml @@ -0,0 +1,355 @@ +run_name: olmo-1b-refine-rewrite-only-filtered-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 
0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: 
downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # 19.7G Tokens (Group 1) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-63-00000.npy + + # 21.9G Tokens (Group 2) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only.yaml b/configs/refine/olmo-1b-refine-rewrite-only.yaml new file mode 100644 index 000000000..7ec326c94 --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only.yaml @@ -0,0 +1,323 @@ +run_name: olmo-1b-refine-rewrite-only-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 
26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + 
persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 <20b sample set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only-2ep.yaml b/configs/refine/olmo-1b-refine-source-only-2ep.yaml new file mode 100644 index 000000000..a9ea00bd8 --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only-2ep.yaml @@ -0,0 +1,321 @@ +run_name: olmo-1b-refine-source-only-2ep-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} 
+save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + 
wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only-Cx2.yaml b/configs/refine/olmo-1b-refine-source-only-Cx2.yaml new file mode 100644 index 000000000..b82f712c1 --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only-Cx2.yaml @@ -0,0 +1,389 @@ +run_name: olmo-1b-refine-source-only-Cx2-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + 
attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + t_max: 52e9 # Cx2 budget (1.3B params * 40). NOTE(review): an earlier note said we don't quite have enough data for Cx2 and would stop at 30B, but stop_at below (24795 steps) covers the full 52e9 tokens; confirm which is intended + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + -
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + 
+ - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # (50b) 100b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only-Cx5.yaml b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml new file mode 100644 index 000000000..db516632c --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml @@ -0,0 +1,450 @@ +run_name: olmo-1b-refine-source-only-Cx5-20240821 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + 
+compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + 
persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # 100b (50G) sample set 001 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + + # 100b (46.9G) sample set 002 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only.yaml b/configs/refine/olmo-1b-refine-source-only.yaml new file mode 100644 index 000000000..3d0312f35 --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only.yaml @@ -0,0 +1,323 @@ +run_name: olmo-1b-refine-source-only-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max 
= 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) ~= 14_305.1, rounded down to 14_305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + 
persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-2ep-launch.sh b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh new file mode 100755 index 000000000..af2133d31 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh @@ 
-0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-2ep \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-2ep.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-2ep.sh b/scripts/beaker/refine/refine1-mixed-2ep.sh new file mode 100755 index 000000000..88dbf5d9c --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-2ep.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-2ep.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh new file mode 100755 index 000000000..481f94901 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-50pctl-dclm-cx5-20240925 \ + --description "OLMo refine 1B" \ 
+ --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh new file mode 100755 index 000000000..ac860b9c0 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh new file mode 100755 index 000000000..dae14a471 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-50pctl-fw-cx5-20240920 \ + --description "OLMo refine 
1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh new file mode 100755 index 000000000..d919cf7d9 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh new file mode 100755 index 000000000..def7c8e07 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name 
refine1-mixed-50pctl-length-filter-dclm-cx5-20241008 \ + --description "OLMo refine 1B" \ + --priority high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh new file mode 100755 index 000000000..7804a0cf5 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh new file mode 100755 index 000000000..01b7bedd8 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-80pctl-fw-cx5-20240924 \ + --description "OLMo
refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh new file mode 100755 index 000000000..e2d22fb13 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh new file mode 100755 index 000000000..4a39b630e --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-cx2-20240823 \ + --description "OLMo refine 1B" \ + --priority high \ + --preemptible 
\ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-Cx2.sh b/scripts/beaker/refine/refine1-mixed-Cx2.sh new file mode 100755 index 000000000..bff9212e4 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx2.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-Cx5-launch.sh new file mode 100755 index 000000000..88071108d --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-cx5-20240822 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + 
--beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-Cx5.sh b/scripts/beaker/refine/refine1-mixed-Cx5.sh new file mode 100755 index 000000000..5a5534be1 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh new file mode 100755 index 000000000..d0d6fd8b6 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-length-filter-dclm-cx5-20241008 \ + 
--description "OLMo refine 1B" \ + --priority high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh new file mode 100755 index 000000000..7804a0cf5 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh new file mode 100755 index 000000000..7d818fdeb --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrites-2ep \ + --description "OLMo refine 1B" \ + --priority 
urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-2ep.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-2ep.sh b/scripts/beaker/refine/refine1-rewrite-only-2ep.sh new file mode 100755 index 000000000..7dd7b19bd --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-2ep.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh new file mode 100755 index 000000000..01525c24a --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrite-only-cx2-20240823 \ + --description "OLMo refine 1B" \ + --priority 
high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx2.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2.sh new file mode 100755 index 000000000..519dcfe04 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh new file mode 100755 index 000000000..1f120f3c8 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrite-unfiltered-cx5-20240822 \ + --description "OLMo refine 1B" \ + 
--priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx5.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx5.sh new file mode 100755 index 000000000..7d8a978d8 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh new file mode 100755 index 000000000..393497c23 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrites-filtered \ + --description "OLMo 
refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh new file mode 100755 index 000000000..a23d69212 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn and dependencies +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh new file mode 100755 index 000000000..61d1df209 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrites-filtered \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection 
\ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-filtered.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered.sh new file mode 100755 index 000000000..02777888a --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn and dependencies +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-launch.sh new file mode 100755 index 000000000..4d3407d28 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrites \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only.sh b/scripts/beaker/refine/refine1-rewrite-only.sh new 
file mode 100755 index 000000000..c112e0afb --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-source-only-2ep-launch.sh b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh new file mode 100755 index 000000000..14b58f83c --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline-2ep \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only-2ep.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only-2ep.sh 
b/scripts/beaker/refine/refine1-source-only-2ep.sh new file mode 100755 index 000000000..effc356ec --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-2ep.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only-2ep.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-source-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-source-only-Cx2-launch.sh new file mode 100755 index 000000000..90989af29 --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx2-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only-Cx2.sh 
b/scripts/beaker/refine/refine1-source-only-Cx2.sh new file mode 100755 index 000000000..bd2f0ed6b --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx2.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh b/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh new file mode 100755 index 000000000..271cef6bf --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline-cx5-20240821 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git 
a/scripts/beaker/refine/refine1-source-only-Cx5.sh b/scripts/beaker/refine/refine1-source-only-Cx5.sh new file mode 100755 index 000000000..9e0234902 --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-source-only-launch.sh b/scripts/beaker/refine/refine1-source-only-launch.sh new file mode 100755 index 000000000..8862d784e --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only.sh 
b/scripts/beaker/refine/refine1-source-only.sh new file mode 100755 index 000000000..00a25133c --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-test-launch.sh b/scripts/beaker/refine/refine1-test-launch.sh new file mode 100755 index 000000000..943894bb6 --- /dev/null +++ b/scripts/beaker/refine/refine1-test-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-test.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-test.sh b/scripts/beaker/refine/refine1-test.sh new file mode 100755 index 
000000000..4fca3fae1 --- /dev/null +++ b/scripts/beaker/refine/refine1-test.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-test.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite