From e7ab8698f915d931e6cb62747ca9aacb33351df9 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 14:33:38 -0700 Subject: [PATCH 01/33] Adds refine launch/config --- configs/refine/olmo-1b-refine-mixed.yaml | 216 +++ .../refine/olmo-1b-refine-rewrite-only.yaml | 324 ++++ .../refine/olmo-1b-refine-source-only.yaml | 324 ++++ configs/refine/olmo-1b-refine-test.yaml | 1348 +++++++++++++++++ .../refine/refine1-rewrite-only-launch.sh | 38 + scripts/beaker/refine/refine1-rewrite-only.sh | 64 + scripts/beaker/refine/refine1-test-launch.sh | 38 + scripts/beaker/refine/refine1-test.sh | 64 + 8 files changed, 2416 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-mixed.yaml create mode 100644 configs/refine/olmo-1b-refine-rewrite-only.yaml create mode 100644 configs/refine/olmo-1b-refine-source-only.yaml create mode 100644 configs/refine/olmo-1b-refine-test.yaml create mode 100755 scripts/beaker/refine/refine1-rewrite-only-launch.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only.sh create mode 100755 scripts/beaker/refine/refine1-test-launch.sh create mode 100755 scripts/beaker/refine/refine1-test.sh diff --git a/configs/refine/olmo-1b-refine-mixed.yaml b/configs/refine/olmo-1b-refine-mixed.yaml new file mode 100644 index 000000000..39d214d43 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed.yaml @@ -0,0 +1,216 @@ +run_name: ${oc.env:SLURM_JOB_ID} +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} # luca to share + project: olmoe # CHANGE ME + group: null # set to run name as well, see refine examples + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + attention_layer_norm: false + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: 
swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 4.0e-4 + eps: 1.0e-8 + weight_decay: 0.1 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 8388608000 + t_max: 52e9 # Cx2 budget (52B tokens); TODO: confirm, an earlier note said to set this to 30B + alpha_f: 0.1 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 12398 # total tokens / (global_train_batch_size * max_sequence_length) = 52e9 / (1024 * 4096) ~= 12398 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +precision: amp_bf16 +distributed_strategy: fsdp + +fsdp: + wrapping_strategy: by_block + precision: mixed + sharding_strategy: FULL_SHARD + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: 
mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + paths: diff --git a/configs/refine/olmo-1b-refine-rewrite-only.yaml b/configs/refine/olmo-1b-refine-rewrite-only.yaml new file mode 100644 index 000000000..dbed1ab9a --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only.yaml @@ -0,0 +1,324 @@ +run_name: olmo-1b-refine-rewrite-only-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + 
decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: 
downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 <20b sample set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only.yaml b/configs/refine/olmo-1b-refine-source-only.yaml new file mode 100644 index 000000000..dbed1ab9a --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only.yaml @@ -0,0 +1,324 @@ +run_name: olmo-1b-refine-rewrite-only-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 
0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 <20b sample set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-test.yaml b/configs/refine/olmo-1b-refine-test.yaml new file mode 100644 index 000000000..b783e5f7f --- /dev/null +++ b/configs/refine/olmo-1b-refine-test.yaml @@ -0,0 +1,1348 @@ +run_name: olmo-1b-refine-rewrite-only-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 
+sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +auxiliary_loss_multiplier: 0.0001 + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + 
wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + 
- s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + - 
s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00004.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00003.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00002.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00001.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00000.npy + - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00004.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00000.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00001.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00002.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00003.npy + - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00004.npy + - s3://ai2-llm//preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm//preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy diff --git a/scripts/beaker/refine/refine1-rewrite-only-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-launch.sh new file mode 100755 index 000000000..6cc3204c6 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster 
ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only.sh b/scripts/beaker/refine/refine1-rewrite-only.sh new file mode 100755 index 000000000..c112e0afb --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-test-launch.sh b/scripts/beaker/refine/refine1-test-launch.sh new file mode 100755 index 000000000..9a515f8eb --- /dev/null +++ b/scripts/beaker/refine/refine1-test-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image 
petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-test.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-test.sh b/scripts/beaker/refine/refine1-test.sh new file mode 100755 index 000000000..4fca3fae1 --- /dev/null +++ b/scripts/beaker/refine/refine1-test.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-test.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 89c1c0d4f427bff73a2008d738be3e9ac3a21cd0 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 14:37:05 -0700 Subject: [PATCH 02/33] Drop new config option --- configs/refine/olmo-1b-refine-test.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/configs/refine/olmo-1b-refine-test.yaml b/configs/refine/olmo-1b-refine-test.yaml index b783e5f7f..8929cca2a 100644 --- a/configs/refine/olmo-1b-refine-test.yaml +++ b/configs/refine/olmo-1b-refine-test.yaml @@ -82,8 +82,6 @@ 
save_num_unsharded_checkpoints_to_keep: -1 load_path: null -auxiliary_loss_multiplier: 0.0001 - max_duration: 30e9T # Match total token count stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 global_train_batch_size: 1024 From 94c8ae8d35b3dd10559fc0dd7b99ec63e988b78c Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 15:04:51 -0700 Subject: [PATCH 03/33] Small change to rewrite config --- configs/refine/olmo-1b-refine-rewrite-only.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/refine/olmo-1b-refine-rewrite-only.yaml b/configs/refine/olmo-1b-refine-rewrite-only.yaml index dbed1ab9a..7ec326c94 100644 --- a/configs/refine/olmo-1b-refine-rewrite-only.yaml +++ b/configs/refine/olmo-1b-refine-rewrite-only.yaml @@ -111,7 +111,6 @@ evaluators: data: num_workers: 0 drop_last: true - generate_doc_lengths: true memmap_dtype: uint32 datasets: c4_en-validation: From f9fcf162f8a238a5e1a62b214c3b78b9b452b9ab Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 15:29:24 -0700 Subject: [PATCH 04/33] Add baseline launch/config --- .../refine/olmo-1b-refine-source-only.yaml | 2 +- .../refine/refine1-source-only-launch.sh | 38 +++++++++++ scripts/beaker/refine/refine1-source-only.sh | 64 +++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100755 scripts/beaker/refine/refine1-source-only-launch.sh create mode 100755 scripts/beaker/refine/refine1-source-only.sh diff --git a/configs/refine/olmo-1b-refine-source-only.yaml b/configs/refine/olmo-1b-refine-source-only.yaml index dbed1ab9a..f936bb92a 100644 --- a/configs/refine/olmo-1b-refine-source-only.yaml +++ b/configs/refine/olmo-1b-refine-source-only.yaml @@ -1,4 +1,4 @@ -run_name: olmo-1b-refine-rewrite-only-001 +run_name: olmo-1b-refine-source-only-001 seed: 6198 dry_run: false no_pre_train_checkpoint: true diff --git a/scripts/beaker/refine/refine1-source-only-launch.sh b/scripts/beaker/refine/refine1-source-only-launch.sh new file mode 100755 index 
000000000..ccde18b4c --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only.sh b/scripts/beaker/refine/refine1-source-only.sh new file mode 100755 index 000000000..00a25133c --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 9018e88a18afc1633a0e7d19c774aced6b0f2429 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 15:31:04 -0700 Subject: [PATCH 05/33] Match rewrite config --- configs/refine/olmo-1b-refine-source-only.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/refine/olmo-1b-refine-source-only.yaml b/configs/refine/olmo-1b-refine-source-only.yaml index f936bb92a..209b746bd 100644 --- a/configs/refine/olmo-1b-refine-source-only.yaml +++ 
b/configs/refine/olmo-1b-refine-source-only.yaml @@ -111,7 +111,6 @@ evaluators: data: num_workers: 0 drop_last: true - generate_doc_lengths: true memmap_dtype: uint32 datasets: c4_en-validation: From 61ee2f05358e1b47501a119d3f160d585bf00c89 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 16:09:13 -0700 Subject: [PATCH 06/33] Use s3 for now --- .../refine/olmo-1b-refine-source-only.yaml | 196 +++++++++--------- .../refine/refine1-source-only-launch.sh | 2 +- 2 files changed, 99 insertions(+), 99 deletions(-) diff --git a/configs/refine/olmo-1b-refine-source-only.yaml b/configs/refine/olmo-1b-refine-source-only.yaml index 209b746bd..3d0312f35 100644 --- a/configs/refine/olmo-1b-refine-source-only.yaml +++ b/configs/refine/olmo-1b-refine-source-only.yaml @@ -223,101 +223,101 @@ data: repetition_max_count: 32 paths: # Cx1 20b sample set 01 - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy - - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy - 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy 
- - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy - - # Cx1 <20b sample set 02 - - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy - 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy 
- - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy - - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/scripts/beaker/refine/refine1-source-only-launch.sh b/scripts/beaker/refine/refine1-source-only-launch.sh index ccde18b4c..a7a288bf6 100755 --- a/scripts/beaker/refine/refine1-source-only-launch.sh +++ b/scripts/beaker/refine/refine1-source-only-launch.sh @@ -12,7 +12,7 @@ gantry run \ --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ + --cluster ai2/pluto-cirrascale \ --gpus 8 \ --replicas "${NUM_NODES}" \ --leader-selection \ From 8c69b9767b9b42f12df93c9c0e5d5ec31e69bfa7 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 16:14:26 -0700 Subject: [PATCH 07/33] Cleanup --- scripts/beaker/refine/refine1-source-only-launch.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/beaker/refine/refine1-source-only-launch.sh b/scripts/beaker/refine/refine1-source-only-launch.sh index a7a288bf6..0eaeb21ef 100755 --- a/scripts/beaker/refine/refine1-source-only-launch.sh +++ b/scripts/beaker/refine/refine1-source-only-launch.sh @@ -4,6 +4,8 @@ set -ex NUM_NODES=1 + # --weka oe-training-default:/weka/oe-training-default \ + gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ @@ -19,7 +21,6 @@ gantry run \ --host-networking \ --budget ai2/oe-training \ --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ --propagate-preemption 
\ --no-python \ From 4a85106d00e71a498e4cee6c97aa942fd136d3e1 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 29 Jul 2024 16:35:11 -0700 Subject: [PATCH 08/33] Only jupiter --- scripts/beaker/refine/refine1-source-only-launch.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/beaker/refine/refine1-source-only-launch.sh b/scripts/beaker/refine/refine1-source-only-launch.sh index 0eaeb21ef..ccde18b4c 100755 --- a/scripts/beaker/refine/refine1-source-only-launch.sh +++ b/scripts/beaker/refine/refine1-source-only-launch.sh @@ -4,8 +4,6 @@ set -ex NUM_NODES=1 - # --weka oe-training-default:/weka/oe-training-default \ - gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ @@ -14,13 +12,14 @@ gantry run \ --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/pluto-cirrascale \ + --cluster ai2/jupiter-cirrascale-2 \ --gpus 8 \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ --budget ai2/oe-training \ --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ --propagate-preemption \ --no-python \ From 12b4f4c98bf0eb5e315cea81040b835a1b38c61d Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 31 Jul 2024 13:03:12 -0700 Subject: [PATCH 09/33] Double CxN mixed data setup --- configs/refine/olmo-1b-refine-mixed.yaml | 271 +++++++++++++++--- scripts/beaker/refine/refine1-mixed-launch.sh | 38 +++ scripts/beaker/refine/refine1-mixed.sh | 64 +++++ 3 files changed, 341 insertions(+), 32 deletions(-) create mode 100755 scripts/beaker/refine/refine1-mixed-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed.sh diff --git a/configs/refine/olmo-1b-refine-mixed.yaml b/configs/refine/olmo-1b-refine-mixed.yaml index 39d214d43..d11e45c23 100644 --- a/configs/refine/olmo-1b-refine-mixed.yaml +++ b/configs/refine/olmo-1b-refine-mixed.yaml @@ -1,12 +1,12 @@ -run_name: ${oc.env:SLURM_JOB_ID} +run_name: olmo-1b-refine-mixed-001 
seed: 6198 dry_run: false no_pre_train_checkpoint: true wandb: - name: ${run_name} # luca to share - project: olmoe # CHANGE ME - group: null # set to run name as well, see refine examples + name: ${run_name} + project: refine-train + group: ${run_name} model: d_model: 2048 @@ -18,22 +18,23 @@ model: rope: true flash_attention: true attention_dropout: 0.0 - attention_layer_norm: false include_bias: false block_type: sequential layer_norm_type: rms layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true bias_for_layer_norm: false attention_layer_norm_with_affine: false activation_type: swiglu residual_dropout: 0.0 embedding_dropout: 0.0 - max_sequence_length: 4096 + max_sequence_length: 2048 vocab_size: 100278 embedding_size: 100352 eos_token_id: 100257 pad_token_id: 100277 - init_device: meta + init_device: cuda init_fn: normal init_std: 0.02 init_cutoff_factor: 3 @@ -42,11 +43,11 @@ compile: null optimizer: name: adamw - learning_rate: 4.0e-4 + learning_rate: 0.002 eps: 1.0e-8 - weight_decay: 0.1 + weight_decay: 0.05 decay_norm_and_bias: true - decay_embeddings: false + decay_embeddings: true betas: - 0.9 - 0.95 @@ -55,12 +56,13 @@ optimizer: # Cx1: t_max = 1.3B params * 20 = 26e9 # Cx2: t_max = 1.3B params * 40 = 52e9 # Cx3: t_max = 1.3B params * 60 = 78e9 + scheduler: name: cosine_with_warmup units: tokens - t_warmup: 8388608000 - t_max: 52e9 # Set to 30B - alpha_f: 0.1 + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 tokenizer: identifier: allenai/dolma2-tokenizer @@ -80,18 +82,20 @@ save_num_unsharded_checkpoints_to_keep: -1 load_path: null -max_duration: 52e9T # Match total token count -stop_at: 12398 # Do math total tokens / (gbts / sequence_len) +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 global_train_batch_size: 1024 device_train_microbatch_size: 4 +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + 
precision: amp_bf16 -distributed_strategy: fsdp -fsdp: - wrapping_strategy: by_block - precision: mixed - sharding_strategy: FULL_SHARD +distributed_strategy: ddp max_grad_norm: 1.0 max_grad_norm_ratio: null @@ -107,31 +111,30 @@ evaluators: data: num_workers: 0 drop_last: true - generate_doc_lengths: true memmap_dtype: uint32 datasets: c4_en-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy dolma_books-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy dolma_common-crawl-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy dolma_pes2o-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy dolma_reddit-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy dolma_stack-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy dolma_wiki-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy ice-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy m2d2_s2orc-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy pile-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy wikitext_103-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -213,4 +216,208 @@ data: prefetch_factor: 8 persistent_workers: true timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 paths: + # Cx1 20b REWRITE sample set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 <20b REWRITE sample set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b SOURCE sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b SOURCE sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-launch.sh b/scripts/beaker/refine/refine1-mixed-launch.sh new file mode 100755 index 000000000..90f29bfa9 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env 
R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed.sh b/scripts/beaker/refine/refine1-mixed.sh new file mode 100755 index 000000000..2209484a2 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From a858da6f4bc00714332b3bd615d84acb8c1ecdbd Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 31 Jul 2024 15:00:25 -0700 Subject: [PATCH 10/33] Add 2ep setups --- configs/refine/olmo-1b-refine-mixed-2ep.yaml | 321 ++++ ...l => olmo-1b-refine-rewrite-only-2ep.yaml} | 106 +- .../olmo-1b-refine-source-only-2ep.yaml | 321 ++++ configs/refine/olmo-1b-refine-test.yaml | 1346 ----------------- ...-launch.sh => refine1-mixed-2ep-launch.sh} | 2 +- ...{refine1-mixed.sh => refine1-mixed-2ep.sh} | 2 +- .../refine/refine1-rewrite-only-2ep-launch.sh | 38 + .../beaker/refine/refine1-rewrite-only-2ep.sh | 64 + .../refine/refine1-source-only-2ep-launch.sh | 38 + .../beaker/refine/refine1-source-only-2ep.sh | 64 + 10 files changed, 851 insertions(+), 1451 deletions(-) create mode 100644 configs/refine/olmo-1b-refine-mixed-2ep.yaml rename configs/refine/{olmo-1b-refine-mixed.yaml => olmo-1b-refine-rewrite-only-2ep.yaml} (66%) create mode 100644 configs/refine/olmo-1b-refine-source-only-2ep.yaml delete mode 100644 configs/refine/olmo-1b-refine-test.yaml rename scripts/beaker/refine/{refine1-mixed-launch.sh => refine1-mixed-2ep-launch.sh} (87%) rename scripts/beaker/refine/{refine1-mixed.sh => refine1-mixed-2ep.sh} (96%) create mode 100755 scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only-2ep.sh create mode 100755 
scripts/beaker/refine/refine1-source-only-2ep-launch.sh create mode 100755 scripts/beaker/refine/refine1-source-only-2ep.sh diff --git a/configs/refine/olmo-1b-refine-mixed-2ep.yaml b/configs/refine/olmo-1b-refine-mixed-2ep.yaml new file mode 100644 index 000000000..9435236a1 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-2ep.yaml @@ -0,0 +1,321 @@ +run_name: olmo-1b-refine-mixed-2ep-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null 
+save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + 
########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b REWRITE sample set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b SOURCE sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-mixed.yaml b/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml similarity index 66% rename from configs/refine/olmo-1b-refine-mixed.yaml rename to configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml index d11e45c23..72bd63579 100644 --- a/configs/refine/olmo-1b-refine-mixed.yaml +++ b/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml @@ -1,4 +1,4 @@ -run_name: olmo-1b-refine-mixed-001 +run_name: olmo-1b-refine-rewrite-only-2ep-001 seed: 6198 dry_run: false no_pre_train_checkpoint: true @@ -222,7 +222,7 @@ data: repetition_min_period: 1 repetition_max_count: 32 paths: - # Cx1 
20b REWRITE sample set 01 + # Cx1 20b sample set 01 - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy @@ -272,7 +272,7 @@ data: - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy - # Cx1 <20b REWRITE sample set 02 + # Cx1 <20b sample set 02 - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy @@ -321,103 +321,3 @@ data: - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy - - # Cx1 20b SOURCE sample set 01 - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy - - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy - - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy - - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy - - # Cx1 20b SOURCE sample set 02 - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy - - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy - - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy - - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only-2ep.yaml b/configs/refine/olmo-1b-refine-source-only-2ep.yaml new file mode 100644 index 000000000..a9ea00bd8 --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only-2ep.yaml @@ -0,0 +1,321 @@ +run_name: olmo-1b-refine-source-only-2ep-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + 
block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: 
mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy diff --git a/configs/refine/olmo-1b-refine-test.yaml b/configs/refine/olmo-1b-refine-test.yaml deleted file mode 100644 index 8929cca2a..000000000 --- a/configs/refine/olmo-1b-refine-test.yaml +++ /dev/null @@ -1,1346 +0,0 @@ -run_name: olmo-1b-refine-rewrite-only-001 -seed: 6198 -dry_run: false -no_pre_train_checkpoint: true - -wandb: - name: ${run_name} - project: refine-train - group: ${run_name} - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - include_bias: false - block_type: sequential - layer_norm_type: rms - layer_norm_with_affine: true - layer_norm_eps: 1e-6 - attention_layer_norm: true - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 100278 - embedding_size: 100352 - eos_token_id: 100257 - pad_token_id: 100277 - init_device: cuda - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 0.002 - eps: 1.0e-8 - weight_decay: 0.05 - decay_norm_and_bias: true - decay_embeddings: true - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -# Cx1: t_max = 1.3B params * 20 = 26e9 -# Cx2: t_max = 1.3B params * 40 = 52e9 -# Cx3: t_max = 1.3B params * 60 = 78e9 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 1e9 - t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B - alpha_f: 0.01 - -tokenizer: - identifier: allenai/dolma2-tokenizer - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: 
s3://ai2-llm/checkpoints/refine-1b/${run_name} -save_overwrite: false - -save_interval: 5000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 30e9T # Match total token count -stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 -global_train_batch_size: 1024 -device_train_microbatch_size: 4 - -fused_loss: true - -ddp: - grad_sync_mode: batch - find_unused_params: false - -precision: amp_bf16 - -distributed_strategy: ddp - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - memmap_dtype: uint32 - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy - 
pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - -data: - pad_direction: right - num_workers: 16 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - memmap_dtype: uint32 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy - - 
s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy - - 
s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy - - 
s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy - - s3://ai2-llm//preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy - - 
s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy - - s3://ai2-llm//preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy - - 
s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy - - s3://ai2-llm//preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00003.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00002.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00001.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00000.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00004.npy - - 
s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00004.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00000.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00001.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00002.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00003.npy - - s3://ai2-llm//preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00004.npy - - s3://ai2-llm//preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy - - s3://ai2-llm//preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-launch.sh b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh similarity index 87% rename from scripts/beaker/refine/refine1-mixed-launch.sh rename to scripts/beaker/refine/refine1-mixed-2ep-launch.sh index 90f29bfa9..b8812081e 100755 --- a/scripts/beaker/refine/refine1-mixed-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh @@ -35,4 +35,4 @@ gantry 
run \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/refine/refine1-mixed.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-2ep.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed.sh b/scripts/beaker/refine/refine1-mixed-2ep.sh similarity index 96% rename from scripts/beaker/refine/refine1-mixed.sh rename to scripts/beaker/refine/refine1-mixed-2ep.sh index 2209484a2..88dbf5d9c 100755 --- a/scripts/beaker/refine/refine1-mixed.sh +++ b/scripts/beaker/refine/refine1-mixed-2ep.sh @@ -58,7 +58,7 @@ torchrun \ --node_rank "${BEAKER_REPLICA_RANK}" \ --rdzv_conf 'read_timeout=420' \ scripts/train.py \ - configs/refine/olmo-1b-refine-mixed.yaml \ + configs/refine/olmo-1b-refine-mixed-2ep.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh new file mode 100755 index 000000000..f7336dafb --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret 
AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-2ep.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-2ep.sh b/scripts/beaker/refine/refine1-rewrite-only-2ep.sh new file mode 100755 index 000000000..7dd7b19bd --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-2ep.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-source-only-2ep-launch.sh b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh new file mode 100755 index 000000000..9dac8ab40 --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only-2ep.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only-2ep.sh 
b/scripts/beaker/refine/refine1-source-only-2ep.sh new file mode 100755 index 000000000..effc356ec --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-2ep.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only-2ep.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 0ba08d828e64934e8dbfd3e8a03ef21b055597d8 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 1 Aug 2024 08:40:48 -0700 Subject: [PATCH 11/33] Task names --- scripts/beaker/refine/refine1-mixed-2ep-launch.sh | 2 +- scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh | 2 +- scripts/beaker/refine/refine1-rewrite-only-launch.sh | 2 +- scripts/beaker/refine/refine1-source-only-2ep-launch.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/beaker/refine/refine1-mixed-2ep-launch.sh b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh index b8812081e..4dc84b691 100755 --- a/scripts/beaker/refine/refine1-mixed-2ep-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=1 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ - --task-name refine1-baseline \ + --task-name refine1-mixed-2ep \ --description "OLMo refine 1B" \ --priority urgent \ --preemptible \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh index f7336dafb..ab0a18d7b 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=1 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ - --task-name refine1 \ + --task-name refine1-rewrites-2ep \ 
--description "OLMo refine 1B" \ --priority urgent \ --preemptible \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-launch.sh index 6cc3204c6..1b3108e93 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=1 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ - --task-name refine1 \ + --task-name refine1-rewrites \ --description "OLMo refine 1B" \ --priority urgent \ --preemptible \ diff --git a/scripts/beaker/refine/refine1-source-only-2ep-launch.sh b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh index 9dac8ab40..4afac8e99 100755 --- a/scripts/beaker/refine/refine1-source-only-2ep-launch.sh +++ b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=1 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ - --task-name refine1-baseline \ + --task-name refine1-baseline-2ep \ --description "OLMo refine 1B" \ --priority urgent \ --preemptible \ From 15d58fd2b6eb6a91cf75ff77d22cf7a3eabc28b9 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 2 Aug 2024 08:46:02 -0700 Subject: [PATCH 12/33] Fix config for rewrite only 2ep --- configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml b/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml index 72bd63579..581783f19 100644 --- a/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml +++ b/configs/refine/olmo-1b-refine-rewrite-only-2ep.yaml @@ -60,8 +60,7 @@ optimizer: scheduler: name: cosine_with_warmup units: tokens - t_warmup: 1e9 - t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + t_warmup: 2e9 alpha_f: 0.01 tokenizer: @@ -82,8 +81,7 @@ save_num_unsharded_checkpoints_to_keep: -1 load_path: null -max_duration: 30e9T # Match total token count 
-stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +max_duration: 2ep global_train_batch_size: 1024 device_train_microbatch_size: 4 From bc59dcb11c3220e831ae20168b6ab75065016baa Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 8 Aug 2024 11:07:02 -0700 Subject: [PATCH 13/33] Filtered config and launch --- .../olmo-1b-refine-rewrite-only-filtered.yaml | 355 ++++++++++++++++++ .../refine1-rewrite-only-filtered-launch.sh | 37 ++ .../refine/refine1-rewrite-only-filtered.sh | 60 +++ 3 files changed, 452 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml create mode 100755 scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only-filtered.sh diff --git a/configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml b/configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml new file mode 100644 index 000000000..39dbc3a0e --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml @@ -0,0 +1,355 @@ +run_name: olmo-1b-refine-rewrite-only-filtered-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + 
decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 1e9 + t_max: 30e9 # We don't quite have enough for Cx2 so we'll stop at 30B + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 30e9T # Match total token count +stop_at: 14305 # 30e9 / (2048 * 1024) = 14305 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: 
downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # 19.7G Tokens (Group 1) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-63-00000.npy + + # 21.9G Tokens (Group 2) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh new file mode 100755 index 000000000..7157856fd --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrites-filtered \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-filtered.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git 
a/scripts/beaker/refine/refine1-rewrite-only-filtered.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered.sh new file mode 100755 index 000000000..02777888a --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn and dependencies +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-filtered.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 160e53974bf502b27155aac4fa948324023dedaf Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 8 Aug 2024 11:18:38 -0700 Subject: [PATCH 14/33] Use oe-data budget --- scripts/beaker/refine/refine1-mixed-2ep-launch.sh | 2 +- scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh | 2 +- scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh | 2 +- scripts/beaker/refine/refine1-rewrite-only-launch.sh | 2 +- scripts/beaker/refine/refine1-source-only-2ep-launch.sh | 2 +- scripts/beaker/refine/refine1-source-only-launch.sh | 2 +- scripts/beaker/refine/refine1-test-launch.sh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/beaker/refine/refine1-mixed-2ep-launch.sh b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh index 4dc84b691..af2133d31 100755 --- a/scripts/beaker/refine/refine1-mixed-2ep-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-2ep-launch.sh @@ -17,7 +17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh index ab0a18d7b..7d818fdeb 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-2ep-launch.sh @@ -17,7 
+17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh index 7157856fd..61d1df209 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-launch.sh @@ -17,7 +17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --propagate-failure \ --propagate-preemption \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-launch.sh index 1b3108e93..4d3407d28 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-launch.sh @@ -17,7 +17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ diff --git a/scripts/beaker/refine/refine1-source-only-2ep-launch.sh b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh index 4afac8e99..14b58f83c 100755 --- a/scripts/beaker/refine/refine1-source-only-2ep-launch.sh +++ b/scripts/beaker/refine/refine1-source-only-2ep-launch.sh @@ -17,7 +17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ diff --git a/scripts/beaker/refine/refine1-source-only-launch.sh b/scripts/beaker/refine/refine1-source-only-launch.sh index ccde18b4c..8862d784e 100755 --- 
a/scripts/beaker/refine/refine1-source-only-launch.sh +++ b/scripts/beaker/refine/refine1-source-only-launch.sh @@ -17,7 +17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ diff --git a/scripts/beaker/refine/refine1-test-launch.sh b/scripts/beaker/refine/refine1-test-launch.sh index 9a515f8eb..943894bb6 100755 --- a/scripts/beaker/refine/refine1-test-launch.sh +++ b/scripts/beaker/refine/refine1-test-launch.sh @@ -17,7 +17,7 @@ gantry run \ --replicas "${NUM_NODES}" \ --leader-selection \ --host-networking \ - --budget ai2/oe-training \ + --budget ai2/oe-data \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ From 38d57c9a6a117973cf87c70e60687bc92d804e83 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 12 Aug 2024 13:53:41 -0700 Subject: [PATCH 15/33] Cx2 setups --- ...o-1b-refine-rewrite-only-filtered-Cx2.yaml | 417 ++++++++++++++++++ .../olmo-1b-refine-source-only-Cx2.yaml | 389 ++++++++++++++++ ...efine1-rewrite-only-filtered-Cx2-launch.sh | 37 ++ .../refine1-rewrite-only-filtered-Cx2.sh | 60 +++ .../refine/refine1-source-only-Cx2-launch.sh | 38 ++ .../beaker/refine/refine1-source-only-Cx2.sh | 64 +++ 6 files changed, 1005 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml create mode 100644 configs/refine/olmo-1b-refine-source-only-Cx2.yaml create mode 100755 scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh create mode 100755 scripts/beaker/refine/refine1-source-only-Cx2-launch.sh create mode 100755 scripts/beaker/refine/refine1-source-only-Cx2.sh diff --git a/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml b/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml new file mode 100644 
index 000000000..bcbbec725 --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml @@ -0,0 +1,417 @@ +run_name: olmo-1b-refine-rewrite-only-filtered-Cx2-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 # Double our Cx1 warmup + t_max: 52e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9 # Match total token count +stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: 
amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: 
downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # 19.7G Tokens (Group 1) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0001/allenai/dolma2-tokenizer/part-63-00000.npy + + # 21.9G Tokens (Group 2) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-filtered/0002/allenai/dolma2-tokenizer/part-63-00000.npy + + # 19.0G Tokens (100b group 1) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-v0-100b-filtered/0001/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/configs/refine/olmo-1b-refine-source-only-Cx2.yaml b/configs/refine/olmo-1b-refine-source-only-Cx2.yaml new file mode 100644 index 000000000..b82f712c1 --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only-Cx2.yaml @@ -0,0 +1,389 @@ +run_name: olmo-1b-refine-source-only-Cx2-001 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + 
activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + t_max: 52e9 # Cx2 budget: 1.3B params * 40 = 52e9 tokens; the run trains the full amount (see stop_at) + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: 
mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # (50b) 100b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh new file mode 100755 index 000000000..393497c23 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2-launch.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrites-filtered \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c 
"scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh new file mode 100755 index 000000000..a23d69212 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-filtered-Cx2.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn and dependencies +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-source-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-source-only-Cx2-launch.sh new file mode 100755 index 000000000..90989af29 --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx2-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only-Cx2.sh 
b/scripts/beaker/refine/refine1-source-only-Cx2.sh new file mode 100755 index 000000000..bd2f0ed6b --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx2.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From f33451dda3340fd1a6a4690b0b09d4b6855ff401 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 12 Aug 2024 17:23:09 -0700 Subject: [PATCH 16/33] Typo --- configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml b/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml index bcbbec725..0db7b47f7 100644 --- a/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml +++ b/configs/refine/olmo-1b-refine-rewrite-only-filtered-Cx2.yaml @@ -78,7 +78,7 @@ save_num_unsharded_checkpoints_to_keep: -1 load_path: null -max_duration: 52e9 # Match total token count +max_duration: 52e9T # Match total token count stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 global_train_batch_size: 1024 device_train_microbatch_size: 4 From 1a58147dad37288fe54761e377fcdb9293843e61 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 21 Aug 2024 14:04:16 -0700 Subject: [PATCH 17/33] cx5 setup for baseline --- .../olmo-1b-refine-source-only-Cx5.yaml | 450 ++++++++++++++++++ .../refine/refine1-source-only-Cx5-launch.sh | 38 ++ .../beaker/refine/refine1-source-only-Cx5.sh | 64 +++ 3 files changed, 552 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-source-only-Cx5.yaml create mode 100755 scripts/beaker/refine/refine1-source-only-Cx5-launch.sh create mode 100755 
scripts/beaker/refine/refine1-source-only-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-source-only-Cx5.yaml b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml new file mode 100644 index 000000000..db516632c --- /dev/null +++ b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml @@ -0,0 +1,450 @@ +run_name: olmo-1b-refine-source-only-Cx5-20240821 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 
+device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande 
+ type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # Cx1 20b sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # 100b (50G) sample set 001 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + 
- s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + + # 100b (46.9G) sample set 002 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/002/allenai/dolma2-tokenizer/part-63-00000.npy diff --git a/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh 
b/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh new file mode 100755 index 000000000..3b9c11c02 --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-baseline-cx5-20240821 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-source-only-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-source-only-Cx5.sh b/scripts/beaker/refine/refine1-source-only-Cx5.sh new file mode 100755 index 000000000..0912d34a7 --- /dev/null +++ b/scripts/beaker/refine/refine1-source-only-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-source-only-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From ebd731109e88729ad2f5a0175a390f41525b804a Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 21 Aug 2024 14:17:22 -0700 Subject: [PATCH 18/33] sync start --- scripts/beaker/refine/refine1-source-only-Cx5-launch.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh b/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh index 3b9c11c02..271cef6bf 100755 --- a/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh +++ 
b/scripts/beaker/refine/refine1-source-only-Cx5-launch.sh @@ -23,6 +23,7 @@ gantry run \ --propagate-failure \ --propagate-preemption \ --no-python \ + --synchronized-start-timeout 20m \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ From 80c5ff3d656286b4d7a80254c3a83b2c7301de29 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 21 Aug 2024 14:34:11 -0700 Subject: [PATCH 19/33] Try without saving --- configs/refine/olmo-1b-refine-source-only-Cx5.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/refine/olmo-1b-refine-source-only-Cx5.yaml b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml index db516632c..4c036d5d2 100644 --- a/configs/refine/olmo-1b-refine-source-only-Cx5.yaml +++ b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml @@ -69,7 +69,7 @@ remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} save_overwrite: false save_interval: 5000 -save_num_checkpoints_to_keep: 12 +save_num_checkpoints_to_keep: -1 sharded_checkpointer: olmo_core save_interval_unsharded: null From bb1e53c37b8484144c07e3920f88d67ac61627a6 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 21 Aug 2024 14:47:58 -0700 Subject: [PATCH 20/33] Drop olmo shared fs --- scripts/beaker/refine/refine1-source-only-Cx5.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/refine/refine1-source-only-Cx5.sh b/scripts/beaker/refine/refine1-source-only-Cx5.sh index 0912d34a7..9e0234902 100755 --- a/scripts/beaker/refine/refine1-source-only-Cx5.sh +++ b/scripts/beaker/refine/refine1-source-only-Cx5.sh @@ -42,7 +42,7 @@ export HF_DATASETS_OFFLINE=1 export TORCH_DIST_INIT_BARRIER=1 # Tell OLMo all ranks share the same filesystem for checkpoints. 
-export OLMO_SHARED_FS=1 +# export OLMO_SHARED_FS=1 export NCCL_DEBUG=INFO export NCCL_IB_HCA="^=mlx5_bond_0" From c8e414848d23b47dd14a18bcaa42e77a97e50098 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 21 Aug 2024 14:48:33 -0700 Subject: [PATCH 21/33] Add checkpoints back --- configs/refine/olmo-1b-refine-source-only-Cx5.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/refine/olmo-1b-refine-source-only-Cx5.yaml b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml index 4c036d5d2..db516632c 100644 --- a/configs/refine/olmo-1b-refine-source-only-Cx5.yaml +++ b/configs/refine/olmo-1b-refine-source-only-Cx5.yaml @@ -69,7 +69,7 @@ remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} save_overwrite: false save_interval: 5000 -save_num_checkpoints_to_keep: -1 +save_num_checkpoints_to_keep: 12 sharded_checkpointer: olmo_core save_interval_unsharded: null From d9f0920a7b6fafc144b595c24719c89215fda80b Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 22 Aug 2024 10:05:36 -0700 Subject: [PATCH 22/33] More Cx5 setups --- configs/refine/olmo-1b-refine-mixed-Cx5.yaml | 441 ++++++++++ .../olmo-1b-refine-rewrite-only-Cx5.yaml | 803 ++++++++++++++++++ .../refine/refine1-rewrite-only-Cx5-launch.sh | 39 + .../beaker/refine/refine1-rewrite-only-Cx5.sh | 64 ++ 4 files changed, 1347 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-mixed-Cx5.yaml create mode 100644 configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml create mode 100755 scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-Cx5.yaml new file mode 100644 index 000000000..e608faf75 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-Cx5.yaml @@ -0,0 +1,441 @@ +run_name: olmo-1b-refine-mixed-Cx5-20240822 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: 
${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + 
drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + 
type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ------------------------------------------------------------ rewrite docs ------------------------------------------------------------ + + # 20b rewrites-unfiltered set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # <20b rewrites-unfiltered set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + + + # ------------------------------------------------------------ source docs ------------------------------------------------------------ + + # Cx1 20b sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # 100b (50G) sample set 001 + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + diff --git a/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml new file mode 100644 index 000000000..30a49c89c --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml @@ -0,0 +1,803 @@ +run_name: olmo-1b-refine-rewrite-only-Cx5-20240822 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + 
metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + 
ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # 
------------------------------------------------------------ rewrite docs ------------------------------------------------------------ + + # 20b rewrites-unfiltered set 01 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # <20b rewrites-unfiltered set 02 + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy + 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy 
+ - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy + - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy + - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy + + # ~85B rewrites-unfiltered + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-95-00000.npy + diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh new file mode 100755 index 000000000..1f120f3c8 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrite-unfiltered-cx5-20240822 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env 
LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx5.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx5.sh new file mode 100755 index 000000000..7d8a978d8 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 863540359b67f0b772f4e8cc2a104b1441e12db1 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Thu, 22 Aug 2024 10:26:50 -0700 Subject: [PATCH 23/33] Mixed setup --- configs/refine/olmo-1b-refine-mixed-Cx5.yaml | 556 ++++++++++++++---- .../beaker/refine/refine1-mixed-Cx5-launch.sh | 39 ++ scripts/beaker/refine/refine1-mixed-Cx5.sh | 64 ++ 3 files changed, 550 insertions(+), 109 deletions(-) create mode 100755 scripts/beaker/refine/refine1-mixed-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-Cx5.yaml index e608faf75..8e0338aae 100644 --- a/configs/refine/olmo-1b-refine-mixed-Cx5.yaml +++ b/configs/refine/olmo-1b-refine-mixed-Cx5.yaml @@ -217,113 +217,452 @@ data: repetition_min_period: 1 repetition_max_count: 32 paths: - # ------------------------------------------------------------ rewrite docs ------------------------------------------------------------ - - # 20b rewrites-unfiltered set 01 - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy - - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy - 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy 
- - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy - - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy - - # <20b rewrites-unfiltered set 02 - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy - - 
/weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-18-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-19-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-20-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-21-00000.npy - 
- /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-22-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-23-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-24-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-25-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-26-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-27-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-28-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-29-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-30-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-31-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-32-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-33-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-34-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-35-00000.npy 
- - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-36-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-37-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-38-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-39-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-40-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-41-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-42-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-43-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-44-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-45-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-46-00000.npy - - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/002/allenai/dolma2-tokenizer/part-47-00000.npy - - - - # ------------------------------------------------------------ source docs ------------------------------------------------------------ - - # Cx1 20b sample set 01 + # ------------------------------------------------------------ rewrite docs 
------------------------------------------------------------ + # 100B (~78G) sample set (~70G included) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-55-00000.npy + + # --------------------------------------------------------- source docs (~70G) --------------------------------------------------------- + + # Cx1 ~20b sample set 01 - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy @@ -373,7 +712,7 @@ data: - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy - # 100b (50G) sample set 001 + # 100b (~50G) sample set 001 - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy @@ -438,4 +777,3 @@ data: - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/src-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy - diff --git a/scripts/beaker/refine/refine1-mixed-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-Cx5-launch.sh new file mode 100755 index 000000000..88071108d --- /dev/null +++ 
b/scripts/beaker/refine/refine1-mixed-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-cx5-20240822 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-Cx5.sh b/scripts/beaker/refine/refine1-mixed-Cx5.sh new file mode 100755 index 000000000..5a5534be1 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From f10b1e2db6b739fd7f242c9d03585c31d8b6a7a2 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 23 Aug 2024 15:50:19 -0700 Subject: [PATCH 24/33] Cx2 setups for mixed and unfiltered --- configs/refine/olmo-1b-refine-mixed-Cx2.yaml | 492 ++++++++++++++++ .../olmo-1b-refine-rewrite-only-Cx2.yaml | 523 ++++++++++++++++++ .../olmo-1b-refine-rewrite-only-Cx5.yaml | 1 - .../beaker/refine/refine1-mixed-Cx2-launch.sh | 39 ++ scripts/beaker/refine/refine1-mixed-Cx2.sh | 64 +++ 
.../refine/refine1-rewrite-only-Cx2-launch.sh | 39 ++ .../beaker/refine/refine1-rewrite-only-Cx2.sh | 64 +++ 7 files changed, 1221 insertions(+), 1 deletion(-) create mode 100644 configs/refine/olmo-1b-refine-mixed-Cx2.yaml create mode 100644 configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml create mode 100755 scripts/beaker/refine/refine1-mixed-Cx2-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-Cx2.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh create mode 100755 scripts/beaker/refine/refine1-rewrite-only-Cx2.sh diff --git a/configs/refine/olmo-1b-refine-mixed-Cx2.yaml b/configs/refine/olmo-1b-refine-mixed-Cx2.yaml new file mode 100644 index 000000000..095f2e663 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-Cx2.yaml @@ -0,0 +1,492 @@ +run_name: olmo-1b-refine-mixed-Cx2-20240823 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + 
name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + t_max: 52e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + 
ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # -------------------------------------------- 
Source docs 26G -------------------------------------------- + # 19.76G sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/001/allenai/dolma2-tokenizer/part-47-00000.npy + + # 7.1G from sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/src-20b/002/allenai/dolma2-tokenizer/part-17-00000.npy + + # -------------------------------------------- Rewrite docs 26G -------------------------------------------- + # 9.5G sample set 01 + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/001/allenai/dolma2-tokenizer/part-95-00000.npy + + # 17.6G sample set 02 + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + diff --git a/configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml b/configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml new file mode 100644 index 000000000..c5a98e26b --- /dev/null +++ b/configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml @@ -0,0 +1,523 @@ +run_name: olmo-1b-refine-rewrite-only-Cx2-20240823 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + 
name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +# Cx1: t_max = 1.3B params * 20 = 26e9 +# Cx2: t_max = 1.3B params * 40 = 52e9 +# Cx3: t_max = 1.3B params * 60 = 78e9 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 2e9 + t_max: 52e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 52e9T # Match total token count +stop_at: 24795 # 52e9 / (2048 * 1024) = 24_795 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 
+eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - 
label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # >52G tokens + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-09-00000.npy diff --git a/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml index 30a49c89c..f189de88b 100644 --- a/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml +++ b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml @@ -218,7 +218,6 @@ data: repetition_max_count: 32 paths: # ------------------------------------------------------------ rewrite docs ------------------------------------------------------------ - # 20b rewrites-unfiltered set 01 - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-00-00000.npy - /weka/oe-training-default/oe-training-default/ai2-llm/preprocessed/dclm/samples/rewrite-20b/001/allenai/dolma2-tokenizer/part-01-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh new file mode 100755 index 000000000..960395a0b --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-cx5-20240822 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env 
S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-Cx2.sh b/scripts/beaker/refine/refine1-mixed-Cx2.sh new file mode 100755 index 000000000..bff9212e4 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-Cx2.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh new file mode 100755 index 000000000..2c1b5351f --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-rewrite-unfiltered-cx5-20240822 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-rewrite-only-Cx2.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git 
a/scripts/beaker/refine/refine1-rewrite-only-Cx2.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2.sh new file mode 100755 index 000000000..519dcfe04 --- /dev/null +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-rewrite-only-Cx2.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 70462e305d66441c983f45327014b48404d2b2e0 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 23 Aug 2024 15:52:59 -0700 Subject: [PATCH 25/33] copy pazta --- scripts/beaker/refine/refine1-mixed-Cx2-launch.sh | 2 +- scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh index 960395a0b..fce2eb791 100755 --- a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=2 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ - --task-name refine1-mixed-cx5-20240822 \ + --task-name refine1-mixed-cx2-20240823 \ --description "OLMo refine 1B" \ --priority urgent \ --preemptible \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh index 2c1b5351f..b89b7a846 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=2 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ - --task-name refine1-rewrite-unfiltered-cx5-20240822 \ + --task-name refine1-rewrite-only-cx2-20240823 \ --description "OLMo refine 1B" \ --priority urgent \ --preemptible 
\ From 1e154a497d82342563843a0a0b91b33a8f9cb373 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 23 Aug 2024 15:55:42 -0700 Subject: [PATCH 26/33] Use high/1 node for cx2 runs --- scripts/beaker/refine/refine1-mixed-Cx2-launch.sh | 4 ++-- scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh index fce2eb791..8e025f1e7 100755 --- a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh @@ -2,14 +2,14 @@ set -ex -NUM_NODES=2 +NUM_NODES=1 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ --task-name refine1-mixed-cx2-20240823 \ --description "OLMo refine 1B" \ - --priority urgent \ + --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh index b89b7a846..7cef858b3 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh @@ -2,14 +2,14 @@ set -ex -NUM_NODES=2 +NUM_NODES=1 gantry run \ --workspace ai2/oe-data-model-based-cleanup \ --allow-dirty \ --task-name refine1-rewrite-only-cx2-20240823 \ --description "OLMo refine 1B" \ - --priority urgent \ + --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ From ea458357f42f6a3c9abd3ec6544a40d125afe333 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 23 Aug 2024 15:56:26 -0700 Subject: [PATCH 27/33] gantry things --- scripts/beaker/refine/refine1-mixed-Cx2-launch.sh | 1 - scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh 
b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh index 8e025f1e7..4a39b630e 100755 --- a/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-Cx2-launch.sh @@ -23,7 +23,6 @@ gantry run \ --propagate-failure \ --propagate-preemption \ --no-python \ - --synchronized-start-timeout 20m \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ diff --git a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh index 7cef858b3..01525c24a 100755 --- a/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh +++ b/scripts/beaker/refine/refine1-rewrite-only-Cx2-launch.sh @@ -23,7 +23,6 @@ gantry run \ --propagate-failure \ --propagate-preemption \ --no-python \ - --synchronized-start-timeout 20m \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ From 1c38953b3c470190a69af7db05bae7a8a3c8a6b5 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Tue, 27 Aug 2024 23:54:26 -0700 Subject: [PATCH 28/33] npy paths --- .../olmo-1b-refine-rewrite-only-Cx5.yaml | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml index f189de88b..0292aee47 100644 --- a/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml +++ b/configs/refine/olmo-1b-refine-rewrite-only-Cx5.yaml @@ -800,3 +800,192 @@ data: - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-94-00000.npy - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/002/allenai/dolma2-tokenizer/0003/part-95-00000.npy + # extra rewrites to get more data (15b tokens) + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-000-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-001-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-002-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-003-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-004-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-005-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-006-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-007-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-008-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-009-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-010-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-011-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-012-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-013-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-014-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-015-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-016-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-017-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-018-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-019-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-020-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-021-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-022-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-023-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-024-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-025-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-026-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-027-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-028-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-029-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-030-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-031-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-032-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-033-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-034-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-035-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-036-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-037-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-038-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-039-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-040-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-041-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-042-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-043-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-044-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-045-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-046-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-047-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-048-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-049-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-050-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-051-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-052-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-053-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-054-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-055-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-056-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-057-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-058-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-059-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-060-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-061-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-062-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-063-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-064-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-065-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-066-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-067-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-068-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-069-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-070-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-071-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-072-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-073-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-074-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-075-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-076-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-077-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-078-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-079-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-080-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-081-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-082-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-083-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-084-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-085-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-086-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-087-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-088-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-089-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-090-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-091-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-092-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-093-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-094-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-095-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-096-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-097-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-098-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-099-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-100-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-101-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-102-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-103-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-104-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-105-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-106-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-107-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-108-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-109-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-110-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-111-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-112-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-113-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-114-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-115-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-116-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-117-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-118-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-119-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-120-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-121-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-122-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-123-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-124-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-125-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-126-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-127-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-128-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-129-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-130-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-131-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-132-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-133-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-134-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-135-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-136-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-137-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-138-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-139-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-140-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-141-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-142-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-143-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-144-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-145-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-146-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-147-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-148-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-149-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-150-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-151-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-152-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-153-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-154-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-155-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-156-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-157-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-158-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-159-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-160-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-161-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-162-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-163-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-164-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-165-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-166-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-167-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-168-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-169-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-170-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-171-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-172-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-173-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-174-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-175-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-176-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-177-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-178-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-179-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-180-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-181-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-182-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-183-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-184-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-185-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-186-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-100b/003/allenai/dolma2-tokenizer/part-187-00000.npy From 44df68a6c5783ded8f45d5af33dbab23ab788003 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Fri, 20 Sep 2024 16:39:15 -0700 Subject: [PATCH 29/33] Add fw filtered cx5 setup --- .../olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml | 893 ++++++++++++++++++ .../refine1-mixed-50pctl-fw-Cx5-launch.sh | 39 + .../refine/refine1-mixed-50pctl-fw-Cx5.sh | 64 ++ 3 files changed, 996 insertions(+) create mode 100644 
configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml create mode 100755 scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml new file mode 100644 index 000000000..78b5636ef --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml @@ -0,0 +1,893 @@ +run_name: olmo-1b-refine-mixed-50pctl-fw-Cx5-20240920 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null 
+save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0003/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0004/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0005/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0006/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0007/part-0-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-50pctl/allenai/dolma2-tokenizer/0007/part-1-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh new file mode 100755 index 000000000..dae14a471 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-50pctl-fw-cx5-20240920 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + 
--cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh new file mode 100755 index 000000000..d919cf7d9 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-fw-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-50pctl-fw-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 7eae80e90d8b747235595a2a73718cb3bf80f9df Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Tue, 24 Sep 2024 12:05:04 -0700 Subject: [PATCH 30/33] Adds 80th percentile fw score setup --- .../olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml | 893 ++++++++++++++++++ .../refine1-mixed-80pctl-fw-Cx5-launch.sh | 39 + .../refine/refine1-mixed-80pctl-fw-Cx5.sh | 64 ++ 3 files changed, 996 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml create mode 100755 
scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml new file mode 100644 index 000000000..45935e442 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml @@ -0,0 +1,893 @@ +run_name: olmo-1b-refine-mixed-80pctl-fw-Cx5-20240924 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total 
token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - 
label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0003/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0004/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0005/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0006/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0007/part-0-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-baseline-fw-delta-80pctl/allenai/dolma2-tokenizer/0007/part-1-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh new file mode 100755 index 000000000..01b7bedd8 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-80pctl-fw-cx5-20240924 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + 
--budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh new file mode 100755 index 000000000..e2d22fb13 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-80pctl-fw-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-80pctl-fw-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From f0029e15127ca383ddfb5d4b748e66fd2038d636 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 25 Sep 2024 16:12:34 -0700 Subject: [PATCH 31/33] Add dclm ft delta setup --- .../olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml | 480 ++++++++++++++++++ .../refine1-mixed-50pctl-dclm-Cx5-launch.sh | 39 ++ .../refine/refine1-mixed-50pctl-dclm-Cx5.sh | 64 +++ 3 files changed, 583 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml create mode 100755 
scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml new file mode 100644 index 000000000..2cf46a1d6 --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml @@ -0,0 +1,480 @@ +run_name: olmo-1b-refine-mixed-50pctl-dclm-Cx5-20240925 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # 
Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + 
########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl/allenai/dolma2-tokenizer/0002/part-67-00000.npy + diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh 
b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh new file mode 100755 index 000000000..481f94901 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-50pctl-dclm-cx5-20240925 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh new file mode 100755 index 000000000..ac860b9c0 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-dclm-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-50pctl-dclm-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 6025d976b0456cba925c91ae61cea67be58df562 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Tue, 8 Oct 2024 13:38:56 -0700 Subject: [PATCH 32/33] Cx5 rewrites inupt length filter --- ...b-refine-mixed-length-filter-dclm-Cx5.yaml | 4251 +++++++++++++++++ ...ne1-mixed-length-filter-dclm-Cx5-launch.sh | 39 + .../refine1-mixed-length-filter-dclm-Cx5.sh | 64 + 3 files changed, 4354 insertions(+) create mode 100644 configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml create 
mode 100755 scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml new file mode 100644 index 000000000..5334e85ae --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml @@ -0,0 +1,4251 @@ +run_name: olmo-1b-refine-mixed-length-filter-dclm-Cx5-20241008 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 +save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null 
+save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0002/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0003/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0004/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0005/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0006/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0007/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0008/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0009/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0010/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0011/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0012/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0013/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0014/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0015/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0016/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0017/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0018/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0019/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0020/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0021/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0022/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0023/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0024/part-95-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-67-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0025/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0026/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0027/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-00-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0028/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0029/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0030/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-35-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0031/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-58-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-92-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0032/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-47-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0033/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-53-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-87-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0034/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-59-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-93-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0035/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-65-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0036/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0037/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-26-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-60-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-94-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0038/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0039/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-89-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0040/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-len-split/allenai/dolma2-tokenizer/0041/part-95-00000.npy diff --git 
a/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh new file mode 100755 index 000000000..dd0ac003d --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-length-filter-dclm-cx5-20241008 \ + --description "OLMo refine 1B" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh new file mode 100755 index 000000000..7804a0cf5 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite From 6398fb47609d3ea3dd8dec2bc809484e6a25554e Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Tue, 8 Oct 2024 16:45:38 -0700 Subject: [PATCH 33/33] 50pctl + length filter --- ...e-mixed-50pctl-length-filter-dclm-Cx5.yaml | 492 ++++++++++++++++++ ...ed-50pctl-length-filter-dclm-Cx5-launch.sh | 39 ++ ...ne1-mixed-50pctl-length-filter-dclm-Cx5.sh | 64 +++ ...ne1-mixed-length-filter-dclm-Cx5-launch.sh | 2 +- 4 files changed, 596 insertions(+), 1 deletion(-) create mode 100644 
configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml create mode 100755 scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh create mode 100755 scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh diff --git a/configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml b/configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml new file mode 100644 index 000000000..57afd456f --- /dev/null +++ b/configs/refine/olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5.yaml @@ -0,0 +1,492 @@ +run_name: olmo-1b-refine-mixed-50pctl-length-filter-dclm-Cx5-20241008 +seed: 6198 +dry_run: false +no_pre_train_checkpoint: true + +wandb: + name: ${run_name} + project: refine-train + group: ${run_name} + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + attention_layer_norm: true + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + pad_token_id: 100277 + init_device: cuda + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 0.002 + eps: 1.0e-8 + weight_decay: 0.05 + decay_norm_and_bias: true + decay_embeddings: true + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 5e9 + t_max: 130e9 + alpha_f: 0.01 + +tokenizer: + identifier: allenai/dolma2-tokenizer + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/refine-1b/${run_name} +save_overwrite: false + +save_interval: 5000 
+save_num_checkpoints_to_keep: 12 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 130e9T # Match total token count +stop_at: 61989 # 130e9 / (2048 * 1024) = 61_989 +global_train_batch_size: 1024 +device_train_microbatch_size: 4 + +fused_loss: true + +ddp: + grad_sync_mode: batch + find_unused_params: false + +precision: amp_bf16 + +distributed_strategy: ddp + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 2500 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + 
wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 16 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + memmap_dtype: uint32 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-46-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-76-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0000/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-83-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-85-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-91-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-92-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-93-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-94-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0001/part-95-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-20-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-41-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-62-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/samples/rewrite-dclm-ft-delta-50pctl-len-split/allenai/dolma2-tokenizer/0002/part-80-00000.npy diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh new file mode 100755 index 000000000..def7c8e07 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5-launch.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=2 + +gantry run \ + --workspace ai2/oe-data-model-based-cleanup \ + --allow-dirty \ + --task-name refine1-mixed-50pctl-length-filter-dclm-cx5-20241008 \ + --description "OLMo refine 1B" \ + --priority high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-data \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --no-python \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=TCM_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=TCM_AWS_CREDENTIALS \ + --env-secret WANDB_API_KEY=TCM_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh new file mode 100755 index 000000000..7804a0cf5 --- /dev/null +++ b/scripts/beaker/refine/refine1-mixed-50pctl-length-filter-dclm-Cx5.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + 
+BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd + +export HF_DATASETS_OFFLINE=1 + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. 
+# export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ + configs/refine/olmo-1b-refine-mixed-length-filter-dclm-Cx5.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite diff --git a/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh index dd0ac003d..d0d6fd8b6 100755 --- a/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh +++ b/scripts/beaker/refine/refine1-mixed-length-filter-dclm-Cx5-launch.sh @@ -9,7 +9,7 @@ gantry run \ --allow-dirty \ --task-name refine1-mixed-length-filter-dclm-cx5-20241008 \ --description "OLMo refine 1B" \ - --priority urgent \ + --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \