From d69e598e0dfd34f8d7ff3083924fd88a0eb07836 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:20:51 +0100 Subject: [PATCH 1/7] Olmo > OLMo --- configs/c4-medium_ckptfine.yaml | 184 ++++++++++++++++++ configs/c4-medium_ckptoneintwo.yaml | 184 ++++++++++++++++++ configs/c4-medium_ckptwhole.yaml | 184 ++++++++++++++++++ configs/c4-medium_nockpt.yaml | 182 +++++++++++++++++ configs/c4-small_nockpt.yaml | 183 +++++++++++++++++ configs/olmo_nockpt.yml | 86 ++++++++ configs/olmo_wholeckpt.yml | 88 +++++++++ configs/v1_5-mix-medium-mitch-ish_nockpt.yaml | 98 ++++++++++ docs/NOTES.md | 4 +- hf_olmo/configuration_olmo.py | 4 +- hf_olmo/modeling_olmo.py | 6 +- hf_olmo/tokenization_olmo_fast.py | 2 +- inference/NOTES.md | 4 +- .../AutoGPTQ/auto_gptq/modeling/auto.py | 4 +- .../AutoGPTQ/auto_gptq/modeling/olmo.py | 4 +- inference/compression/olmo_gptq_class.py | 8 +- olmo/ckptavg.py | 17 ++ olmo/config.py | 6 +- olmo/data/__init__.py | 8 +- olmo/data/memmap_dataset.py | 4 +- olmo/eval/__init__.py | 4 +- olmo/exceptions.py | 14 +- olmo/model.py | 82 ++++---- olmo/tokenizer.py | 6 +- olmo/train.py | 8 +- olmo/util.py | 28 +-- scripts/average_ckpts.py | 71 +++++++ scripts/average_ckpts_advanced.py | 123 ++++++++++++ scripts/avgckpts_stepbystep.py | 127 ++++++++++++ scripts/init_config.py | 4 +- scripts/inspect_train_data.py | 4 +- .../lumi/v1_5-mix-medium-mitch-ish_nockpt.sh | 55 ++++++ scripts/show_model_size.py | 8 +- scripts/train.py | 14 +- test_fixtures/test-olmo-model/config.json | 2 +- tests/hf_olmo/hf_olmo_test.py | 4 +- tests/hf_olmo/modeling_olmo_test.py | 4 +- tests/model_test.py | 12 +- 38 files changed, 1706 insertions(+), 124 deletions(-) create mode 100644 configs/c4-medium_ckptfine.yaml create mode 100644 configs/c4-medium_ckptoneintwo.yaml create mode 100644 configs/c4-medium_ckptwhole.yaml create mode 100644 configs/c4-medium_nockpt.yaml create mode 100644 configs/c4-small_nockpt.yaml create mode 100644 configs/olmo_nockpt.yml create mode 
100644 configs/olmo_wholeckpt.yml create mode 100644 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml create mode 100644 olmo/ckptavg.py create mode 100644 scripts/average_ckpts.py create mode 100644 scripts/average_ckpts_advanced.py create mode 100644 scripts/avgckpts_stepbystep.py create mode 100644 scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh diff --git a/configs/c4-medium_ckptfine.yaml b/configs/c4-medium_ckptfine.yaml new file mode 100644 index 000000000..f5e77a958 --- /dev/null +++ b/configs/c4-medium_ckptfine.yaml @@ -0,0 +1,184 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +activation_checkpointing: fine_grained + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 
+save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-medium_ckptoneintwo.yaml b/configs/c4-medium_ckptoneintwo.yaml new file mode 100644 index 000000000..4ceb2901f --- /dev/null +++ b/configs/c4-medium_ckptoneintwo.yaml @@ -0,0 +1,184 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +activation_checkpointing: one_in_two + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-medium_ckptwhole.yaml b/configs/c4-medium_ckptwhole.yaml new file mode 100644 index 000000000..2e8084d32 --- /dev/null +++ b/configs/c4-medium_ckptwhole.yaml @@ -0,0 +1,184 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +activation_checkpointing: whole_layer + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-medium_nockpt.yaml b/configs/c4-medium_nockpt.yaml new file mode 100644 index 000000000..0d862117b --- /dev/null +++ b/configs/c4-medium_nockpt.yaml @@ -0,0 +1,182 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-small_nockpt.yaml b/configs/c4-small_nockpt.yaml new file mode 100644 index 000000000..bdc2e04a9 --- /dev/null +++ b/configs/c4-small_nockpt.yaml @@ -0,0 +1,183 @@ +run_name: c4-small-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: c4-small + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 2.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 9 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 10000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 8 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/olmo_nockpt.yml b/configs/olmo_nockpt.yml new file mode 100644 index 000000000..f09396738 --- /dev/null +++ b/configs/olmo_nockpt.yml @@ -0,0 +1,86 @@ +run_name: olmo-small-ablation +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: c4-small + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50277 + embedding_size: 50304 + eos_token_id: 50276 + pad_token_id: 50276 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} + pad_direction: right + num_workers: 4 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: EleutherAI/gpt-neox-20b + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 9 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 10000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 953674 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 1024 +device_train_microbatch_size: 8 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/olmo_wholeckpt.yml b/configs/olmo_wholeckpt.yml new file mode 100644 index 000000000..efc53dbf2 --- /dev/null +++ b/configs/olmo_wholeckpt.yml @@ -0,0 +1,88 @@ +run_name: olmo-small-ablation +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: c4-small + +activation_checkpointing: whole_layer + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50277 + embedding_size: 50304 + eos_token_id: 50276 + pad_token_id: 50276 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD 
GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} + pad_direction: right + num_workers: 4 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: EleutherAI/gpt-neox-20b + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 9 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 10000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 953674 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 1024 +device_train_microbatch_size: 8 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml new file mode 100644 index 000000000..0c0974f0e --- /dev/null +++ b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml @@ -0,0 +1,98 @@ +run_name: v1_5-mix-medium-mitch-ish +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: v1_5-mix + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + # mlp_ratio: 6 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + flash_attention: false # not available on AMD + attention_dropout: 0.0 + attention_layer_norm: false + multi_query_attention: false + include_bias: false + block_type: 
sequential + layer_norm_type: default + layer_norm_with_affine: false + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50280 + embedding_size: 50304 + eos_token_id: 0 + pad_token_id: 1 + init_device: meta + init_fn: mitchell + +compile: null + +optimizer: + name: adamw + learning_rate: 3.0e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: linear_with_warmup + t_warmup: 5000 + alpha_f: 0.1 + grad_clip_warmup_steps: 1000 + grad_clip_warmup_factor: 10.0 + +tokenizer: + identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json + truncate_direction: right + +save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: -1 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: null # getting errors on LUMI right now +save_num_unsharded_checkpoints_to_keep: -1 +no_pre_train_checkpoint: true + +load_path: null + +max_duration: 50 # 2T tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: null + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 20 + +data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} + pad_direction: right + num_workers: 0 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 diff --git a/docs/NOTES.md b/docs/NOTES.md index 6a8f3bfa7..c6611b33a 100644 --- a/docs/NOTES.md +++ b/docs/NOTES.md @@ -70,10 +70,10 @@ For example, checkpoints for the run [https://wandb.ai/ai2-llm/c4-small/runs/euo You can load a checkpoint like this: ```python -from olmo 
import Olmo, Tokenizer +from olmo import OLMo, Tokenizer checkpoint = "gs://ai2-olmo/ai2-llm/c4-small/euox4j8q/step73000-unsharded" -model = Olmo.from_checkpoint(checkpoint, device="cuda") +model = OLMo.from_checkpoint(checkpoint, device="cuda") tokenizer = Tokenizer.from_checkpoint(checkpoint) ``` diff --git a/hf_olmo/configuration_olmo.py b/hf_olmo/configuration_olmo.py index 5b15fa194..cb7670f6c 100644 --- a/hf_olmo/configuration_olmo.py +++ b/hf_olmo/configuration_olmo.py @@ -21,8 +21,8 @@ def __init__(self, use_cache: bool = False, **kwargs): all_kwargs.update({"use_cache": use_cache}) all_kwargs.update( { - "architectures": all_kwargs.get("architectures", ["OlmoModelForCausalLM"]) - or ["OlmoModelForCausalLM"] + "architectures": all_kwargs.get("architectures", ["OLMoModelForCausalLM"]) + or ["OLMoModelForCausalLM"] } ) super().__init__(**all_kwargs) diff --git a/hf_olmo/modeling_olmo.py b/hf_olmo/modeling_olmo.py index 6a279cb10..a1cc569f7 100644 --- a/hf_olmo/modeling_olmo.py +++ b/hf_olmo/modeling_olmo.py @@ -7,7 +7,7 @@ from transformers.models.auto import AutoModelForCausalLM from olmo.config import ModelConfig -from olmo.model import Olmo +from olmo.model import OLMo from .configuration_olmo import OLMoConfig @@ -34,14 +34,14 @@ class OLMoForCausalLM(PreTrainedModel): base_model_prefix = "model" _no_split_modules = ["OLMoBlock"] - def __init__(self, config: OLMoConfig, model: Optional[Olmo] = None, init_params: bool = False): + def __init__(self, config: OLMoConfig, model: Optional[OLMo] = None, init_params: bool = False): super().__init__(config) if not model: model_config = create_model_config_from_pretrained_config(config) # Initialize model (always on CPU to start with so we don't run out of GPU memory). 
model_config.init_device = "cpu" - self.model = Olmo(model_config, init_params=init_params) + self.model = OLMo(model_config, init_params=init_params) else: self.model = model diff --git a/hf_olmo/tokenization_olmo_fast.py b/hf_olmo/tokenization_olmo_fast.py index e2bd665d1..19543a6c7 100644 --- a/hf_olmo/tokenization_olmo_fast.py +++ b/hf_olmo/tokenization_olmo_fast.py @@ -4,7 +4,7 @@ class OLMoTokenizerFast(PreTrainedTokenizerFast): - # Note: Olmo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary. + # Note: OLMo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary. pass # def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: diff --git a/inference/NOTES.md b/inference/NOTES.md index 0af0f9d09..ea57a322a 100644 --- a/inference/NOTES.md +++ b/inference/NOTES.md @@ -45,12 +45,12 @@ To add an `olmo.py` module, we can basically just imitate what was done for othe There's one important wrinkle here: some OLMo models use *fused linear attention*. I'm not sure how GPTQ handles this or whether any existing supported models implement attention the same way. This might be something to discuss with Dirk and Pete. 
```python -Olmo( +OLMo( (transformer): ModuleDict( (wte): Embedding(50304, 768) (emb_drop): Dropout(p=0.1, inplace=False) (blocks): ModuleList( - (0-11): 12 x OlmoSequentialBlock( + (0-11): 12 x OLMoSequentialBlock( (dropout): Dropout(p=0.1, inplace=False) (norm): LayerNorm() (act): SwiGLU() diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py index 46ce32e69..9beb2ff33 100644 --- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py +++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py @@ -13,7 +13,7 @@ from .internlm import InternLMGPTQForCausalLM from .llama import LlamaGPTQForCausalLM from .moss import MOSSGPTQForCausalLM -from .olmo import OlmoGPTQForCausalLM +from .olmo import OLMoGPTQForCausalLM from .opt import OPTGPTQForCausalLM from .qwen import QwenGPTQForCausalLM from .rw import RWGPTQForCausalLM @@ -24,7 +24,7 @@ "gptj": GPTJGPTQForCausalLM, "gpt2": GPT2GPTQForCausalLM, "llama": LlamaGPTQForCausalLM, - "olmo": OlmoGPTQForCausalLM, + "olmo": OLMoGPTQForCausalLM, "opt": OPTGPTQForCausalLM, "moss": MOSSGPTQForCausalLM, "gpt_bigcode": GPTBigCodeGPTQForCausalLM, diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py index 0bf18fc8c..01264bfdb 100644 --- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py +++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py @@ -1,7 +1,7 @@ from ._base import * -class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): +class OLMoGPTQForCausalLM(BaseGPTQForCausalLM): # Attribute name of Transformer layer block. 
layers_block_name = "model.transformer.blocks" @@ -19,4 +19,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]] -__all__ = ["OlmoGPTQForCausalLM"] +__all__ = ["OLMoGPTQForCausalLM"] diff --git a/inference/compression/olmo_gptq_class.py b/inference/compression/olmo_gptq_class.py index 645349d7b..0f6580a59 100644 --- a/inference/compression/olmo_gptq_class.py +++ b/inference/compression/olmo_gptq_class.py @@ -1,7 +1,7 @@ from auto_gptq.modeling._base import BaseGPTQForCausalLM -class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): +class OLMoGPTQForCausalLM(BaseGPTQForCausalLM): # Attribute name of Transformer layer block. layers_block_name = "model.transformer.blocks" @@ -17,12 +17,12 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]] -__all__ = ["OlmoGPTQForCausalLM"] +__all__ = ["OLMoGPTQForCausalLM"] # NOTE: In progress; may change if OLMo model is updated. -# class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): +# class OLMoGPTQForCausalLM(BaseGPTQForCausalLM): # # Attribute name of Transformer layer block. 
# layers_block_name = "transformer.blocks" # NOTE(wadden) Correct # @@ -51,4 +51,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): # ] -# __all__ = ["OlmoGPTQForCausalLM"] +# __all__ = ["OLMoGPTQForCausalLM"] diff --git a/olmo/ckptavg.py b/olmo/ckptavg.py new file mode 100644 index 000000000..8531f62da --- /dev/null +++ b/olmo/ckptavg.py @@ -0,0 +1,17 @@ +import torch + +STATEDICTS = [ + "advaveraged25.pt", + "advaveraged2550.pt", + "advaveraged5075.pt", + "advaveraged75.pt", +] + +sd = torch.load(STATEDICTS[0]) +for state_dict in STATEDICTS[1:]: + sd2 = torch.load(state_dict) + for k,v in sd2.items(): + assert k not in sd + sd[k] = v + +torch.save(sd, "advaveraged.pt") diff --git a/olmo/config.py b/olmo/config.py index c0f26b08b..618ee97dd 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -23,7 +23,7 @@ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy from .aliases import PathOrStr -from .exceptions import OlmoConfigurationError +from .exceptions import OLMoConfigurationError from .util import StrEnum __all__ = [ @@ -116,7 +116,7 @@ def new(cls: Type[C], **kwargs) -> C: conf = om.merge(conf, kwargs) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise OlmoConfigurationError(str(e)) + raise OLMoConfigurationError(str(e)) @classmethod def load( @@ -139,7 +139,7 @@ def load( conf = om.merge(conf, om.from_dotlist(overrides)) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise OlmoConfigurationError(str(e)) + raise OLMoConfigurationError(str(e)) def save(self, path: PathOrStr) -> None: """Save to a YAML file.""" diff --git a/olmo/data/__init__.py b/olmo/data/__init__.py index 52421b57a..7d8fbb56b 100644 --- a/olmo/data/__init__.py +++ b/olmo/data/__init__.py @@ -5,7 +5,7 @@ from ..aliases import PathOrStr from ..config import DataConfig, TrainConfig -from ..exceptions import OlmoConfigurationError +from ..exceptions import OLMoConfigurationError from ..torch_util import barrier, 
get_global_rank, get_world_size from .collator import DataCollator from .iterable_dataset import IterableDataset @@ -21,7 +21,7 @@ def build_memmap_dataset( metadata: List[Dict[str, Any]] = [] if data_config.paths: if data_config.datasets: - raise OlmoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets") + raise OLMoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets") paths = data_config.paths for path in paths: metadata.append({"path": str(path)}) @@ -32,7 +32,7 @@ def build_memmap_dataset( paths.extend(label_paths) metadata.extend([{"label": label}] * len(label_paths)) else: - raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") + raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") return MemMapDataset( *paths, chunk_size=train_config.model.max_sequence_length, @@ -87,7 +87,7 @@ def build_train_dataloader(train_config: TrainConfig) -> DataLoader: work_dir = Path(train_config.save_folder) / "train_data" if get_global_rank() == 0: if work_dir.is_dir() and not train_config.save_overwrite: - raise OlmoConfigurationError( + raise OLMoConfigurationError( "train data working directory already exists, use --save_overwrite to overwrite" ) else: diff --git a/olmo/data/memmap_dataset.py b/olmo/data/memmap_dataset.py index 5af73c277..c00f29e06 100644 --- a/olmo/data/memmap_dataset.py +++ b/olmo/data/memmap_dataset.py @@ -7,7 +7,7 @@ import torch from torch.utils.data import Dataset -from olmo.exceptions import OlmoEnvironmentError +from olmo.exceptions import OLMoEnvironmentError from ..aliases import PathOrStr from ..util import _get_s3_client, file_size, get_bytes_range @@ -93,7 +93,7 @@ def offsets(self) -> List[Tuple[int, int]]: _get_s3_client("s3") try: _get_s3_client("r2") - except OlmoEnvironmentError: + except OLMoEnvironmentError: # R2 might not be needed, so ignore this error. We will get an error # later if R2 is needed. 
pass diff --git a/olmo/eval/__init__.py b/olmo/eval/__init__.py index 4c53f4b25..17dcc77fe 100644 --- a/olmo/eval/__init__.py +++ b/olmo/eval/__init__.py @@ -5,7 +5,7 @@ from torchmetrics import MeanMetric, Metric from ..config import EvaluatorConfig, EvaluatorType, TrainConfig -from ..exceptions import OlmoConfigurationError +from ..exceptions import OLMoConfigurationError from ..tokenizer import Tokenizer from ..torch_util import get_global_rank, get_world_size from .downstream import ICLMetric, label_to_task_map @@ -90,7 +90,7 @@ def make_metric(): elif eval_config.data.datasets: eval_metric = {label: make_metric() for label in eval_config.data.datasets.keys()} else: - raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") + raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") return Evaluator( label=eval_config.label, diff --git a/olmo/exceptions.py b/olmo/exceptions.py index 754580c95..5474facc3 100644 --- a/olmo/exceptions.py +++ b/olmo/exceptions.py @@ -1,37 +1,37 @@ -__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError", "OlmoEnvironmentError", "OlmoNetworkError"] +__all__ = ["OLMoError", "OLMoConfigurationError", "OLMoCliError", "OLMoEnvironmentError", "OLMoNetworkError"] -class OlmoError(Exception): +class OLMoError(Exception): """ Base class for all custom OLMo exceptions. """ -class OlmoConfigurationError(OlmoError): +class OLMoConfigurationError(OLMoError): """ An error with a configuration file. """ -class OlmoCliError(OlmoError): +class OLMoCliError(OLMoError): """ An error from incorrect CLI usage. """ -class OlmoEnvironmentError(OlmoError): +class OLMoEnvironmentError(OLMoError): """ An error from incorrect environment variables. """ -class OlmoNetworkError(OlmoError): +class OLMoNetworkError(OLMoError): """ An error with a network request. """ -class OlmoThreadError(Exception): +class OLMoThreadError(Exception): """ Raised when a thread fails. 
""" diff --git a/olmo/model.py b/olmo/model.py index a11eceb71..6100005ee 100644 --- a/olmo/model.py +++ b/olmo/model.py @@ -42,7 +42,7 @@ LayerNormType, ModelConfig, ) -from .exceptions import OlmoConfigurationError +from .exceptions import OLMoConfigurationError from .initialization import ModuleType, init_weights from .torch_util import ensure_finite_ @@ -63,12 +63,12 @@ "GELU", "ReLU", "SwiGLU", - "OlmoBlock", - "OlmoSequentialBlock", - "OlmoParallelBlock", - "Olmo", - "OlmoOutput", - "OlmoGenerateOutput", + "OLMoBlock", + "OLMoSequentialBlock", + "OLMoParallelBlock", + "OLMo", + "OLMoOutput", + "OLMoGenerateOutput", ] @@ -421,7 +421,7 @@ def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore -class OlmoBlock(nn.Module): +class OLMoBlock(nn.Module): """ A base class for transformer block implementations. """ @@ -620,18 +620,18 @@ def forward( raise NotImplementedError @classmethod - def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OlmoBlock: + def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OLMoBlock: if config.block_type == BlockType.sequential: - return OlmoSequentialBlock(layer_id, config, cache) + return OLMoSequentialBlock(layer_id, config, cache) elif config.block_type == BlockType.parallel: - return OlmoParallelBlock(layer_id, config, cache) + return OLMoParallelBlock(layer_id, config, cache) elif config.block_type == BlockType.llama: - return OlmoLlamaBlock(layer_id, config, cache) + return OLMoLlamaBlock(layer_id, config, cache) else: raise NotImplementedError(f"Unknown block type: '{config.block_type}'") -class OlmoSequentialBlock(OlmoBlock): +class OLMoSequentialBlock(OLMoBlock): """ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
@@ -717,11 +717,11 @@ def forward( return x, cache -class OlmoParallelBlock(OlmoBlock): +class OLMoParallelBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x)) + Attention(LN(x))`` as in the PaLM architecture, as opposed to the typical ``MLP(LN(x + Attention(LN(x))))`` - as in :class:`OlmoSequentialBlock` (ignoring some skip connections). + as in :class:`OLMoSequentialBlock` (ignoring some skip connections). The decoupling of the MLP and Attention functions allow us to fuse the separate input projections into a single linear layer to increase throughput. In this configuration it's also straight-forward @@ -804,10 +804,10 @@ def forward( ) -class OlmoLlamaBlock(OlmoBlock): +class OLMoLlamaBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` - (plus another skip connection). This block is similar to `OlmoSequentialBlock` + (plus another skip connection). This block is similar to `OLMoSequentialBlock` but some operations have slightly different implementations to imitate the behavior of Llama. """ @@ -922,7 +922,7 @@ def forward( return x, cache -class OlmoOutput(NamedTuple): +class OLMoOutput(NamedTuple): logits: torch.FloatTensor """ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities @@ -940,7 +940,7 @@ class OlmoOutput(NamedTuple): """ -class OlmoGenerateOutput(NamedTuple): +class OLMoGenerateOutput(NamedTuple): token_ids: torch.LongTensor """ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`. 
@@ -953,7 +953,7 @@ class OlmoGenerateOutput(NamedTuple): """ -class OlmoBlockGroup(nn.ModuleList): +class OLMoBlockGroup(nn.ModuleList): def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None): super().__init__(modules) self.config = config @@ -1009,7 +1009,7 @@ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointin block.set_activation_checkpointing(strategy) -class Olmo(nn.Module): +class OLMo(nn.Module): def __init__(self, config: ModelConfig, init_params: bool = True): super().__init__() self.config = config @@ -1017,14 +1017,14 @@ def __init__(self, config: ModelConfig, init_params: bool = True): # Validate config. if self.config.alibi and self.config.flash_attention: - raise OlmoConfigurationError("ALiBi is currently not supported with FlashAttention") + raise OLMoConfigurationError("ALiBi is currently not supported with FlashAttention") if self.config.alibi and self.config.rope: - raise OlmoConfigurationError("ALiBi and RoPE are mutually exclusive") + raise OLMoConfigurationError("ALiBi and RoPE are mutually exclusive") if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size: if self.config.embedding_size < self.config.vocab_size: - raise OlmoConfigurationError("embedding size should be at least as big as vocab size") + raise OLMoConfigurationError("embedding size should be at least as big as vocab size") elif self.config.embedding_size % 128 != 0: import warnings @@ -1039,7 +1039,7 @@ def __init__(self, config: ModelConfig, init_params: bool = True): 0 < self.config.block_group_size <= self.config.n_layers and self.config.n_layers % self.config.block_group_size == 0 ): - raise OlmoConfigurationError("n layers must be divisible by block group size") + raise OLMoConfigurationError("n layers must be divisible by block group size") torch.backends.cuda.enable_flash_sdp(self.config.flash_attention) 
torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it @@ -1054,10 +1054,10 @@ def __init__(self, config: ModelConfig, init_params: bool = True): ) ) - blocks = [OlmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)] + blocks = [OLMoBlock.build(i, config, self.__cache) for i in range(config.n_layers)] if self.config.block_group_size > 1: block_groups = [ - OlmoBlockGroup(config, i, blocks[i : i + config.block_group_size]) + OLMoBlockGroup(config, i, blocks[i : i + config.block_group_size]) for i in range(0, config.n_layers, config.block_group_size) ] self.transformer.update({"block_groups": nn.ModuleList(block_groups)}) @@ -1156,7 +1156,7 @@ def forward( use_cache: bool = False, last_logits_only: bool = False, output_hidden_states: Optional[bool] = None, - ) -> OlmoOutput: + ) -> OLMoOutput: """ :param input_ids: A tensor of shape `(batch_size, seq_len)`. :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input @@ -1334,7 +1334,7 @@ def forward( if self.config.scale_logits: logits.mul_(1 / math.sqrt(self.config.d_model)) - return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] + return OLMoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None): if wrap_strategy is None: @@ -1354,7 +1354,7 @@ def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None) def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, OlmoBlock) + wrap = isinstance(module, OLMoBlock) if recurse: return True else: @@ -1365,7 +1365,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): def 
fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, (OlmoBlock,)) or module in size_based_module_to_wrap + wrap = isinstance(module, (OLMoBlock,)) or module in size_based_module_to_wrap if recurse: return True else: @@ -1374,13 +1374,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): return fsdp_wrap_fn elif wrap_strategy == FSDPWrapStrategy.by_block_group: if self.config.block_group_size <= 1: - raise OlmoConfigurationError( + raise OLMoConfigurationError( "'by_block_group' FSDP wrapping strategy requires block group size greater than 1" ) def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, OlmoBlockGroup) + wrap = isinstance(module, OLMoBlockGroup) if recurse: return True else: @@ -1389,13 +1389,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): return fsdp_wrap_fn elif wrap_strategy == FSDPWrapStrategy.by_block_group_and_size: if self.config.block_group_size <= 1: - raise OlmoConfigurationError( + raise OLMoConfigurationError( "'by_block_group_and_size' FSDP wrapping strategy requires block group size greater than 1" ) def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, (OlmoBlockGroup,)) or module in size_based_module_to_wrap + wrap = isinstance(module, (OLMoBlockGroup,)) or module in size_based_module_to_wrap if recurse: return True else: @@ -1421,7 +1421,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, OlmoBlock) and module.layer_id % c == 0 + wrap = isinstance(module, OLMoBlock) and module.layer_id % c == 0 if recurse: return True else: @@ -1472,7 +1472,7 @@ def generate( min_steps: Optional[int] = None, final_sequence_scorer: 
Optional[FinalSequenceScorer] = None, constraints: Optional[List[Constraint]] = None, - ) -> OlmoGenerateOutput: + ) -> OLMoGenerateOutput: """ Generate token IDs using beam search. @@ -1582,7 +1582,7 @@ def step( with torch.no_grad(): token_ids, scores = beam_search.search(initial_preds, state, step) - return OlmoGenerateOutput( + return OLMoGenerateOutput( token_ids=token_ids, # type: ignore[arg-type] scores=scores, # type: ignore[arg-type] ) @@ -1590,7 +1590,7 @@ def step( @classmethod def from_checkpoint( cls, checkpoint_dir: PathOrStr, device: str = "cpu", checkpoint_type: Optional[CheckpointType] = None - ) -> Olmo: + ) -> OLMo: """ Load an OLMo model from a checkpoint. """ @@ -1613,7 +1613,7 @@ def from_checkpoint( if checkpoint_type == CheckpointType.unsharded: # Initialize model (always on CPU to start with so we don't run out of GPU memory). model_config.init_device = "cpu" - model = Olmo(model_config) + model = OLMo(model_config) # Load state dict directly to target device. state_dict_path = resource_path(checkpoint_dir, "model.pt") @@ -1626,7 +1626,7 @@ def from_checkpoint( # Initialize model on target device. In this case the state dict is loaded in-place # so it's not necessary to start on CPU if the target device is a GPU. model_config.init_device = device - model = Olmo(model_config) + model = OLMo(model_config) # Load state dict in place. 
load_model_state(checkpoint_dir, model) diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py index a833d3c21..3ed064097 100644 --- a/olmo/tokenizer.py +++ b/olmo/tokenizer.py @@ -8,7 +8,7 @@ from .aliases import PathOrStr from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection -from .exceptions import OlmoConfigurationError +from .exceptions import OLMoConfigurationError __all__ = ["Tokenizer"] @@ -68,7 +68,7 @@ def from_train_config(cls, config: TrainConfig) -> Tokenizer: pad_token_id=config.model.pad_token_id, ) if config.model.vocab_size != tokenizer.vocab_size: - raise OlmoConfigurationError("vocab size mismatch between config and tokenizer") + raise OLMoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer @classmethod @@ -117,7 +117,7 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer: pad_token_id=model_config.pad_token_id, ) if model_config.vocab_size != tokenizer.vocab_size: - raise OlmoConfigurationError("vocab size mismatch between config and tokenizer") + raise OLMoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer def add_special_tokens(self, input_ids: List[int]) -> List[int]: diff --git a/olmo/train.py b/olmo/train.py index 79132f0fc..43d4ee5fc 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -33,8 +33,8 @@ ) from .data import IterableDataset from .eval import Evaluator -from .exceptions import OlmoConfigurationError -from .model import Olmo +from .exceptions import OLMoConfigurationError +from .model import OLMo from .optim import Optimizer, Scheduler from .torch_util import ( barrier, @@ -96,7 +96,7 @@ def check(self) -> Dict[str, float]: @dataclass class Trainer: cfg: TrainConfig - model: Olmo + model: OLMo fsdp_model: FSDP optim: Optimizer scheduler: Scheduler @@ -351,7 +351,7 @@ def _save_checkpoint( upload_to=remote_checkpoint_dir, ) except FileExistsError: - raise OlmoConfigurationError( + raise OLMoConfigurationError( f"Checkpoint 
for step {self.global_step} already exists, use --save-overwrite to overwrite it" ) diff --git a/olmo/util.py b/olmo/util.py index 71ee67e60..3473ff43f 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -25,11 +25,11 @@ from .aliases import PathOrStr from .exceptions import ( - OlmoCliError, - OlmoEnvironmentError, - OlmoError, - OlmoNetworkError, - OlmoThreadError, + OLMoCliError, + OLMoEnvironmentError, + OLMoError, + OLMoNetworkError, + OLMoThreadError, ) from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed @@ -148,9 +148,9 @@ def excepthook(exctype, value, traceback): """ if issubclass(exctype, KeyboardInterrupt): sys.__excepthook__(exctype, value, traceback) - elif issubclass(exctype, OlmoCliError): + elif issubclass(exctype, OLMoCliError): rich.get_console().print(f"[yellow]{value}[/]", highlight=False) - elif issubclass(exctype, OlmoError): + elif issubclass(exctype, OLMoError): rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False) else: log.critical("Uncaught %s: %s", exctype.__name__, value, exc_info=(exctype, value, traceback)) @@ -448,7 +448,7 @@ def _get_s3_profile_name(scheme: str) -> Optional[str]: if scheme == "r2": profile_name = os.environ.get("R2_PROFILE") if profile_name is None: - raise OlmoEnvironmentError( + raise OLMoEnvironmentError( "R2 profile name is not set. Did you forget to set the 'R2_PROFILE' env var?" ) @@ -463,7 +463,7 @@ def _get_s3_endpoint_url(scheme: str) -> Optional[str]: if scheme == "r2": r2_endpoint_url = os.environ.get("R2_ENDPOINT_URL") if r2_endpoint_url is None: - raise OlmoEnvironmentError( + raise OLMoEnvironmentError( "R2 endpoint url is not set. Did you forget to set the 'R2_ENDPOINT_URL' env var?" 
) @@ -509,12 +509,12 @@ def _s3_upload( _wait_before_retry(attempt) if err is not None: - raise OlmoNetworkError("Failed to check object existence during s3 upload") from err + raise OLMoNetworkError("Failed to check object existence during s3 upload") from err try: _get_s3_client(scheme).upload_file(source, bucket_name, key) except boto_exceptions.ClientError as e: - raise OlmoNetworkError("Failed to upload to s3") from e + raise OLMoNetworkError("Failed to upload to s3") from e def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3) -> int: @@ -531,7 +531,7 @@ def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3 log.warning("%s failed attempt %d with retriable error: %s", _s3_file_size.__name__, attempt, err) _wait_before_retry(attempt) - raise OlmoNetworkError("Failed to get s3 file size") from err + raise OLMoNetworkError("Failed to get s3 file size") from err def _s3_get_bytes_range( @@ -570,7 +570,7 @@ def _s3_get_bytes_range( # This can cause an irrelevant exception (e.g. KeyError: 'error'), resulting # in us losing the true exception info. To avoid this, we change the exception # to a type that has a single-parameter constructor. 
"""Average a set of unsharded OLMo model checkpoints into a single state dict.

The previous version of this file contained three copy-pasted variants of the
same script; only the last one ever took effect. This keeps that effective
behavior (average the checkpoints listed in CKPTS into OUTDIR/model.pt) but
loads each checkpoint exactly once instead of once per parameter key.
"""
import os

import torch

# Unsharded checkpoints to average (paths to model.pt files).
CKPTS = [
    "step551000-unsharded/model.pt",
    "step552000-unsharded/model.pt",
    "step553000-unsharded/model.pt",
    "step554000-unsharded/model.pt",
    "step555000-unsharded/model.pt",
    "step556000-unsharded/model.pt",
    "step557000-unsharded/model.pt",
]

# Directory that receives the averaged model.pt.
OUTDIR = "last7_avg"


def average_checkpoints(ckpt_paths):
    """Return the element-wise mean of the state dicts at *ckpt_paths*.

    Each checkpoint is loaded exactly once (the old code re-loaded every
    checkpoint for every key). Accumulation happens in float64 for numerical
    stability, and the result is cast back to each tensor's original dtype.

    Args:
        ckpt_paths: paths to ``model.pt`` files holding plain state dicts
            with identical key sets.

    Returns:
        A dict mapping each key to the averaged tensor.

    Raises:
        ValueError: if *ckpt_paths* is empty.
    """
    if not ckpt_paths:
        raise ValueError("ckpt_paths must not be empty")
    sums = {}
    dtypes = {}
    for path in ckpt_paths:
        state_dict = torch.load(path, map_location="cpu")
        for key, value in state_dict.items():
            if key not in sums:
                sums[key] = value.to(torch.float64).clone()
                dtypes[key] = value.dtype
            else:
                sums[key] += value.to(torch.float64)
    count = len(ckpt_paths)
    return {key: (total / count).to(dtypes[key]) for key, total in sums.items()}


if __name__ == "__main__":
    averaged = average_checkpoints(CKPTS)
    os.makedirs(OUTDIR, exist_ok=True)
    torch.save(averaged, os.path.join(OUTDIR, "model.pt"))
#!/usr/bin/env python
"""Checkpoint Averaging Script

This script averages all model weights for checkpoints in specified path that match
the specified filter wildcard. All checkpoints must be from the exact same model.

For any hope of decent results, the checkpoints should be from the same or child
(via resumes) training session. This can be viewed as similar to maintaining running
EMA (exponential moving average) of the model weights or performing SWA (stochastic
weight averaging), but post-training.

Hacked together by Ross Wightman (https://github.com/rwightman)

Example:
    python average_ckpts_advanced.py --input ./ --filter "step*/model.pt" --output advaveraged.pt --no-sort
"""
import argparse
import glob
import hashlib
import os

import torch

parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager')
parser.add_argument('--input', default='', type=str, metavar='PATH',
                    help='path to base input folder containing checkpoints')
parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD',
                    help='checkpoint filter (path wildcard)')
parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH',
                    help='output filename')
parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true',
                    help='Force not using ema version of weights (if present)')
parser.add_argument('--descending', dest='descending', action='store_true',
                    help='Set if eval metric is descending (like loss)')
parser.add_argument('--no-sort', dest='no_sort', action='store_true',
                    help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant')
parser.add_argument('-n', type=int, default=10, metavar='N',
                    help='Number of checkpoints to average')


def load_state_dict(checkpoint_path, use_ema=True):
    """Return the model state dict stored at *checkpoint_path*.

    Replaces the previous dependency on ``timm.models.helpers.load_state_dict``
    so the script only needs torch. Prefers the EMA weights when present and
    *use_ema* is true, then a nested ``state_dict`` entry; otherwise the file
    is assumed to be a plain state dict (the OLMo ``model.pt`` case).
    """
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    if isinstance(checkpoint, dict):
        if use_ema and 'state_dict_ema' in checkpoint:
            return checkpoint['state_dict_ema']
        if 'state_dict' in checkpoint:
            return checkpoint['state_dict']
    return checkpoint


def checkpoint_metric(checkpoint_path):
    """Return the eval metric stored in a checkpoint, or None if unavailable.

    Bug fix: the old version returned ``{}`` for a missing/empty path, which
    passed the caller's ``metric is not None`` filter and polluted the sort.
    """
    if not checkpoint_path or not os.path.isfile(checkpoint_path):
        return None
    print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    metric = None
    if isinstance(checkpoint, dict) and 'metric' in checkpoint:
        metric = checkpoint['metric']
    return metric


def main():
    args = parser.parse_args()
    # by default use the EMA weights (if present)
    args.use_ema = not args.no_use_ema
    # by default sort by checkpoint metric (if present) and avg top n checkpoints
    args.sort = not args.no_sort

    if os.path.exists(args.output):
        print("Error: Output filename ({}) already exists.".format(args.output))
        exit(1)

    # Build the glob pattern, joining input dir and filter with a separator
    # unless one side already supplies it.
    pattern = args.input
    if not args.input.endswith(os.path.sep) and not args.filter.startswith(os.path.sep):
        pattern += os.path.sep
    pattern += args.filter
    checkpoints = glob.glob(pattern, recursive=True)
    if not checkpoints:
        print("Error: No checkpoints to average.")
        exit(1)

    if args.sort:
        checkpoint_metrics = []
        for c in checkpoints:
            metric = checkpoint_metric(c)
            if metric is not None:
                checkpoint_metrics.append((metric, c))
        checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending))
        checkpoint_metrics = checkpoint_metrics[:args.n]
        print("Selected checkpoints:")
        for m, c in checkpoint_metrics:
            print(m, c)
        avg_checkpoints = [c for m, c in checkpoint_metrics]
    else:
        avg_checkpoints = checkpoints
        print("Selected checkpoints:")
        for c in checkpoints:
            print(c)

    # Running sum + per-key count, accumulated in float32 (matches original).
    avg_state_dict = {}
    avg_counts = {}
    for c in avg_checkpoints:
        new_state_dict = load_state_dict(c, args.use_ema)
        if not new_state_dict:
            # Bug fix: this message used to reference non-existent ``args.checkpoint``.
            print("Error: Checkpoint ({}) doesn't exist".format(c))
            continue

        for k, v in new_state_dict.items():
            if k not in avg_state_dict:
                avg_state_dict[k] = v.clone().to(dtype=torch.float32)
                avg_counts[k] = 1
            else:
                avg_state_dict[k] += v.to(dtype=torch.float32)
                avg_counts[k] += 1

    for k, v in avg_state_dict.items():
        v.div_(avg_counts[k])

    # float32 overflow seems unlikely based on weights seen to date, but who knows
    float32_info = torch.finfo(torch.float32)
    final_state_dict = {}
    for k, v in avg_state_dict.items():
        v = v.clamp(float32_info.min, float32_info.max)
        final_state_dict[k] = v.to(dtype=torch.float32)

    try:
        torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False)
    except TypeError:
        # Some torch versions reject the legacy-serialization kwarg; fall back.
        torch.save(final_state_dict, args.output)

    with open(args.output, 'rb') as f:
        sha_hash = hashlib.sha256(f.read()).hexdigest()
    print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash))


if __name__ == '__main__':
    main()
#!/usr/bin/env python
"""Checkpoint Averaging Script (step-by-step / key-subset variant)

This script averages all model weights for checkpoints in specified path that match
the specified filter wildcard. All checkpoints must be from the exact same model.

For any hope of decent results, the checkpoints should be from the same or child
(via resumes) training session. This can be viewed as similar to maintaining running
EMA (exponential moving average) of the model weights or performing SWA (stochastic
weight averaging), but post-training.

Unlike average_ckpts_advanced.py, this variant only averages the first
SUBSET_FRACTION of each checkpoint's keys, to bound peak memory.

Hacked together by Ross Wightman (https://github.com/rwightman)

Example:
    python avgckpts_stepbystep.py --input ./ --filter "step*/model.pt" --output advaveraged25.pt --no-sort
"""
import argparse
import glob
import hashlib
import os

import torch

# Fraction of each checkpoint's keys to average. The old code's comment said
# "50%" while the code took 25%; the code's behavior (25%) is kept.
SUBSET_FRACTION = 0.25

parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager')
parser.add_argument('--input', default='', type=str, metavar='PATH',
                    help='path to base input folder containing checkpoints')
parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD',
                    help='checkpoint filter (path wildcard)')
parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH',
                    help='output filename')
parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true',
                    help='Force not using ema version of weights (if present)')
parser.add_argument('--descending', dest='descending', action='store_true',
                    help='Set if eval metric is descending (like loss)')
parser.add_argument('--no-sort', dest='no_sort', action='store_true',
                    help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant')
parser.add_argument('-n', type=int, default=10, metavar='N',
                    help='Number of checkpoints to average')


def checkpoint_metric(checkpoint_path):
    """Return the eval metric stored in a checkpoint, or None if unavailable.

    Bug fix: the old version returned ``{}`` for a missing/empty path, which
    passed the caller's ``metric is not None`` filter and polluted the sort.
    """
    if not checkpoint_path or not os.path.isfile(checkpoint_path):
        return None
    print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    metric = None
    if isinstance(checkpoint, dict) and 'metric' in checkpoint:
        metric = checkpoint['metric']
    return metric


def main():
    args = parser.parse_args()
    # by default use the EMA weights (if present)
    args.use_ema = not args.no_use_ema
    # by default sort by checkpoint metric (if present) and avg top n checkpoints
    args.sort = not args.no_sort

    if os.path.exists(args.output):
        print("Error: Output filename ({}) already exists.".format(args.output))
        exit(1)

    # Build the glob pattern, joining input dir and filter with a separator
    # unless one side already supplies it.
    pattern = args.input
    if not args.input.endswith(os.path.sep) and not args.filter.startswith(os.path.sep):
        pattern += os.path.sep
    pattern += args.filter
    checkpoints = glob.glob(pattern, recursive=True)
    if not checkpoints:
        print("Error: No checkpoints to average.")
        exit(1)

    if args.sort:
        checkpoint_metrics = []
        for c in checkpoints:
            metric = checkpoint_metric(c)
            if metric is not None:
                checkpoint_metrics.append((metric, c))
        checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending))
        checkpoint_metrics = checkpoint_metrics[:args.n]
        print("Selected checkpoints:")
        for m, c in checkpoint_metrics:
            print(m, c)
        avg_checkpoints = [c for m, c in checkpoint_metrics]
    else:
        avg_checkpoints = checkpoints
        print("Selected checkpoints:")
        for c in checkpoints:
            print(c)

    # Running sum + per-key count, accumulated in float64 for stability.
    avg_state_dict = {}
    avg_counts = {}
    for c in avg_checkpoints:
        new_state_dict = torch.load(c)
        keys = list(new_state_dict.keys())
        # Subselect the first SUBSET_FRACTION (25%) of keys.
        keys = keys[:int(len(keys) * SUBSET_FRACTION)]
        new_state_dict = {k: new_state_dict[k] for k in keys}
        if not new_state_dict:
            # Bug fix: this message used to reference non-existent ``args.checkpoint``.
            print("Error: Checkpoint ({}) doesn't exist".format(c))
            continue

        for k in keys:
            if k not in avg_state_dict:
                avg_state_dict[k] = new_state_dict[k].clone().to(dtype=torch.float64)
                avg_counts[k] = 1
            else:
                avg_state_dict[k] += new_state_dict[k].to(dtype=torch.float64)
                avg_counts[k] += 1
        # Release the source tensors before loading the next checkpoint.
        del new_state_dict

    for k, v in avg_state_dict.items():
        v.div_(avg_counts[k])

    # float32 overflow seems unlikely based on weights seen to date, but who knows
    float32_info = torch.finfo(torch.float32)
    final_state_dict = {}
    for k, v in avg_state_dict.items():
        v = v.clamp(float32_info.min, float32_info.max)
        final_state_dict[k] = v.to(dtype=torch.float32)

    try:
        torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False)
    except TypeError:
        # Some torch versions reject the legacy-serialization kwarg; fall back.
        torch.save(final_state_dict, args.output)

    with open(args.output, 'rb') as f:
        sha_hash = hashlib.sha256(f.read()).hexdigest()
    print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash))


if __name__ == '__main__':
    main()
#!/bin/bash
# SLURM launcher for the v1_5-mix-medium-mitch-ish (no-checkpoint) training run
# on LUMI. Runs scripts/train.py inside a Singularity container via srun.
#SBATCH --job-name=v1-mix-medium
#SBATCH --account=project_462000229
#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
#SBATCH --nodes=32              # Total number of nodes
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8       # Allocate one gpu per MPI rank
#SBATCH --cpus-per-task=6
#SBATCH --time=1:00:00
#SBATCH --mem=0                 # All memory on the node
#SBATCH --partition=standard-g

module load LUMI/22.08 partition/G

# Container image name; resolved under $PROJECT_DIR/containers below.
export OLMO_CONTAINER=llm-lumi_latest.sif

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export MPICH_GPU_SUPPORT_ENABLED=1
# NCCL over the Slingshot high-speed network.
export NCCL_SOCKET_IFNAME=hsn
export NCCL_NET_GDR_LEVEL=3
# Per-job MIOpen kernel cache in /tmp to avoid shared-FS contention.
export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
# CXI/libfabric fork-safety knobs for data-loader worker processes.
export CXI_FORK_SAFE=1
export CXI_FORK_SAFE_HP=1
export FI_CXI_DISABLE_CQ_HUGETLB=1

# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
export FI_CXI_DEFAULT_CQ_SIZE=131072

#export NCCL_DEBUG=INFO
export PYTHONPATH=.:${PYTHONPATH}
export ROCM_PATH=/opt/rocm
# Host libfabric must be visible inside the container.
export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64

# Try playing with max_split_size_mb if you run into OOM errors.
#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128

export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data

# Launch one task per GPU; run_with_environment.sh maps SLURM vars to
# torch.distributed ones, then the container runs the training script.
# Extra CLI args ("$@") are forwarded to train.py.
srun \
  --cpus-per-task=$SLURM_CPUS_PER_TASK \
  --distribution=block:block \
  --kill-on-bad-exit \
  scripts/run_with_environment.sh \
  singularity exec \
    -B"$PROJECT_DIR:$PROJECT_DIR" \
    -B"$FLASH_DIR:$FLASH_DIR" \
    -B"$SCRATCH_DIR:$SCRATCH_DIR" \
    -B /opt/cray:/opt/cray \
    -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
    -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
    $PROJECT_DIR/containers/$OLMO_CONTAINER \
    python scripts/train.py configs/v1_5-mix-medium-mitch-ish_nockpt.yaml --run_name=${SLURM_JOB_ID} ${@}
import CheckpointType, TrainConfig from olmo.data import build_train_dataloader from olmo.eval import build_evaluators -from olmo.exceptions import OlmoCliError, OlmoConfigurationError -from olmo.model import Olmo +from olmo.exceptions import OLMoCliError, OLMoConfigurationError +from olmo.model import OLMo from olmo.optim import BoltOnWarmupScheduler, build_optimizer, build_scheduler from olmo.torch_util import ( barrier, @@ -36,7 +36,7 @@ def main(cfg: TrainConfig) -> None: # Ensure run name set. if cfg.run_name is None: - raise OlmoConfigurationError("--run_name is required") + raise OLMoConfigurationError("--run_name is required") log_extra_field("run_name", cfg.run_name) # Sanity check @@ -76,7 +76,7 @@ def main(cfg: TrainConfig) -> None: # Save config. save_path = Path(cfg.save_folder) / "config.yaml" if save_path.is_file() and not cfg.save_overwrite: - raise OlmoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") + raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") else: log.info(f"Saving config to {save_path}") save_path.parent.mkdir(exist_ok=True, parents=True) @@ -113,7 +113,7 @@ def main(cfg: TrainConfig) -> None: # Initialize the model. 
log.info("Building model...") - olmo_model = Olmo(cfg.model) + olmo_model = OLMo(cfg.model) log.info(f"Total number of parameters: {olmo_model.num_params():,d}") log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}") log.info(f"Peak GPU Memory (MB) before FSDP: {int(peak_gpu_memory() or 0)}") @@ -158,7 +158,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: if cfg.save_data_indices: indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" if indices_file_path.exists() and not cfg.save_overwrite: - raise OlmoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") + raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") indices_file_path.parent.mkdir(exist_ok=True, parents=True) indices_file = gzip.open(indices_file_path, "wt") @@ -248,7 +248,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: try: yaml_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list]) main(cfg) diff --git a/test_fixtures/test-olmo-model/config.json b/test_fixtures/test-olmo-model/config.json index 71e7b981e..352a4c976 100644 --- a/test_fixtures/test-olmo-model/config.json +++ b/test_fixtures/test-olmo-model/config.json @@ -3,7 +3,7 @@ "alibi": false, "alibi_bias_max": 8.0, "architectures": [ - "OlmoModelForCausalLM" + "OLMoModelForCausalLM" ], "attention_dropout": 0.1, "attention_layer_norm": false, diff --git a/tests/hf_olmo/hf_olmo_test.py b/tests/hf_olmo/hf_olmo_test.py index 0b323c4e8..6f70c0090 100644 --- a/tests/hf_olmo/hf_olmo_test.py +++ b/tests/hf_olmo/hf_olmo_test.py @@ -3,7 +3,7 @@ from olmo import BlockType, Tokenizer, TrainConfig from olmo.data import DataCollator -from olmo.model import Olmo 
+from olmo.model import OLMo from olmo.torch_util import seed_all @@ -188,7 +188,7 @@ def test_forward( use_amp = dtype in {torch.float16, torch.bfloat16} seed_all(1234) - model = Olmo(train_config.model).eval() + model = OLMo(train_config.model).eval() hf_config = OLMoConfig(**model.config.asdict()) diff --git a/tests/hf_olmo/modeling_olmo_test.py b/tests/hf_olmo/modeling_olmo_test.py index fda1bd715..e4bb02f54 100644 --- a/tests/hf_olmo/modeling_olmo_test.py +++ b/tests/hf_olmo/modeling_olmo_test.py @@ -3,7 +3,7 @@ import pytest import torch -from olmo.model import Olmo +from olmo.model import OLMo def test_olmo_model(model_path: str): @@ -11,7 +11,7 @@ def test_olmo_model(model_path: str): from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast # noqa: F401 - model = Olmo.from_checkpoint(model_path) + model = OLMo.from_checkpoint(model_path) hf_model = AutoModelForCausalLM.from_pretrained(model_path) tokenizer = AutoTokenizer.from_pretrained(model_path) diff --git a/tests/model_test.py b/tests/model_test.py index 18dd5401f..ce1100037 100644 --- a/tests/model_test.py +++ b/tests/model_test.py @@ -3,7 +3,7 @@ import torch.nn.functional as F from torch.nn import CrossEntropyLoss -from olmo import BlockType, LayerNorm, Olmo, Tokenizer, TrainConfig +from olmo import BlockType, LayerNorm, OLMo, Tokenizer, TrainConfig from olmo.config import ModelConfig, PaddingDirection from olmo.data import DataCollator from olmo.model import AMDLayerNorm @@ -174,7 +174,7 @@ def test_forward( use_amp = dtype in {torch.float16, torch.bfloat16} - model = Olmo(train_config.model).eval() + model = OLMo(train_config.model).eval() input1 = tokenizer.encode("My name is OLMo!") input2 = tokenizer.encode("I'm a delightful large open language model :)") @@ -294,7 +294,7 @@ def test_backward( else: train_config.model.init_device = "cpu" - model = Olmo(train_config.model).train() + model = OLMo(train_config.model).train() with torch.autocast( device_type="cuda" if cuda else "cpu", 
enabled=use_amp, dtype=None if not use_amp else dtype @@ -365,7 +365,7 @@ def test_generate( train_config.model.init_device = "cpu" use_amp = dtype in {torch.float16, torch.bfloat16} - model = Olmo(train_config.model).eval() + model = OLMo(train_config.model).eval() input1 = tokenizer.encode("My name is OLMo! ", add_special_tokens=False) input2 = tokenizer.encode("I'm a delightful large open language model :) ", add_special_tokens=False) @@ -435,8 +435,8 @@ def test_layer_norm(train_config: TrainConfig, elementwise_affine: bool, include def test_block_groups(): - model_with_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval() - model_without_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval() + model_with_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval() + model_without_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval() # We should be able to load the state dict from one model into the other, and vice-versa. 
state_dict_to_load, og_keys_to_new_keys = model_with_block_groups._make_state_dict_compatible( From 9798c088f820885f1cb3ce6a03ecc4bb8b72ec4c Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:22:36 +0100 Subject: [PATCH 2/7] Rm confs --- configs/c4-medium_ckptfine.yaml | 184 ------------------ configs/c4-medium_ckptoneintwo.yaml | 184 ------------------ configs/c4-medium_ckptwhole.yaml | 184 ------------------ configs/c4-medium_nockpt.yaml | 182 ----------------- configs/c4-small_nockpt.yaml | 183 ----------------- configs/olmo_nockpt.yml | 86 -------- configs/olmo_wholeckpt.yml | 88 --------- configs/v1_5-mix-medium-mitch-ish_nockpt.yaml | 98 ---------- 8 files changed, 1189 deletions(-) delete mode 100644 configs/c4-medium_ckptfine.yaml delete mode 100644 configs/c4-medium_ckptoneintwo.yaml delete mode 100644 configs/c4-medium_ckptwhole.yaml delete mode 100644 configs/c4-medium_nockpt.yaml delete mode 100644 configs/c4-small_nockpt.yaml delete mode 100644 configs/olmo_nockpt.yml delete mode 100644 configs/olmo_wholeckpt.yml delete mode 100644 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml diff --git a/configs/c4-medium_ckptfine.yaml b/configs/c4-medium_ckptfine.yaml deleted file mode 100644 index f5e77a958..000000000 --- a/configs/c4-medium_ckptfine.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: fine_grained - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes 
instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - 
persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_ckptoneintwo.yaml b/configs/c4-medium_ckptoneintwo.yaml deleted file mode 100644 index 4ceb2901f..000000000 --- a/configs/c4-medium_ckptoneintwo.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: one_in_two - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - 
weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we 
still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_ckptwhole.yaml b/configs/c4-medium_ckptwhole.yaml deleted file mode 100644 index 2e8084d32..000000000 --- a/configs/c4-medium_ckptwhole.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: whole_layer - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). 
- - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_nockpt.yaml b/configs/c4-medium_nockpt.yaml deleted file mode 100644 index 0d862117b..000000000 --- a/configs/c4-medium_nockpt.yaml +++ /dev/null @@ -1,182 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). 
- - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-small_nockpt.yaml b/configs/c4-small_nockpt.yaml deleted file mode 100644 index bdc2e04a9..000000000 --- a/configs/c4-small_nockpt.yaml +++ /dev/null @@ -1,183 +0,0 @@ -run_name: c4-small-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 2.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). 
- - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/olmo_nockpt.yml b/configs/olmo_nockpt.yml deleted file mode 100644 index f09396738..000000000 --- a/configs/olmo_nockpt.yml +++ /dev/null @@ -1,86 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/olmo_wholeckpt.yml b/configs/olmo_wholeckpt.yml deleted file mode 100644 index efc53dbf2..000000000 --- a/configs/olmo_wholeckpt.yml +++ /dev/null @@ -1,88 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -activation_checkpointing: whole_layer - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on 
AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml deleted file mode 100644 index 0c0974f0e..000000000 --- a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml +++ /dev/null @@ -1,98 +0,0 @@ -run_name: v1_5-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: false # not available on AMD - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: 
sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 10.0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 -no_pre_train_checkpoint: true - -load_path: null - -max_duration: 50 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: null - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 From 9be1519cd6ccc781b5ec8c034d66a48f8d07e5ba Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:25:19 +0100 Subject: [PATCH 3/7] Rm scripts --- olmo/ckptavg.py | 17 ---- scripts/average_ckpts.py | 71 ----------------- scripts/average_ckpts_advanced.py | 123 
----------------------------- scripts/avgckpts_stepbystep.py | 127 ------------------------------ 4 files changed, 338 deletions(-) delete mode 100644 olmo/ckptavg.py delete mode 100644 scripts/average_ckpts.py delete mode 100644 scripts/average_ckpts_advanced.py delete mode 100644 scripts/avgckpts_stepbystep.py diff --git a/olmo/ckptavg.py b/olmo/ckptavg.py deleted file mode 100644 index 8531f62da..000000000 --- a/olmo/ckptavg.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch - -STATEDICTS = [ - "advaveraged25.pt", - "advaveraged2550.pt", - "advaveraged5075.pt", - "advaveraged75.pt", -] - -sd = torch.load(STATEDICTS[0]) -for state_dict in STATEDICTS[1:]: - sd2 = torch.load(state_dict) - for k,v in sd2.items(): - assert k not in sd - sd[k] = v - -torch.save(sd, "advaveraged.pt") diff --git a/scripts/average_ckpts.py b/scripts/average_ckpts.py deleted file mode 100644 index 2be4533fe..000000000 --- a/scripts/average_ckpts.py +++ /dev/null @@ -1,71 +0,0 @@ -import os - -import torch - -CKPTS = [ - "step456000-unsharded-lumi/model.pt", - "step556000-unsharded-mosaic/model.pt" -] - -OUTDIR = "step456000-unsharded-lumi-mosaic" - -CKPTS = [ - "step456000-unsharded-lumi/model.pt", - "step432410-unsharded-mosaic/model.pt" -] - - - - - - -import os - -import torch - -CKPTS = [ -"step551000-unsharded/model.pt", -"step552000-unsharded/model.pt", -"step553000-unsharded/model.pt", -"step554000-unsharded/model.pt", -"step555000-unsharded/model.pt", -"step556000-unsharded/model.pt", -"step557000-unsharded/model.pt", -] - -OUTDIR = "last7_avg" - -first_sd = torch.load(CKPTS[0]) -for k in first_sd: - first_sd[k] = torch.stack([sd[k] for sd in [torch.load(ckpt) for ckpt in CKPTS]], dim=0).mean(dim=0) - -os.makedirs(OUTDIR, exist_ok=True) -torch.save(first_sd, os.path.join(OUTDIR, "model.pt")) - - - - - -import os - -import torch - -CKPTS = [ -"step551000-unsharded/model.pt", -"step552000-unsharded/model.pt", -"step553000-unsharded/model.pt", -"step554000-unsharded/model.pt", 
-"step555000-unsharded/model.pt", -"step556000-unsharded/model.pt", -"step557000-unsharded/model.pt", -] - -OUTDIR = "last7_avg" - -keys = list(torch.load(CKPTS[0]).keys()) -new_sd = {} -for k in keys: - new_sd[k] = torch.stack([torch.load(ckpt)[k] for ckpt in CKPTS], dim=0).mean(dim=0) - -os.makedirs(OUTDIR, exist_ok=True) -torch.save(new_sd, os.path.join(OUTDIR, "model.pt")) diff --git a/scripts/average_ckpts_advanced.py b/scripts/average_ckpts_advanced.py deleted file mode 100644 index ebde8b0a3..000000000 --- a/scripts/average_ckpts_advanced.py +++ /dev/null @@ -1,123 +0,0 @@ -#python avgadvanced.py --input ./ --filter "step*/model.pt" --output advaveraged.pt --no-sort -#!/usr/bin/env python -""" Checkpoint Averaging Script - -This script averages all model weights for checkpoints in specified path that match -the specified filter wildcard. All checkpoints must be from the exact same model. - -For any hope of decent results, the checkpoints should be from the same or child -(via resumes) training session. This can be viewed as similar to maintaining running -EMA (exponential moving average) of the model weights or performing SWA (stochastic -weight averaging), but post-training. 
- -Hacked together by Ross Wightman (https://github.com/rwightman) -""" -import torch -import argparse -import os -import glob -import hashlib -from timm.models.helpers import load_state_dict - -parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager') -parser.add_argument('--input', default='', type=str, metavar='PATH', - help='path to base input folder containing checkpoints') -parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD', - help='checkpoint filter (path wildcard)') -parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH', - help='output filename') -parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true', - help='Force not using ema version of weights (if present)') -parser.add_argument('--descending', dest='descending', action='store_true', - help='Set if eval metric is descending (like loss)') -parser.add_argument('--no-sort', dest='no_sort', action='store_true', - help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant') -parser.add_argument('-n', type=int, default=10, metavar='N', - help='Number of checkpoints to average') - - -def checkpoint_metric(checkpoint_path): - if not checkpoint_path or not os.path.isfile(checkpoint_path): - return {} - print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path)) - checkpoint = torch.load(checkpoint_path, map_location='cpu') - metric = None - if 'metric' in checkpoint: - metric = checkpoint['metric'] - return metric - - -def main(): - args = parser.parse_args() - # by default use the EMA weights (if present) - args.use_ema = not args.no_use_ema - # by default sort by checkpoint metric (if present) and avg top n checkpoints - args.sort = not args.no_sort - - if os.path.exists(args.output): - print("Error: Output filename ({}) already exists.".format(args.output)) - exit(1) - - pattern = args.input - if not args.input.endswith(os.path.sep) and not 
args.filter.startswith(os.path.sep): - pattern += os.path.sep - pattern += args.filter - checkpoints = glob.glob(pattern, recursive=True) - if not checkpoints: - print("Error: No checkpoints to average.") - exit(1) - - if args.sort: - checkpoint_metrics = [] - for c in checkpoints: - metric = checkpoint_metric(c) - if metric is not None: - checkpoint_metrics.append((metric, c)) - checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending)) - checkpoint_metrics = checkpoint_metrics[:args.n] - print("Selected checkpoints:") - [print(m, c) for m, c in checkpoint_metrics] - avg_checkpoints = [c for m, c in checkpoint_metrics] - else: - avg_checkpoints = checkpoints - print("Selected checkpoints:") - [print(c) for c in checkpoints] - - avg_state_dict = {} - avg_counts = {} - for c in avg_checkpoints: - new_state_dict = load_state_dict(c, args.use_ema) - if not new_state_dict: - print("Error: Checkpoint ({}) doesn't exist".format(args.checkpoint)) - continue - - for k, v in new_state_dict.items(): - if k not in avg_state_dict: - avg_state_dict[k] = v.clone().to(dtype=torch.float32) - avg_counts[k] = 1 - else: - avg_state_dict[k] += v.to(dtype=torch.float32) - avg_counts[k] += 1 - - for k, v in avg_state_dict.items(): - v.div_(avg_counts[k]) - - # float32 overflow seems unlikely based on weights seen to date, but who knows - float32_info = torch.finfo(torch.float32) - final_state_dict = {} - for k, v in avg_state_dict.items(): - v = v.clamp(float32_info.min, float32_info.max) - final_state_dict[k] = v.to(dtype=torch.float32) - - try: - torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False) - except: - torch.save(final_state_dict, args.output) - - with open(args.output, 'rb') as f: - sha_hash = hashlib.sha256(f.read()).hexdigest() - print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash)) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/scripts/avgckpts_stepbystep.py 
b/scripts/avgckpts_stepbystep.py deleted file mode 100644 index 4694fa0ba..000000000 --- a/scripts/avgckpts_stepbystep.py +++ /dev/null @@ -1,127 +0,0 @@ -#python avckpt25.py --input ./ --filter "step*/model.pt" --output advaveraged25.pt --no-sort -#!/usr/bin/env python -""" Checkpoint Averaging Script - -This script averages all model weights for checkpoints in specified path that match -the specified filter wildcard. All checkpoints must be from the exact same model. - -For any hope of decent results, the checkpoints should be from the same or child -(via resumes) training session. This can be viewed as similar to maintaining running -EMA (exponential moving average) of the model weights or performing SWA (stochastic -weight averaging), but post-training. - -Hacked together by Ross Wightman (https://github.com/rwightman) -""" -import torch -import argparse -import os -import glob -import hashlib - -parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager') -parser.add_argument('--input', default='', type=str, metavar='PATH', - help='path to base input folder containing checkpoints') -parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD', - help='checkpoint filter (path wildcard)') -parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH', - help='output filename') -parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true', - help='Force not using ema version of weights (if present)') -parser.add_argument('--descending', dest='descending', action='store_true', - help='Set if eval metric is descending (like loss)') -parser.add_argument('--no-sort', dest='no_sort', action='store_true', - help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant') -parser.add_argument('-n', type=int, default=10, metavar='N', - help='Number of checkpoints to average') - - -def checkpoint_metric(checkpoint_path): - if not checkpoint_path or not 
os.path.isfile(checkpoint_path): - return {} - print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path)) - checkpoint = torch.load(checkpoint_path, map_location='cpu') - metric = None - if 'metric' in checkpoint: - metric = checkpoint['metric'] - return metric - - -def main(): - args = parser.parse_args() - # by default use the EMA weights (if present) - args.use_ema = not args.no_use_ema - # by default sort by checkpoint metric (if present) and avg top n checkpoints - args.sort = not args.no_sort - - if os.path.exists(args.output): - print("Error: Output filename ({}) already exists.".format(args.output)) - exit(1) - - pattern = args.input - if not args.input.endswith(os.path.sep) and not args.filter.startswith(os.path.sep): - pattern += os.path.sep - pattern += args.filter - checkpoints = glob.glob(pattern, recursive=True) - if not checkpoints: - print("Error: No checkpoints to average.") - exit(1) - - if args.sort: - checkpoint_metrics = [] - for c in checkpoints: - metric = checkpoint_metric(c) - if metric is not None: - checkpoint_metrics.append((metric, c)) - checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending)) - checkpoint_metrics = checkpoint_metrics[:args.n] - print("Selected checkpoints:") - [print(m, c) for m, c in checkpoint_metrics] - avg_checkpoints = [c for m, c in checkpoint_metrics] - else: - avg_checkpoints = checkpoints - print("Selected checkpoints:") - [print(c) for c in checkpoints] - - avg_state_dict = {} - avg_counts = {} - for c in avg_checkpoints: - new_state_dict = torch.load(c) - keys = list(new_state_dict.keys()) - # Subselect 50% - keys = keys[:int(len(keys) * 0.25)] - new_state_dict = {k: new_state_dict[k] for k in keys} - if not new_state_dict: - print("Error: Checkpoint ({}) doesn't exist".format(args.checkpoint)) - continue - - for k in keys: - if k not in avg_state_dict: - avg_state_dict[k] = new_state_dict[k].clone().to(dtype=torch.float64) - avg_counts[k] = 1 - else: - 
avg_state_dict[k] += new_state_dict[k].to(dtype=torch.float64) - avg_counts[k] += 1 - del new_state_dict[k] - - for k, v in avg_state_dict.items(): - v.div_(avg_counts[k]) - - # float32 overflow seems unlikely based on weights seen to date, but who knows - float32_info = torch.finfo(torch.float32) - final_state_dict = {} - for k, v in avg_state_dict.items(): - v = v.clamp(float32_info.min, float32_info.max) - final_state_dict[k] = v.to(dtype=torch.float32) - - try: - torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False) - except: - torch.save(final_state_dict, args.output) - - with open(args.output, 'rb') as f: - sha_hash = hashlib.sha256(f.read()).hexdigest() - print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash)) - - -if __name__ == '__main__': - main() \ No newline at end of file From 98904685973ddbc58e52bc5262face42678caa04 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:26:45 +0100 Subject: [PATCH 4/7] Rm scripts --- .../lumi/v1_5-mix-medium-mitch-ish_nockpt.sh | 55 ------------------- 1 file changed, 55 deletions(-) delete mode 100644 scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh diff --git a/scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh b/scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh deleted file mode 100644 index 6b70307f5..000000000 --- a/scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=v1-mix-medium -#SBATCH --account=project_462000229 -#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log -#SBATCH --nodes=32 # Total number of nodes -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank -#SBATCH --cpus-per-task=6 -#SBATCH --time=1:00:00 -#SBATCH --mem=0 # All memory on the node -#SBATCH --partition=standard-g - -module load LUMI/22.08 partition/G - -export OLMO_CONTAINER=llm-lumi_latest.sif - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export 
MPICH_GPU_SUPPORT_ENABLED=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET_GDR_LEVEL=3 -export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID} -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export CXI_FORK_SAFE=1 -export CXI_FORK_SAFE_HP=1 -export FI_CXI_DISABLE_CQ_HUGETLB=1 - -# We need to set this to avoid "Cassini Event Queue overflow detected." errors. -export FI_CXI_DEFAULT_CQ_SIZE=131072 - -#export NCCL_DEBUG=INFO -export PYTHONPATH=.:${PYTHONPATH} -export ROCM_PATH=/opt/rocm -export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64 - -# Try playing with max_split_size_mb if you run into OOM errors. -#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128 - -export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix -export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints -export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data - -srun \ - --cpus-per-task=$SLURM_CPUS_PER_TASK \ - --distribution=block:block \ - --kill-on-bad-exit \ - scripts/run_with_environment.sh \ - singularity exec \ - -B"$PROJECT_DIR:$PROJECT_DIR" \ - -B"$FLASH_DIR:$FLASH_DIR" \ - -B"$SCRATCH_DIR:$SCRATCH_DIR" \ - -B /opt/cray:/opt/cray \ - -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \ - -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \ - $PROJECT_DIR/containers/$OLMO_CONTAINER \ - python scripts/train.py configs/v1_5-mix-medium-mitch-ish_nockpt.yaml --run_name=${SLURM_JOB_ID} ${@} From 1d97aa5f2b42bc595c953f38245b20cd4eeb2989 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:29:32 +0100 Subject: [PATCH 5/7] Add OLMo change --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fae7e99eb..9070f9579 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added the option to directly pass input embeddings to `OLMo` and `OLMoForCausalLM`. - Added support for Python 3.8. 
- Added code to throw an error if `output_attentions` is set to `True` in forward call to `OLMoForCausalLM`. This functionality hasn't been implemented yet. +- Rename `Olmo` to `OLMo` everywhere in the codebase ### Added - Added `output_hidden_states` argument and associated functionality to `OLMo` and `OLMoForCausalLM` to return model intermediate hidden states. From 9e9e9c0dc0780fc7855ab50c9a1de98057584c47 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Thu, 7 Mar 2024 17:24:58 -0800 Subject: [PATCH 6/7] Bump version --- olmo/version.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/olmo/version.py b/olmo/version.py index 3f9d92c5b..e75c8373e 100644 --- a/olmo/version.py +++ b/olmo/version.py @@ -1,8 +1,8 @@ _MAJOR = "0" -_MINOR = "2" +_MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "5" +_PATCH = "0" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From afb547368f167c3c6781c54283d9a395b653d3c5 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Thu, 7 Mar 2024 21:33:26 -0800 Subject: [PATCH 7/7] Fix changelog --- CHANGELOG.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be127a61e..b93d52fb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Rename `Olmo` to `OLMo` everywhere in the codebase + +### Removed + +- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore. 
+ + ## [v0.2.5](https://github.com/allenai/OLMo/releases/tag/v0.2.5) - 2024-03-06 ### Fixed @@ -15,7 +24,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added the option to directly pass input embeddings to `OLMo` and `OLMoForCausalLM`. - Added support for Python 3.8. - Added code to throw an error if `output_attentions` is set to `True` in forward call to `OLMoForCausalLM`. This functionality hasn't been implemented yet. -- Rename `Olmo` to `OLMo` everywhere in the codebase - Fixed running with data loading workers on LUMI ### Added @@ -29,10 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Changed legacy checkpoint unsharding to use processes and shared memory instead of threads -### Removed - -- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore. - ## [v0.2.4](https://github.com/allenai/OLMo/releases/tag/v0.2.4) - 2024-02-02