
Commit

update configs to use optimizer state
epwalsh committed Jan 18, 2024
1 parent cfca552 commit b69ea02
Showing 2 changed files with 12 additions and 44 deletions.
14 changes: 9 additions & 5 deletions configs/mcli/mitchish-instruct.yml
@@ -13,8 +13,8 @@ integrations:
     ssh_clone: true
 command: |-
   checkpoint=s3://olmo-checkpoints/ai2-llm/olmo-medium/wd2gxrza/step556000-unsharded
-  learning_rate=2e-5
-  run_name=mitchish-mcli-2.5T-instruct-${learning_rate}-v2
+  learning_rate=2e-6
+  run_name=mitchish-mcli-2.5T-instruct-${learning_rate}-5ep-v2
   # NOTE: For some reason getting S3 and R2 authentication working both from the command line and
   # from Python proved to be challenging, maybe because Mosaic's server are in Australia.
@@ -38,7 +38,6 @@ command: |-
   # Prepare environment including AWS config files for both S3 and R2 access.
   mkdir -p /root/.cache/torch
   mkdir /root/checkpoint-unsharded
-  mkdir /root/data
   mkdir /root/.aws
   touch /root/.aws/credentials /root/.aws/config
   echo '[s3]' >> /root/.aws/credentials
@@ -75,6 +74,11 @@ command: |-
     --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \
     "${checkpoint}/model.pt" /root/checkpoint-unsharded/
+
+  # Download optimizer state.
+  aws s3 cp --profile=r2 --region=auto \
+    --endpoint-url=https://a198dc34621661a1a66a02d6eb7c4dc3.r2.cloudflarestorage.com \
+    "${checkpoint}/optim.pt" /root/checkpoint-unsharded/
   # Now remove the aws configs so it doesn't mess with data loading / uploading checkpoints to/from S3.
   rm -rf /root/.aws
@@ -90,8 +94,8 @@ command: |-
     --save_overwrite \
     --save_interval_unsharded=10000 \
     --load_path=/root/checkpoint-unsharded \
-    --reset_optimizer_state \
     --reset_trainer_state \
     --compile=null \
     --activation_checkpointing=fine_grained \
-    --fsdp.wrapping_strategy=size_based
+    --fsdp.wrapping_strategy=size_based \
+    --max_duration=5ep
42 changes: 3 additions & 39 deletions configs/mitchish-instruct.yaml
@@ -43,15 +43,15 @@ compile:
 optimizer:
   name: adamw
   learning_rate: 2e-5
-  weight_decay: 0.0
+  weight_decay: 0.1
   betas:
   - 0.9
-  - 0.999
+  - 0.95
   metrics_log_interval: 10

 scheduler:
   name: linear_with_warmup
-  t_warmup: 100
+  t_warmup: 200
   alpha_f: 0.001

 tokenizer:
@@ -91,42 +91,6 @@ eval_interval: ${save_interval}
 eval_subset_num_batches: -1
 device_eval_batch_size: ${device_train_microbatch_size}
 evaluators:
-  - label: all-small-ppl-validation
-    data:
-      num_workers: 0
-      drop_last: true
-      # pin_memory: true
-      # prefetch_factor: 1
-      # persistent_workers: false
-      # timeout: 0
-      datasets:
-        4chan-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy
-        c4_100_domains-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy
-        c4_en-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy
-        gab-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy
-        ice-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy
-        m2d2_s2orc-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy
-        m2d2_wiki-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy
-        manosphere-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy
-        mc4_en-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy
-        pile-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy
-        ptb-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy
-        twitterAEE-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy
-        wikitext_103-validation:
-        - s3://ai2-llm/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy

 ##########################
 # Downstream evaluations #
 ##########################
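
The fine-tuning hyperparameters in the main config shift slightly as well: weight decay goes from 0.0 to 0.1, the AdamW beta2 drops from 0.999 to 0.95, and the linear warmup doubles from 100 to 200 steps. A rough translation into plain PyTorch is sketched below; the total step budget and the exact shape of linear_with_warmup (warm up to the peak LR, then decay linearly to alpha_f times the peak) are assumptions for illustration, not the project's scheduler implementation:

import torch

model = torch.nn.Linear(16, 16)  # stand-in for the real model
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,             # learning_rate
    betas=(0.9, 0.95),   # beta2 lowered from 0.999
    weight_decay=0.1,    # raised from 0.0
)

t_warmup, t_max, alpha_f = 200, 10_000, 0.001  # t_max is a made-up total step count

def lr_lambda(step: int) -> float:
    # Linear warmup for the first t_warmup steps...
    if step < t_warmup:
        return step / max(1, t_warmup)
    # ...then linear decay toward alpha_f * peak over the remaining steps.
    frac = (step - t_warmup) / max(1, t_max - t_warmup)
    return 1.0 - (1.0 - alpha_f) * min(1.0, frac)

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
# Call scheduler.step() after each optimizer.step() to advance the schedule.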

0 comments on commit b69ea02
