Skip to content

Commit

Permalink
Merge branch 'huvu/t5_dist_checkpoint_mrtests' into 'main'
Browse files Browse the repository at this point in the history
Adding more MR tests for T5 (e.g., transformer_engine, distributed_checkpoint)

See merge request ADLR/megatron-lm!2109
  • Loading branch information
ko3n1g committed Oct 1, 2024
2 parents dddecd1 + 5ab659b commit 3efa8c2
Show file tree
Hide file tree
Showing 10 changed files with 1,101 additions and 0 deletions.
6 changes: 6 additions & 0 deletions tests/functional_tests/jet_recipes/t5.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ products:
- scope: [mr]
time_limit: [12000]
test_case:
- t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G
- t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
- t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
- t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G
- scope: [weekly]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [19.39068, 0.66038, 0.65673, 0.66493, 0.65894, 0.6473, 0.65746, 0.64942, 0.66259, 0.65247, 0.65165, 0.64944, 0.81313, 0.65069, 0.64982, 0.65247, 0.65149, 0.65284, 0.64913, 0.6496]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.63253, 0.27412, 0.26777, 0.27338, 0.26922, 0.26445, 0.27043, 0.26308, 0.27178, 0.26246, 0.26565, 0.26691, 0.42095, 0.26741, 0.26653, 0.26546, 0.26547, 0.26403, 0.26266, 0.26606]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.0264, 0.24005, 0.23751, 0.24162, 0.24102, 0.23888, 0.24027, 0.23829, 0.24182, 0.24308, 0.24109, 0.23964, 0.23841, 0.24005, 0.23898, 0.23896, 0.24052, 0.23894, 0.24242, 0.23863]}, "forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [8.32911, 0.07441, 0.07755, 0.07578, 0.07557, 0.07223, 0.0737, 0.07404, 0.07108, 0.07174, 0.07137, 0.07162, 0.07437, 0.07185, 0.07129, 0.07247, 0.0719, 0.07573, 0.07292, 0.07122]}, "forward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.47287, 0.00053, 0.00063, 0.00048, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00063, 0.00044, 0.00046, 0.00047, 0.00045, 0.00056, 0.00046, 0.00045, 0.00046, 0.00045, 0.00044]}, "backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.1444, 0.13179, 0.12767, 0.13592, 0.1279, 0.12912, 0.13033, 0.1328, 0.13106, 0.13249, 0.12957, 0.12877, 0.13334, 0.12829, 0.12815, 0.13128, 0.12985, 0.13117, 0.12901, 0.1277]}, "backward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00065, 0.00056, 0.00066, 0.00067, 0.0006, 0.00059, 0.00064, 0.00067, 0.00068, 0.0006, 0.00056, 0.00058, 0.00059, 0.00056, 0.00064, 0.00058, 0.00049, 0.00079, 0.00081, 0.0006]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [12.49425, 0.23291, 0.228, 0.22475, 0.22786, 0.22525, 0.22534, 0.22597, 0.23004, 0.22656, 0.22342, 0.22577, 0.38374, 0.22857, 0.22673, 0.22371, 0.22908, 0.23017, 0.23145, 0.23191]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5.02478, 0.00608, 0.00441, 0.00414, 0.0093, 0.00347, 0.00363, 0.00527, 0.0093, 0.00705, 0.00369, 0.00633, 0.00834, 0.00352, 0.0034, 0.00565, 0.00346, 0.00354, 0.00341, 0.0035]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.47745, 0.00052, 0.00064, 0.00053, 0.00052, 0.0006, 0.00052, 0.00062, 0.00052, 0.00056, 0.00065, 0.00056, 0.00054, 0.00053, 0.00058, 0.00052, 0.00052, 0.00052, 0.00055, 0.00053]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.43086, 0.00036, 0.00041, 0.00037, 0.00032, 0.00037, 0.00048, 0.00044, 0.00043, 0.00045, 0.00034, 0.00044, 0.00037, 0.00043, 0.00044, 0.00032, 0.00032, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00053, 0.00034, 0.00032, 0.00033, 0.00034, 0.00031, 0.00033, 0.00035, 0.00032, 0.00033, 0.00036, 0.00035, 0.00033, 0.00033, 0.00034, 0.00035, 0.00033, 0.00034, 0.00032, 0.00035]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.26638, 0.00127, 0.00123, 0.00144, 0.00125, 0.00123, 0.00128, 0.00162, 0.00128, 0.00131, 0.00138, 0.00133, 0.00142, 0.0013, 0.00136, 0.00137, 0.00133, 0.00135, 0.00129, 0.00136]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.01282, 0.00738, 0.00728, 0.00736, 0.00738, 0.00733, 0.00738, 0.00735, 0.00731, 0.00727, 0.00897, 0.00755, 0.0073, 0.00721, 0.00734, 0.00746, 0.00736, 0.00734, 0.00737, 0.00726]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00984, 0.00108, 0.00105, 0.00108, 0.00105, 0.00105, 0.00107, 0.00104, 0.00105, 0.00106, 0.00106, 0.00105, 0.0012, 0.00106, 0.00105, 0.00105, 0.00105, 0.00106, 0.00104, 0.00106]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0011, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.0015, 0.00102, 0.00101, 0.00101, 0.00102, 0.00268, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.29197, 0.01172, 0.01152, 0.01191, 0.01165, 0.01156, 0.0117, 0.01199, 0.01159, 0.01161, 0.0134, 0.01194, 0.01269, 0.01155, 0.01172, 0.01186, 0.01173, 0.01343, 0.01172, 0.01165]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.7057, 0.68569, 0.68236, 0.69077, 0.68415, 0.67238, 0.68288, 0.67481, 0.6874, 0.67748, 0.6785, 0.67478, 0.83941, 0.6755, 0.67503, 0.67787, 0.67668, 0.67904, 0.67443, 0.67541]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: ^NVLS
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--encoder-num-layers: 12
--decoder-num-layers: 12
--hidden-size: 768
--num-attention-heads: 12
--kv-channels: 64
--ffn-hidden-size: 3072
--encoder-seq-length: 512
--decoder-seq-length: 128
--max-position-embeddings: 512
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 2
--micro-batch-size: 4
--global-batch-size: 32
--lr: 0.0001
--train-iters: 100
--lr-decay-iters: 100
--lr-decay-style: linear
--min-lr: 0.00001
--weight-decay: 1e-2
--lr-warmup-fraction: .01
--clip-grad: 1.0
--bf16: true
--vocab-extra-ids: 100
--init-method-std: 0.015
--transformer-impl: transformer_engine
--data-path: ${DATA_PATH}/my-t5_00_text_document
--vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt
--tokenizer-type: BertWordPieceCase
--calculate-per-token-loss: true
--split: 99982,9,9
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--timing-log-level: 2
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--distributed-backend: nccl
--data-cache-path: ${DATA_CACHE_PATH}
--encoder-pipeline-model-parallel-size: 2
--deterministic-mode: true
--ckpt-format: torch
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: ^NVLS
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--encoder-num-layers: 12
--decoder-num-layers: 12
--hidden-size: 768
--num-attention-heads: 12
--kv-channels: 64
--ffn-hidden-size: 3072
--encoder-seq-length: 512
--decoder-seq-length: 128
--max-position-embeddings: 512
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 2
--micro-batch-size: 4
--global-batch-size: 32
--lr: 0.0001
--train-iters: 100
--lr-decay-iters: 100
--lr-decay-style: linear
--min-lr: 0.00001
--weight-decay: 1e-2
--lr-warmup-fraction: .01
--clip-grad: 1.0
--bf16: true
--vocab-extra-ids: 100
--init-method-std: 0.015
--transformer-impl: transformer_engine
--data-path: ${DATA_PATH}/my-t5_00_text_document
--vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt
--tokenizer-type: BertWordPieceCase
--calculate-per-token-loss: true
--split: 99982,9,9
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--timing-log-level: 2
--log-interval: 1
--save-interval: 50
--eval-interval: 1000
--eval-iters: 10
--distributed-backend: nccl
--data-cache-path: ${DATA_CACHE_PATH}
--encoder-pipeline-model-parallel-size: 2
--deterministic-mode: true
--ckpt-format: torch
TEST_TYPE: ckpt-resume
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [11.55278, 0.77358, 0.76856, 0.77172, 0.75887, 0.76061, 0.75836, 0.76125, 0.76192, 0.76187, 0.76171, 0.76045, 0.7599, 0.76535, 0.76121, 0.76796, 0.76998, 0.76511, 0.76167, 0.75816]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [6.97639, 0.39525, 0.3898, 0.39437, 0.37749, 0.38195, 0.37908, 0.37821, 0.38433, 0.38023, 0.38359, 0.37973, 0.37768, 0.37754, 0.38336, 0.38173, 0.39026, 0.38845, 0.38337, 0.37691]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.32964, 0.37495, 0.37481, 0.37567, 0.37884, 0.37558, 0.37486, 0.37929, 0.37612, 0.37965, 0.37608, 0.37503, 0.37843, 0.38541, 0.37552, 0.38094, 0.37923, 0.37628, 0.37437, 0.37757]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.89543, 0.00188, 0.00211, 0.00164, 0.00165, 0.00162, 0.00162, 0.00162, 0.00184, 0.00165, 0.00164, 0.00208, 0.00162, 0.00167, 0.0016, 0.00168, 0.00165, 0.00163, 0.00164, 0.00161]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00146, 0.00105, 0.00105, 0.00102, 0.00107, 0.00107, 0.00107, 0.00109, 0.00105, 0.00106, 0.00107, 0.00106, 0.00106, 0.00106, 0.00108, 0.00108, 0.00107, 0.00104, 0.00103, 0.0011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.50022, 0.00376, 0.00381, 0.00329, 0.00321, 0.00354, 0.00371, 0.00375, 0.00366, 0.00301, 0.00349, 0.00372, 0.00349, 0.00369, 0.00297, 0.00283, 0.00369, 0.00377, 0.00388, 0.00369]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.04986, 0.02302, 0.02299, 0.02588, 0.02338, 0.0231, 0.02293, 0.0231, 0.02309, 0.02329, 0.02328, 0.02332, 0.02304, 0.02327, 0.02287, 0.02321, 0.02315, 0.0234, 0.02312, 0.02327]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0158, 0.00219, 0.00221, 0.00411, 0.0022, 0.0022, 0.00216, 0.0022, 0.00217, 0.00218, 0.00218, 0.00225, 0.00233, 0.00219, 0.00223, 0.00222, 0.00212, 0.0022, 0.00222, 0.00225]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00301, 0.00302, 0.00302, 0.00339, 0.003, 0.00302, 0.00302, 0.00301, 0.00301, 0.00301, 0.003, 0.00301, 0.00302, 0.00304, 0.003, 0.00301, 0.00299, 0.00304, 0.00303, 0.00303]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.57167, 0.03386, 0.03382, 0.03847, 0.03353, 0.03358, 0.03363, 0.03394, 0.03377, 0.03326, 0.03368, 0.03412, 0.03363, 0.03407, 0.03281, 0.03316, 0.03373, 0.03419, 0.03396, 0.034]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [13.15856, 0.82951, 0.82427, 0.83168, 0.8147, 0.81581, 0.81386, 0.8171, 0.8176, 0.81664, 0.81719, 0.81685, 0.81547, 0.82136, 0.81551, 0.82315, 0.82591, 0.82132, 0.81777, 0.81414]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}}
Loading

0 comments on commit 3efa8c2

Please sign in to comment.