From 194b1a342f11d6202d119d5b4b1c8a95b4e30207 Mon Sep 17 00:00:00 2001
From: Lee Miller
Date: Tue, 23 Jul 2024 19:21:18 -0600
Subject: [PATCH] adding l3 supercat config

---
 wordllama/config/l3_supercat.toml | 35 +++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 wordllama/config/l3_supercat.toml

diff --git a/wordllama/config/l3_supercat.toml b/wordllama/config/l3_supercat.toml
new file mode 100644
index 0000000..0fe7c82
--- /dev/null
+++ b/wordllama/config/l3_supercat.toml
@@ -0,0 +1,35 @@
+[model]
+dim = 28672
+n_vocab = 128256
+hf_model_id = "meta-llama/Meta-Llama-3.1-405B"
+pad_token = "<|end_of_text|>"
+
+[tokenizer]
+return_tensors = "pt"
+return_attention_mask = true
+max_length = 256
+padding = "longest"
+truncation = true
+add_special_tokens = false
+
+[tokenizer.inference]
+use_local_config = true
+config_filename = "l3_supercat_tokenizer_config.json"
+
+[training]
+output_dir = "output/matryoshka_supercat"
+num_train_epochs = 2
+per_device_train_batch_size = 256
+warmup_steps = 256
+evaluation_strategy = "steps"
+eval_steps = 250
+save_steps = 1000
+fp16 = true
+include_num_input_tokens_seen = false
+learning_rate = 3e-4
+multi_dataset_batch_sampler = "PROPORTIONAL"
+binarizer_ste = "ste"
+
+[matryoshka]
+dims = [1024, 512, 256, 128, 64]
+
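
Not part of the patch: a minimal sketch of reading the new config with Python's standard-library TOML parser (tomllib, Python 3.11+), to show how the tables added above are laid out. The repository-relative path and the printed keys are taken from the file in this diff; this is not wordllama's own config loader.

    import tomllib  # standard-library TOML parser, Python 3.11+
    from pathlib import Path

    # Path of the file added by this patch, relative to the repo root.
    config_path = Path("wordllama/config/l3_supercat.toml")

    # tomllib requires a binary file handle.
    with config_path.open("rb") as f:
        config = tomllib.load(f)

    # [model] table: embedding dimension and vocabulary size.
    print(config["model"]["dim"], config["model"]["n_vocab"])   # 28672 128256

    # [matryoshka] table: the nested output dimensions.
    print(config["matryoshka"]["dims"])                          # [1024, 512, 256, 128, 64]

    # [tokenizer] table: settings intended for the tokenizer call.
    print(config["tokenizer"]["max_length"], config["tokenizer"]["padding"])  # 256 longest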