From f3159234bf8de50486015b6b47708b5c3271170d Mon Sep 17 00:00:00 2001
From: Valentina Pyatkin
Date: Tue, 12 Nov 2024 14:48:37 -0800
Subject: [PATCH] Add files via upload

The original version of this data combination led to the best 7B DPO
performance. This config uses the same prompts, but with on-policy
completions; performance is comparable but slightly worse.
---
 .../train_configs/dpo/best_on_policy_7b.yaml | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 configs/train_configs/dpo/best_on_policy_7b.yaml

diff --git a/configs/train_configs/dpo/best_on_policy_7b.yaml b/configs/train_configs/dpo/best_on_policy_7b.yaml
new file mode 100644
index 000000000..5262780e5
--- /dev/null
+++ b/configs/train_configs/dpo/best_on_policy_7b.yaml
@@ -0,0 +1,29 @@
+model_name_or_path: /model
+model_revision: main
+use_flash_attn: true
+gradient_checkpointing: true
+dataset_mixer:
+  allenai/ultrafeedback_binarized_cleaned_train: 1.0
+  ai2-adapt-dev/sft_v3.9_used_off_policy: 1.0
+  ai2-adapt-dev/sft_v3.9_used_on_policy_small_8b_ckpt: 1.0
+  ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0
+tokenizer_name: /model
+use_slow_tokenizer: true
+max_seq_length: 2048
+preprocessing_num_workers: 16
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16 # designed for 8 GPUs, so effective batch size 1 x 16 x 8 = 128
+learning_rate: 5.0e-7
+lr_scheduler_type: linear
+warmup_ratio: 0.1
+weight_decay: 0.0
+num_train_epochs: 1
+output_dir: /output
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+use_lora: false
+dpo_loss_type: dpo_norm
+dpo_beta: 5
+checkpointing_steps: 1000
\ No newline at end of file
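
Note (not part of the patch): the batch-size comment in the config works out as
follows. A minimal sketch of the arithmetic, assuming the 8-GPU setup the
comment describes; the variable names simply mirror the config keys:

    # Hypothetical check of the effective batch size implied by the config above.
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 16
    num_gpus = 8  # assumption taken from the config comment "designed for 8 GPUs"

    effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
    print(effective_batch_size)  # -> 128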
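
Note (not part of the patch): dpo_loss_type: dpo_norm with dpo_beta: 5
plausibly selects a length-normalized variant of the DPO objective. A minimal
PyTorch sketch, assuming "dpo_norm" means DPO computed on per-token-averaged
(length-normalized) sequence log-probs; the function name and signature are
illustrative, not the repo's actual API:

    import torch
    import torch.nn.functional as F

    def dpo_norm_loss(policy_chosen_logps, policy_rejected_logps,
                      ref_chosen_logps, ref_rejected_logps, beta=5.0):
        # Inputs: sequence log-probs averaged over tokens (length-normalized),
        # one tensor entry per preference pair.
        chosen_logratios = policy_chosen_logps - ref_chosen_logps
        rejected_logratios = policy_rejected_logps - ref_rejected_logps
        # Standard Bradley-Terry-style DPO objective on the normalized log-ratios;
        # beta scales how sharply the model is pushed toward the chosen completion.
        return -F.logsigmoid(beta * (chosen_logratios - rejected_logratios)).mean()

With length normalization, longer completions are not automatically favored by
raw summed log-probs, which is one common motivation for this variant.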