huggingface · AIR-hl · May 16, 2025 · May 16, 2025 · May 16, 2025 · May 19, 2025
diff --git a/docs/source/dpo_trainer.md b/docs/source/dpo_trainer.md
@@ -168,6 +168,9 @@ The [RPO](https://huggingface.co/papers/2404.19733) paper implements an iterativ
 
 The [WPO](https://huggingface.co/papers/2406.11827) paper adapts off-policy data to resemble on-policy data more closely by reweighting preference pairs according to their probability under the current policy. To use this method, set the `use_weighting` flag to `True` in the [`DPOConfig`].
 
+### LD-DPO loss
+The [LD-DPO](https://huggingface.co/papers/2409.06411) paper decomposes the portion of the response that exceeds the desired length into two components — human-like preferences and verbosity preference — based on a mixing coefficient $\alpha$. To use this method, set `ld_alpha` in the [`DPOConfig`] to an appropriate value. The paper suggests setting this value between `0.0` and `1.0`.
+
 ### For Mixture of Experts Models: Enabling the auxiliary loss
 
 MOEs are the most efficient if the load is about equally distributed between experts.  

diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py
@@ -139,6 +139,10 @@ class DPOConfig(TrainingArguments):
             α parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper (v3), which controls the
             weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the
             DPO loss. The paper recommends `rpo_alpha=1.0`.
+        ld_alpha (`float`, *optional*, defaults to `None`):
+            α parameter from the [LD-DPO](https://huggingface.co/papers/2409.06411) paper, which controls the weighting
+            of the verbose token log-probabilities in responses. If `None`, no weighting is applied to the verbose
+            part and the loss is the same as the DPO loss. The paper recommends setting `ld_alpha` between `0.0` and
+            `1.0`.
         discopop_tau (`float`, *optional*, defaults to `0.05`):
             τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls
             the shape of log ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`.
@@ -346,6 +350,14 @@ class DPOConfig(TrainingArguments):
             "`rpo_alpha=1.0`."
         },
     )
+    ld_alpha: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "α parameter from the LD-DPO paper, which controls the verbose token logp in responses. If "
+            "`None`, no weighting is applied on the verbose part and the loss is the same as the DPO loss. "
+            "The paper recommends `ld_alpha` should be between `0.0` and `1.0`"
+        },
+    )
     discopop_tau: float = field(
         default=0.05,
         metadata={

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
@@ -1218,6 +1218,28 @@ def concatenated_forward(self, model: nn.Module, batch: dict[str, Union[list, to
         if self.loss_type == "ipo":
             all_logps = all_logps / loss_mask.sum(-1)
 
+        if self.args.ld_alpha is not None:
+            # Compute response lengths based on loss_mask
+            completion_lengths = loss_mask.sum(dim=1)
+
+            chosen_lengths = completion_lengths[:num_examples]
+            rejected_lengths = completion_lengths[num_examples:]
+            l_p = torch.min(chosen_lengths, rejected_lengths)
+            l_p = torch.cat([l_p, l_p], dim=0)
+
+            seq_len = per_token_logps.size(1)
+            position_ids = torch.arange(seq_len, device=per_token_logps.device).expand_as(per_token_logps)
+
+            ld_mask = position_ids < l_p.unsqueeze(1)
+            mask = position_ids < completion_lengths.unsqueeze(1)
+
+            front_mask = (ld_mask & mask).float()
+            rear_mask = (~ld_mask & mask).float()
+            front_logps = (per_token_logps * front_mask).sum(dim=1)
+            rear_logps = (per_token_logps * rear_mask).sum(dim=1)
+
+            all_logps = front_logps + self.args.ld_alpha * rear_logps
+
         output["chosen_logps"] = all_logps[:num_examples]
         output["rejected_logps"] = all_logps[num_examples:]