init

pytorch · Feb 12, 2024 · 3d55991 · 3d55991
1 parent 412bb67
commit 3d55991
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 10 deletions.
diff --git a/examples/rlhf/train_rlhf.py b/examples/rlhf/train_rlhf.py
@@ -100,9 +100,7 @@ def main(cfg):
     # using a Gym-like API (querying steps etc) introduces some
     # extra code that we can spare.
     #
-    kl_scheduler = AdaptiveKLController(
-        model=model, init_kl_coef=0.1, target=6, horizon=10000
-    )
+    kl_scheduler = AdaptiveKLController(init_kl_coef=0.1, target=6, horizon=10000)
     rollout_from_model = RolloutFromModel(
         model,
         ref_model,

diff --git a/torchrl/data/rlhf/utils.py b/torchrl/data/rlhf/utils.py
@@ -7,7 +7,7 @@
 import abc
 import collections
 import importlib
-from typing import Sequence, Tuple
+from typing import List, Tuple
 
 import numpy as np
 import torch
@@ -30,7 +30,7 @@ class KLControllerBase(abc.ABC):
     """
 
     @abc.abstractmethod
-    def update(self, kl_values: float) -> float:
+    def update(self, kl_values: List[float]) -> float:
         ...
 
 
@@ -63,7 +63,7 @@ def __init__(
         if model is not None:
             self.model.kl_coef = self.coef
 
-    def update(self, kl_values: Sequence[float] = None) -> float:
+    def update(self, kl_values: List[float] = None) -> float:
         if self.model is not None:
             self.model.kl_coef = self.coef
         return self.coef
@@ -104,7 +104,7 @@ def __init__(
         if model is not None:
             self.model.kl_coef = self.coef
 
-    def update(self, kl_values: Sequence[float]):
+    def update(self, kl_values: List[float]):
         """Update ``self.coef`` adaptively.
 
         Arguments:
@@ -256,8 +256,6 @@ def create_rollout_td(self, batch, generated, log_probs, log_ratio):
             log_ratio (torch.Tensor): The log ratio of the probabilities of the generated tokens
                 according to the generative model and the reference model. Can be
                 obtained by calling the ``generate`` method.
-            kl_coef (float, optional): Coefficient with which to multiply the KL term before subtracting
-                from the reward. Defaults to 0.1.
 
         Returns:
             A :class:`~tensordict.TensorDict` with the following keys:
@@ -537,7 +535,7 @@ def generate(self, batch: PromptData, generation_config=None):
 
     def step_scheduler(self):
         # recover true kl
-        self.kl_scheduler.update(self._kl_queue)
+        self.kl_coef = self.kl_scheduler.update(self._kl_queue)
         if isinstance(self._kl_queue, (list, collections.deque)):
             # remove all values
             while len(self._kl_queue):