Add mu, ucb score logging during ap_container training
Summary:
Adding logging of mu and UCB scores during training. Based on the scalar metric logging added to RecMetrics in D53499300 (example: D53499301).
`lifetime_scalar` shows the average values for each batch.
`window_scalar` shows the smoothed average values.
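
For intuition, a minimal sketch of the smoothing behind a windowed scalar metric. The `WindowedScalar` helper below is hypothetical; the actual aggregation is done by RecMetrics as referenced above.

```python
from collections import deque


class WindowedScalar:
    """Hypothetical moving-average helper: feed it one per-batch scalar
    (e.g. the batch mean of mu or of the UCB scores) per training step."""

    def __init__(self, window_size: int = 100) -> None:
        self._window = deque(maxlen=window_size)

    def update(self, batch_value: float) -> None:
        # record the latest per-batch scalar
        self._window.append(batch_value)

    @property
    def value(self) -> float:
        # smoothed average over the most recent `window_size` batches
        return sum(self._window) / max(len(self._window), 1)
```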

Reviewed By: zxpmirror1994

Differential Revision: D54961342

fbshipit-source-id: 6df4dfcdefd71f1adaa266e4087d42de53af6ba4
Alex Nikulkov authored and facebook-github-bot committed Mar 18, 2024
1 parent b1f3ca1 commit b335f84
Showing 2 changed files with 9 additions and 3 deletions.
1 change: 0 additions & 1 deletion pearl/policy_learners/contextual_bandits/linear_bandit.py
@@ -153,7 +153,6 @@ def get_scores(
UCB scores when exploration module is UCB
Shape is (batch)
"""
-assert isinstance(self._exploration_module, ScoreExplorationBase)
feature = concatenate_actions_to_state(
subjective_state=subjective_state,
action_space=action_space,
11 changes: 9 additions & 2 deletions pearl/policy_learners/contextual_bandits/neural_linear_bandit.py
@@ -29,7 +29,7 @@
DEFAULT_ACTION_SPACE,
)
from pearl.policy_learners.exploration_modules.contextual_bandits.ucb_exploration import (
-UCBExploration,
+ScoreExplorationBase,
)
from pearl.policy_learners.exploration_modules.exploration_module import (
ExplorationModule,
@@ -151,6 +151,9 @@ def learn_batch(self, batch: TransitionBatch) -> Dict[str, Any]:
else torch.ones_like(expected_values)
)

+# get scores for logging
+ucb_scores = self.get_scores(input_features).mean()

# criterion = mae, mse, Xentropy
# Xentropy loss applies Sigmoid; MSE or MAE apply Identity
criterion = LOSS_TYPES[self.loss_type]
@@ -179,11 +182,14 @@ def learn_batch(self, batch: TransitionBatch) -> Dict[str, Any]:
batch_weight,
)
self._maybe_apply_discounting()
+predicted_values = predicted_values.detach() # detach for logging
return {
"label": expected_values,
"prediction": predicted_values,
"weight": batch_weight,
"loss": loss.detach(),
"scores:ucb": ucb_scores,
"scores:mu": predicted_values.mean(),
}

def act(
@@ -219,6 +225,7 @@ def act(
representation=self.model._linear_regression_layer,
)

+@torch.no_grad() # the UCB scores don't need the gradients
def get_scores(
self,
subjective_state: SubjectiveState,
@@ -238,7 +245,7 @@ def get_scores(
# dim: [batch_size, num_arms, feature_dim]
model_ret = self.model.forward_with_intermediate_values(feature)
# dim: [batch_size * num_arms, 1]
-assert isinstance(self._exploration_module, UCBExploration)
+assert isinstance(self._exploration_module, ScoreExplorationBase)
scores = self._exploration_module.get_scores(
subjective_state=model_ret["nn_output"],
values=model_ret["pred_label"],
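
For reference, a minimal sketch of how a training loop could read the two new entries returned by `learn_batch`. The `policy_learner` and `batch` objects are assumed to be provided by the surrounding training code; the actual metric aggregation happens in RecMetrics as described in the summary.

```python
# Sketch only: consume the new logging entries added in this commit.
metrics = policy_learner.learn_batch(batch)

# "scores:ucb" is the batch mean of the UCB scores, "scores:mu" the batch
# mean of the predicted values; both are detached 0-dim tensors.
ucb_scalar = metrics["scores:ucb"].item()
mu_scalar = metrics["scores:mu"].item()
print(f"batch ucb: {ucb_scalar:.4f}, batch mu: {mu_scalar:.4f}")
```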
