Unity-Technologies · ervteng · Nov 19, 2020 · Nov 19, 2020 · vincentpierre · Nov 19, 2020
diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py
@@ -29,6 +29,7 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
 
     def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
         with torch.no_grad():
+            self._discriminator_network.update_normalization(mini_batch)
             estimates, _ = self._discriminator_network.compute_estimate(
                 mini_batch, use_vail_noise=False
             )
@@ -70,7 +71,7 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
         self._settings = settings
 
         encoder_settings = NetworkSettings(
-            normalize=False,
+            normalize=True,
             hidden_units=settings.encoding_size,
             num_layers=2,
             vis_encode_type=EncoderType.SIMPLE,
@@ -104,6 +105,13 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
             linear_layer(estimator_input_size, 1), torch.nn.Sigmoid()
         )
 
+    def update_normalization(self, mini_batch: AgentBuffer) -> None:
+        """
+        Updates the normalization of this Discriminator's encoder.
+        """
+        vec_inputs, _ = self.get_state_inputs(mini_batch)  # only the first returned value is needed; the second is discarded
+        self.encoder.update_normalization(vec_inputs)  # delegate the update to the encoder; nothing is returned
+
     def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor:
         """
         Creates the action Tensor. In continuous case, corresponds to the action. In
@@ -271,9 +279,14 @@ def compute_gradient_magnitude(
             use_vail_noise = True
             z_mu = self._z_mu_layer(hidden)
             hidden = torch.normal(z_mu, self._z_sigma * use_vail_noise)
-        estimate = self._estimator(hidden).squeeze(1).sum()
-        gradient = torch.autograd.grad(estimate, encoder_input, create_graph=True)[0]
+        estimate = self._estimator(hidden).squeeze(1)
+        gradient = torch.autograd.grad(
+            estimate,
+            encoder_input,
+            grad_outputs=torch.ones(estimate.shape),
+            create_graph=True,
+        )[0]
         # Norm's gradient could be NaN at 0. Use our own safe_norm
-        safe_norm = (torch.sum(gradient ** 2, dim=1) + self.EPSILON).sqrt()
+        safe_norm = (torch.sum(torch.pow(gradient, 2), dim=1) + self.EPSILON).sqrt()
         gradient_mag = torch.mean((safe_norm - 1) ** 2)
         return gradient_mag