Skip to content

Commit

Permalink
change observations only from float64 to float32 before adding to the…
Browse files Browse the repository at this point in the history
… replay buffer

Summary: This is to prepare future diffs for supporting Atari games. In Atari games, the observations are uint8 type. Currently, pearl converts uint8 observations to float32 before adding them to the replay buffer. A huge amount of memory is used when the agent saves float32 images in the replay buffer. This diff reduces the memory usage by not making the conversion when the observations are added to the replay buffer, and instead performs the conversion when batches are sampled from the replay buffer.

Reviewed By: rodrigodesalvobraz

Differential Revision: D65923483

fbshipit-source-id: b43825a31c4da2aa4e58cea5bb47025379b0903e
  • Loading branch information
yiwan-rl authored and facebook-github-bot committed Dec 7, 2024
1 parent 3e52c4a commit 8941dbe
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions pearl/replay_buffers/tensor_based_replay_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def _create_transition_batch(
x.next_unavailable_actions_mask
)

state_batch = torch.cat(state_list)
state_batch = torch.cat(state_list).type(torch.float32)
action_batch = torch.cat(action_list)
reward_batch = torch.cat(reward_list)
terminated_batch = torch.cat(terminated_list)
Expand All @@ -358,7 +358,7 @@ def _create_transition_batch(
cost_batch = None
next_state_batch, next_action_batch = None, None
if has_next_state:
next_state_batch = torch.cat(next_state_list)
next_state_batch = torch.cat(next_state_list).type(torch.float32)
if has_next_action:
next_action_batch = torch.cat(next_action_list)
curr_available_actions_batch, curr_unavailable_actions_mask_batch = None, None
Expand Down
4 changes: 2 additions & 2 deletions pearl/utils/instantiations/environments/gym_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def reset(self, seed: int | None = None) -> tuple[Observation, ActionSpace]:
# TODO: Deprecate this part at some point and only support new
# version of Gymnasium?
observation = list(reset_result.values())[0] # pyre-ignore
if isinstance(observation, np.ndarray):
if isinstance(observation, np.float64):
observation = observation.astype(np.float32)
return observation, self.action_space

Expand Down Expand Up @@ -151,7 +151,7 @@ def step(self, action: Action) -> ActionResult:
else:
available_action_space = None

if isinstance(observation, np.ndarray):
if isinstance(observation, np.float64):
observation = observation.astype(np.float32)
if isinstance(reward, np.float64):
reward = reward.astype(np.float32)
Expand Down

0 comments on commit 8941dbe

Please sign in to comment.