From 069d089cd0aa67c6f64ffbccdd133f5f078988ad Mon Sep 17 00:00:00 2001
From: StoneT2000
Date: Mon, 22 Jan 2024 22:29:32 -0800
Subject: [PATCH] fix some batching code, only reset and step return numpy in cpu mode

---
 mani_skill2/envs/sapien_env.py      | 37 ++++++++++++++---------------
 mani_skill2/envs/tasks/push_cube.py |  1 +
 mani_skill2/utils/sapien_utils.py   |  9 +++----
 3 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/mani_skill2/envs/sapien_env.py b/mani_skill2/envs/sapien_env.py
index f29377bfb..d43cbf1de 100644
--- a/mani_skill2/envs/sapien_env.py
+++ b/mani_skill2/envs/sapien_env.py
@@ -338,7 +338,7 @@ def get_obs(self):
         if physx.is_gpu_enabled():
             return obs
         else:
-            return unbatch(to_numpy(obs))
+            return unbatch(obs)
 
     def _get_obs_state_dict(self):
         """Get (ground-truth) state-based observations."""
@@ -414,16 +414,16 @@ def reward_mode(self):
 
     def get_reward(self, obs: Any, action: Array, info: Dict):
         if self._reward_mode == "sparse":
-            return to_tensor(info["success"])
+            reward = info["success"]
         elif self._reward_mode == "dense":
-            return self.compute_dense_reward(obs=obs, action=action, info=info)
+            reward = self.compute_dense_reward(obs=obs, action=action, info=info)
         elif self._reward_mode == "normalized_dense":
-            return self.compute_normalized_dense_reward(
+            reward = self.compute_normalized_dense_reward(
                 obs=obs, action=action, info=info
             )
         else:
             raise NotImplementedError(self._reward_mode)
-
+        return reward
 
     def compute_dense_reward(self, obs: Any, action: Array, info: Dict):
         raise NotImplementedError
@@ -545,12 +545,15 @@ def reset(self, seed=None, options=None):
             self._set_episode_rng(self._episode_seed)
 
         self.initialize_episode()
+        obs = self.get_obs()
         if physx.is_gpu_enabled():
             # ensure all updates to object poses and configurations are applied on GPU after task initialization
             self._scene._gpu_apply_all()
             self._scene.px.gpu_update_articulation_kinematics()
             self._scene._gpu_fetch_all()
-        return self.get_obs(), {}
+        else:
+            obs = to_numpy(obs)
+        return obs, {}
 
     def _set_main_rng(self, seed):
         """Set the main random generator (e.g., to generate the seed for each episode)."""
@@ -616,24 +619,17 @@ def _clear_sim_state(self):
 
     # -------------------------------------------------------------------------- #
     def step(self, action: Union[None, np.ndarray, Dict]):
-        with sapien.profile("step_action"):
-            self.step_action(action)
+        self.step_action(action)
         self._elapsed_steps += 1
-        # TODO (stao): I think evaluation should always occur first before generating observations
-        # as evaluation is more likely to use privileged information whereas observations only sometimes should include privileged information
-        with sapien.profile("get_obs"):
-            obs = self.get_obs()
+        obs = self.get_obs()
         info = self.get_info(obs=obs)
         reward = self.get_reward(obs=obs, action=action, info=info)
         terminated = info["success"]
-        if self.num_envs == 1:
-            terminated = terminated[0]
-            reward = reward[0]
-
         if physx.is_gpu_enabled():
             return obs, reward, terminated, torch.Tensor(False), info
         else:
-            return unbatch(obs, reward, terminated.item(), False, to_numpy(info))
+            # In CPU sim mode, we always return numpy / python primitives without any batching.
+            return unbatch(to_numpy(obs), to_numpy(reward), to_numpy(terminated), False, to_numpy(info))
 
     def step_action(self, action):
         set_action = False
@@ -674,8 +670,11 @@ def get_info(self, **kwargs):
         """
         info = dict(elapsed_steps=self._elapsed_steps)
         info.update(self.evaluate(**kwargs))
-        return info
-
+        if physx.is_gpu_enabled():
+            return info
+        else:
+            return unbatch(info)
+
     def _before_control_step(self):
         pass
 
diff --git a/mani_skill2/envs/tasks/push_cube.py b/mani_skill2/envs/tasks/push_cube.py
index 037287d97..79feefdde 100644
--- a/mani_skill2/envs/tasks/push_cube.py
+++ b/mani_skill2/envs/tasks/push_cube.py
@@ -38,6 +38,7 @@
 
 @register_env("PushCube-v0", max_episode_steps=50)
 class PushCubeEnv(BaseEnv):
+    # Specify some supported robot types
     agent: Union[Panda, Xmate3Robotiq]
 
     # set some commonly used values
diff --git a/mani_skill2/utils/sapien_utils.py b/mani_skill2/utils/sapien_utils.py
index fcb8141b0..646bca4fc 100644
--- a/mani_skill2/utils/sapien_utils.py
+++ b/mani_skill2/utils/sapien_utils.py
@@ -47,12 +47,9 @@ def to_tensor(array: Union[torch.Tensor, np.array, Sequence]):
 def _to_numpy(array: Union[Array, Sequence]) -> np.ndarray:
     if isinstance(array, (dict)):
         return {k: _to_numpy(v) for k, v in array.items()}
-    if isinstance(array, str):
-        return array
-    if torch is not None:
-        if isinstance(array, torch.Tensor):
-            return array.cpu().numpy()
-    if isinstance(array, np.ndarray):
+    if isinstance(array, torch.Tensor):
+        return array.cpu().numpy()
+    if isinstance(array, (np.ndarray, bool, str, float, int)):
         return array
     else:
         return np.array(array)
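
Note: below is a minimal usage sketch of the behavior this patch targets, not part of the patch itself. It assumes a Gymnasium-style API and that importing mani_skill2.envs registers the "PushCube-v0" environment touched above; treat the import path and registration wiring as assumptions.

    # Sketch: with CPU sim (GPU physx disabled), reset() and step() should now
    # return unbatched numpy arrays / plain Python values; with GPU sim enabled
    # they remain batched torch tensors on the device.
    import gymnasium as gym
    import mani_skill2.envs  # noqa: F401  (assumed to register the environments)

    env = gym.make("PushCube-v0")
    obs, info = env.reset(seed=0)
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    # In CPU sim mode these should print numpy / python types and a scalar step count.
    print(type(obs), type(reward), type(terminated), info["elapsed_steps"])
    env.close()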