diff --git a/mo_gymnasium/envs/mujoco/__init__.py b/mo_gymnasium/envs/mujoco/__init__.py
index 4415d57..f5d5639 100644
--- a/mo_gymnasium/envs/mujoco/__init__.py
+++ b/mo_gymnasium/envs/mujoco/__init__.py
@@ -45,6 +45,12 @@
     max_episode_steps=1000,
 )
+register(
+    id="mo-walker2d-v5",
+    entry_point="mo_gymnasium.envs.mujoco.walker2d_v5:MOWalker2dEnv",
+    max_episode_steps=1000,
+)
+
 register(
     id="mo-ant-v4",
     entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv",
     max_episode_steps=1000,
@@ -58,20 +64,52 @@
     kwargs={"cost_objective": False},
 )
+
+register(
+    id="mo-ant-v5",
+    entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv",
+    max_episode_steps=1000,
+)
+
+register(
+    id="mo-ant-2d-v5",
+    entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv",
+    max_episode_steps=1000,
+    kwargs={"cost_objective": False},
+)
+
 register(
     id="mo-swimmer-v4",
     entry_point="mo_gymnasium.envs.mujoco.swimmer:MOSwimmerEnv",
     max_episode_steps=1000,
 )
 
+register(
+    id="mo-swimmer-v5",
+    entry_point="mo_gymnasium.envs.mujoco.swimmer_v5:MOSwimmerEnv",
+    max_episode_steps=1000,
+)
+
 register(
     id="mo-humanoid-v4",
     entry_point="mo_gymnasium.envs.mujoco.humanoid:MOHumanoidEnv",
     max_episode_steps=1000,
 )
 
+register(
+    id="mo-humanoid-v5",
+    entry_point="mo_gymnasium.envs.mujoco.humanoid_v5:MOHumanoidEnv",
+    max_episode_steps=1000,
+)
+
 register(
     id="mo-reacher-v4",
     entry_point="mo_gymnasium.envs.mujoco.reacher_v4:MOReacherEnv",
     max_episode_steps=50,
 )
+
+register(
+    id="mo-reacher-v5",
+    entry_point="mo_gymnasium.envs.mujoco.reacher_v5:MOReacherEnv",
+    max_episode_steps=50,
+)
diff --git a/mo_gymnasium/envs/mujoco/ant_v5.py b/mo_gymnasium/envs/mujoco/ant_v5.py
new file mode 100644
index 0000000..c4e4ad9
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/ant_v5.py
@@ -0,0 +1,57 @@
+import numpy as np
+from gymnasium.envs.mujoco.ant_v5 import AntEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOAntEnv(AntEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the AntEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/ant/) for more information.
+
+    The original Gymnasium's 'Ant-v5' is recovered by the following linear scalarization:
+
+    env = mo_gym.make('mo-ant-v5', cost_objective=False)
+    LinearReward(env, weight=np.array([1.0, 0.0]))
+
+    ## Reward Space
+    The reward is 2- or 3-dimensional:
+    - 0: x-velocity
+    - 1: y-velocity
+    - 2: Control and contact cost of the action
+    If the cost_objective flag is set to False, the reward is 2-dimensional, and the costs are added to the other objectives.
+    A healthy reward is added to all objectives.
+
+    ## Version History
+    - v5: Now includes contact forces in the reward and observation.
+      See https://gymnasium.farama.org/environments/mujoco/ant/#version-history
+    """
+
+    def __init__(self, cost_objective=True, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, cost_objective, **kwargs)
+        self.cost_objective = cost_objective
+        self.reward_dim = 3 if cost_objective else 2
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        x_velocity = info["x_velocity"]
+        y_velocity = info["y_velocity"]
+        cost = info["reward_ctrl"]
+        contact_cost = info["reward_contact"]
+        healthy_reward = info["reward_survive"]
+
+        if self.cost_objective:
+            cost /= self._ctrl_cost_weight  # Ignore the weight in the original AntEnv
+            contact_cost /= self._contact_cost_weight
+            vec_reward = np.array([x_velocity, y_velocity, cost + contact_cost], dtype=np.float32)
+        else:
+            vec_reward = np.array([x_velocity, y_velocity], dtype=np.float32)
+            vec_reward += cost + contact_cost
+
+        vec_reward += healthy_reward
+
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/humanoid_v5.py b/mo_gymnasium/envs/mujoco/humanoid_v5.py
new file mode 100644
index 0000000..4cd5bf0
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/humanoid_v5.py
@@ -0,0 +1,37 @@
+import numpy as np
+from gymnasium.envs.mujoco.humanoid_v5 import HumanoidEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOHumanoidEnv(HumanoidEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the HumanoidEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/humanoid/) for more information.
+
+    ## Reward Space
+    The reward is 2-dimensional:
+    - 0: Reward for running forward (x-velocity)
+    - 1: Control and contact cost of the action
+
+    ## Version History:
+    - v5: Now includes contact forces. See: https://gymnasium.farama.org/environments/mujoco/humanoid/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, **kwargs)
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+        self.reward_dim = 2
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        velocity = info["x_velocity"]
+        negative_cost = 10 * info["reward_ctrl"] + info["reward_contact"]  # reward_ctrl is weighted by ctrl_cost_weight (0.1 by default), so scale it back up
+        vec_reward = np.array([velocity, negative_cost], dtype=np.float32)
+
+        vec_reward += self.healthy_reward  # All objectives are penalized when the agent falls
+
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/reacher_v5.py b/mo_gymnasium/envs/mujoco/reacher_v5.py
new file mode 100644
index 0000000..20b9668
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/reacher_v5.py
@@ -0,0 +1,101 @@
+from os import path
+
+import numpy as np
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.envs.mujoco.reacher_v5 import ReacherEnv
+from gymnasium.spaces import Box, Discrete
+
+
+DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0}
+
+
+class MOReacherEnv(ReacherEnv):
+    """
+    ## Description
+    Multi-objective version of the [`Reacher-v5` environment](https://gymnasium.farama.org/environments/mujoco/reacher/).
+
+    ## Observation Space
+    The observation is 6-dimensional and contains:
+    - sin and cos of the angles of the central and elbow joints
+    - angular velocity of the central and elbow joints
+
+    ## Action Space
+    The action space is discrete and contains the 3^2=9 possible actions obtained by applying positive (+1), negative (-1) or zero (0) torque to each of the two joints.
+
+    ## Reward Space
+    The reward is 4-dimensional and is based on the distance between the tip of the arm and each of the four target locations.
+    For each target i in {1, 2, 3, 4} it is computed as:
+    ```math
+        r_i = 1 - 4 * || finger_tip_coord - target_i ||^2
+    ```
+
+    ## Version History:
+    See https://gymnasium.farama.org/environments/mujoco/reacher/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        utils.EzPickle.__init__(self, **kwargs)
+        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float64)
+        MujocoEnv.__init__(
+            self,
+            path.join(path.dirname(__file__), "assets", "mo_reacher.xml"),
+            2,
+            observation_space=self.observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+        actions = [-1.0, 0.0, 1.0]
+        self.action_dict = dict()  # Enumerates the 9 torque combinations for the two joints
+        for a1 in actions:
+            for a2 in actions:
+                self.action_dict[len(self.action_dict)] = (a1, a2)
+        self.action_space = Discrete(9)
+        # Target goals: x1, y1, x2, y2, ... x4, y4
+        self.goal = np.array([0.14, 0.0, -0.14, 0.0, 0.0, 0.14, 0.0, -0.14])
+        self.reward_space = Box(low=-1.0, high=1.0, shape=(4,))
+        self.reward_dim = 4
+
+    def step(self, a):
+        real_action = self.action_dict[int(a)]
+        vec_reward = np.array(
+            [
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target1")[:2]),
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target2")[:2]),
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target3")[:2]),
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target4")[:2]),
+            ],
+            dtype=np.float32,
+        )
+
+        self._step_mujoco_simulation(real_action, self.frame_skip)
+        if self.render_mode == "human":
+            self.render()
+
+        ob = self._get_obs()
+        return (
+            ob,
+            vec_reward,
+            False,
+            False,
+            {},
+        )
+
+    def reset_model(self):
+        qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
+        qpos[:2] = np.array([0, 3.1415 / 2])  # init position
+        qpos[-len(self.goal) :] = self.goal
+        qvel = self.init_qvel + self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv)
+        qvel[-len(self.goal) :] = 0
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def _get_obs(self):
+        theta = self.data.qpos.flatten()[:2]
+        return np.concatenate(
+            [
+                np.cos(theta),
+                np.sin(theta),
+                self.data.qvel.flatten()[:2] * 0.1,
+            ]
+        )
diff --git a/mo_gymnasium/envs/mujoco/swimmer_v5.py b/mo_gymnasium/envs/mujoco/swimmer_v5.py
new file mode 100644
index 0000000..11a160c
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/swimmer_v5.py
@@ -0,0 +1,41 @@
+import numpy as np
+from gymnasium.envs.mujoco.swimmer_v5 import SwimmerEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOSwimmerEnv(SwimmerEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the SwimmerEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/swimmer/) for more information.
+
+    The original Gymnasium's 'Swimmer-v5' is recovered by the following linear scalarization:
+
+    env = mo_gym.make('mo-swimmer-v5')
+    LinearReward(env, weight=np.array([1.0, 1e-4]))
+
+    ## Reward Space
+    The reward is 2-dimensional:
+    - 0: Reward for moving forward (x-velocity)
+    - 1: Control cost of the action
+
+    ## Version History:
+    See https://gymnasium.farama.org/main/environments/mujoco/swimmer/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, **kwargs)
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+        self.reward_dim = 2
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        velocity = info["x_velocity"]
+        energy = -np.sum(np.square(action))  # raw control cost, without the ctrl_cost_weight
+
+        vec_reward = np.array([velocity, energy], dtype=np.float32)
+
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/walker2d_v5.py b/mo_gymnasium/envs/mujoco/walker2d_v5.py
new file mode 100644
index 0000000..5b036db
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/walker2d_v5.py
@@ -0,0 +1,38 @@
+import numpy as np
+from gymnasium.envs.mujoco.walker2d_v5 import Walker2dEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOWalker2dEnv(Walker2dEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the Walker2dEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/walker2d/) for more information.
+
+    ## Reward Space
+    The reward is 2-dimensional:
+    - 0: Reward for running forward (x-velocity)
+    - 1: Control cost of the action
+
+    ## Version History:
+    See https://gymnasium.farama.org/main/environments/mujoco/walker2d/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, **kwargs)
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+        self.reward_dim = 2
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        velocity = info["x_velocity"]
+        energy = -np.sum(np.square(action))
+
+        vec_reward = np.array([velocity, energy], dtype=np.float32)
+
+        vec_reward += self.healthy_reward  # All objectives are penalized when the agent falls
+
+        return observation, vec_reward, terminated, truncated, info
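Below is a minimal usage sketch of the new registrations (not part of the patch). It assumes `mo_gymnasium.make` and the `LinearReward` wrapper referenced in the docstrings above; the wrapper's exact import path may differ between releases.

```python
# Usage sketch only: exercises the new "mo-ant-v5" registration and scalarizes it.
import numpy as np
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers import LinearReward  # older releases: from mo_gymnasium import LinearReward

# 3-objective ant: [x-velocity, y-velocity, control/contact cost]
env = mo_gym.make("mo-ant-v5")
obs, info = env.reset(seed=42)
obs, vec_reward, terminated, truncated, info = env.step(env.action_space.sample())
print(vec_reward.shape)  # (3,)

# Scalarize the vector reward back to a single objective with a fixed weight vector.
scalar_env = LinearReward(env, weight=np.array([1.0, 0.0, 0.1]))
obs, info = scalar_env.reset(seed=42)
obs, reward, terminated, truncated, info = scalar_env.step(scalar_env.action_space.sample())
print(float(reward))  # weighted sum of the three objectives
```

For the 2-objective variant registered as "mo-ant-2d-v5" (cost_objective=False), the weight vector would have only two components, matching the docstring example.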