From 67c50f365f2940aad5d04125f29faaa63dc07e33 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Tue, 21 May 2024 21:52:33 -0300 Subject: [PATCH 01/21] Fix wrapper imports --- mo_gymnasium/envs/mario/mario.py | 7 ++++--- mo_gymnasium/envs/reacher/reacher.py | 3 ++- mo_gymnasium/utils.py | 4 ++-- tests/test_envs.py | 12 ++++++------ 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/mo_gymnasium/envs/mario/mario.py b/mo_gymnasium/envs/mario/mario.py index b7279941..f2ba3b37 100644 --- a/mo_gymnasium/envs/mario/mario.py +++ b/mo_gymnasium/envs/mario/mario.py @@ -7,7 +7,8 @@ from gymnasium.utils import EzPickle, seeding # from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv -from gymnasium.wrappers import GrayScaleObservation, ResizeObservation +from gymnasium.wrappers import ResizeObservation +from gymnasium.wrappers.transform_observation import GrayscaleObservation from nes_py.nes_env import SCREEN_SHAPE_24_BIT import mo_gymnasium as mo_gym @@ -16,7 +17,7 @@ from mo_gymnasium.envs.mario.joypad_space import JoypadSpace -class MOSuperMarioBros(SuperMarioBrosEnv, EzPickle): +class MOSuperMarioBros(gym.Env, SuperMarioBrosEnv, EzPickle): """ ## Description Multi-objective version of the SuperMarioBro environment. @@ -206,7 +207,7 @@ def step(self, action): env = JoypadSpace(env, SIMPLE_MOVEMENT) # env = MaxAndSkipEnv(env, 4) env = ResizeObservation(env, (84, 84)) - env = GrayScaleObservation(env) + env = GrayscaleObservation(env) # env = FrameStack(env, 4) env = mo_gym.LinearReward(env) diff --git a/mo_gymnasium/envs/reacher/reacher.py b/mo_gymnasium/envs/reacher/reacher.py index f881f512..f6a69a4f 100644 --- a/mo_gymnasium/envs/reacher/reacher.py +++ b/mo_gymnasium/envs/reacher/reacher.py @@ -1,5 +1,6 @@ from typing import Optional +import gymnasium as gym import numpy as np from gymnasium import spaces from gymnasium.utils import EzPickle, seeding @@ -11,7 +12,7 @@ target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)])) -class ReacherBulletEnv(BaseBulletEnv, EzPickle): +class ReacherBulletEnv(gym.Env, BaseBulletEnv, EzPickle): metadata = {"render_modes": ["human", "rgb_array"]} def __init__( diff --git a/mo_gymnasium/utils.py b/mo_gymnasium/utils.py index def90471..31460a0d 100644 --- a/mo_gymnasium/utils.py +++ b/mo_gymnasium/utils.py @@ -7,8 +7,8 @@ import gymnasium as gym import numpy as np from gymnasium.vector import SyncVectorEnv -from gymnasium.wrappers.normalize import RunningMeanStd -from gymnasium.wrappers.record_episode_statistics import RecordEpisodeStatistics +from gymnasium.wrappers.common import RecordEpisodeStatistics +from gymnasium.wrappers.utils import RunningMeanStd ObsType = TypeVar("ObsType") diff --git a/tests/test_envs.py b/tests/test_envs.py index 28af4b0c..e3c82ef1 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -190,7 +190,7 @@ def test_ccs_dst(): np.array([19.778, -17.383]), ] - discounted_front = env.pareto_front(gamma=0.99) + discounted_front = env.unwrapped.pareto_front(gamma=0.99) for desired, actual in zip(known_ccs, discounted_front): np.testing.assert_array_almost_equal(desired, actual, decimal=2) @@ -200,7 +200,7 @@ def test_ccs_dst_no_discount(): known_ccs = mo_gym.envs.deep_sea_treasure.deep_sea_treasure.CONVEX_FRONT - discounted_front = env.pareto_front(gamma=1.0) + discounted_front = env.unwrapped.pareto_front(gamma=1.0) for desired, actual in zip(known_ccs, discounted_front): np.testing.assert_array_almost_equal(desired, actual, decimal=2) @@ -223,7 +223,7 @@ def 
test_concave_pf_dst(): np.array([124.0 * gamma**18, -17.383]), ] - discounted_front = env.pareto_front(gamma=0.99) + discounted_front = env.unwrapped.pareto_front(gamma=0.99) for desired, actual in zip(known_pf, discounted_front): np.testing.assert_array_almost_equal(desired, actual, decimal=2) @@ -233,7 +233,7 @@ def test_concave_pf_dst_no_discount(): known_pf = mo_gym.envs.deep_sea_treasure.deep_sea_treasure.CONCAVE_FRONT - discounted_front = env.pareto_front(gamma=1.0) + discounted_front = env.unwrapped.pareto_front(gamma=1.0) for desired, actual in zip(known_pf, discounted_front): np.testing.assert_array_almost_equal(desired, actual, decimal=2) @@ -244,7 +244,7 @@ def test_pf_fruit_tree(): known_pf = np.array(mo_gym.envs.fruit_tree.fruit_tree.FRUITS[str(depth)]) * (0.99 ** (depth - 1)) - discounted_front = env.pareto_front(gamma=0.99) + discounted_front = env.unwrapped.pareto_front(gamma=0.99) for desired, actual in zip(known_pf, discounted_front): np.testing.assert_array_almost_equal(desired, actual, decimal=2) @@ -255,6 +255,6 @@ def test_pf_fruit_tree_no_discount(): known_pf = mo_gym.envs.fruit_tree.fruit_tree.FRUITS[str(depth)] - discounted_front = env.pareto_front(gamma=1.0) + discounted_front = env.unwrapped.pareto_front(gamma=1.0) for desired, actual in zip(known_pf, discounted_front): np.testing.assert_array_almost_equal(desired, actual, decimal=2) From 6285bf4c47bed5d6144fcb579719aea235df1139 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Wed, 22 May 2024 11:11:57 -0300 Subject: [PATCH 02/21] Fix mo-reacher-v0 reset --- mo_gymnasium/envs/reacher/reacher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mo_gymnasium/envs/reacher/reacher.py b/mo_gymnasium/envs/reacher/reacher.py index f6a69a4f..13782b58 100644 --- a/mo_gymnasium/envs/reacher/reacher.py +++ b/mo_gymnasium/envs/reacher/reacher.py @@ -90,7 +90,7 @@ def reset(self, seed=None, **kwargs): self._seed(seed) if seed is not None: self._np_random, seed = seeding.np_random(seed) - obs = super().reset() + obs = self._reset() if self.render_mode == "human": self._render(mode="human") return obs, {} From 8096b2c7df397bb2c8693e8834c1be8632eb2348 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Wed, 22 May 2024 11:17:33 -0300 Subject: [PATCH 03/21] Bump LunarLander to v3 --- mo_gymnasium/envs/lunar_lander/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mo_gymnasium/envs/lunar_lander/__init__.py b/mo_gymnasium/envs/lunar_lander/__init__.py index d4435341..817671fb 100644 --- a/mo_gymnasium/envs/lunar_lander/__init__.py +++ b/mo_gymnasium/envs/lunar_lander/__init__.py @@ -2,13 +2,13 @@ register( - id="mo-lunar-lander-v2", + id="mo-lunar-lander-v3", entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander", max_episode_steps=1000, ) register( - id="mo-lunar-lander-continuous-v2", + id="mo-lunar-lander-continuous-v3", entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander", max_episode_steps=1000, kwargs={"continuous": True}, From a20e9e7024f4bc946279038c67a895f854c99685 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Wed, 22 May 2024 11:25:51 -0300 Subject: [PATCH 04/21] Mario subclass Env --- mo_gymnasium/envs/mario/mario.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mo_gymnasium/envs/mario/mario.py b/mo_gymnasium/envs/mario/mario.py index f2ba3b37..45924ed3 100644 --- a/mo_gymnasium/envs/mario/mario.py +++ b/mo_gymnasium/envs/mario/mario.py @@ -7,8 +7,6 @@ from gymnasium.utils import EzPickle, seeding # from 
stable_baselines3.common.atari_wrappers import MaxAndSkipEnv -from gymnasium.wrappers import ResizeObservation -from gymnasium.wrappers.transform_observation import GrayscaleObservation from nes_py.nes_env import SCREEN_SHAPE_24_BIT import mo_gymnasium as mo_gym @@ -17,7 +15,7 @@ from mo_gymnasium.envs.mario.joypad_space import JoypadSpace -class MOSuperMarioBros(gym.Env, SuperMarioBrosEnv, EzPickle): +class MOSuperMarioBros(SuperMarioBrosEnv, gym.Env, EzPickle): """ ## Description Multi-objective version of the SuperMarioBro environment. @@ -203,6 +201,9 @@ def step(self, action): if __name__ == "__main__": + from gymnasium.wrappers import ResizeObservation + from gymnasium.wrappers.transform_observation import GrayscaleObservation + env = MOSuperMarioBros() env = JoypadSpace(env, SIMPLE_MOVEMENT) # env = MaxAndSkipEnv(env, 4) From bf2dcc9ed5d66de1d963074d3e8945fb5efc88f6 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Wed, 22 May 2024 11:26:30 -0300 Subject: [PATCH 05/21] Skip highway tests --- tests/test_envs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index e3c82ef1..c2d395de 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -14,6 +14,8 @@ for env_spec in gym.envs.registry.values(): if type(env_spec.entry_point) is not str: continue + if "highway" in env_spec.entry_point: + continue # collect MO Gymnasium envs if env_spec.entry_point.split(".")[0] == "mo_gymnasium": all_testing_env_specs.append(env_spec) @@ -46,7 +48,7 @@ def test_all_env_passive_env_checker(spec): [ ("MountainCar-v0", "mo-mountaincar-v0"), ("MountainCarContinuous-v0", "mo-mountaincarcontinuous-v0"), - ("LunarLander-v2", "mo-lunar-lander-v2"), + ("LunarLander-v3", "mo-lunar-lander-v3"), # ("Reacher-v4", "mo-reacher-v4"), # use a different model and action space ("Hopper-v4", "mo-hopper-v4"), ("HalfCheetah-v4", "mo-halfcheetah-v4"), From efe8bb7ebebd87bd17531c4cea8fbdfc70d27c24 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 23 May 2024 13:35:40 +0200 Subject: [PATCH 06/21] Migrate wrappers --- README.md | 2 +- docs/wrappers/wrappers.md | 10 +- mo_gymnasium/__init__.py | 9 +- mo_gymnasium/utils.py | 344 +----------------------------- mo_gymnasium/wrappers/__init__.py | 9 + mo_gymnasium/wrappers/wrappers.py | 295 +++++++++++++++++++++++++ pyproject.toml | 2 +- tests/test_envs.py | 8 +- tests/test_vector_wrappers.py | 0 tests/test_wrappers.py | 73 ++----- 10 files changed, 334 insertions(+), 418 deletions(-) create mode 100644 mo_gymnasium/wrappers/__init__.py create mode 100644 mo_gymnasium/wrappers/wrappers.py create mode 100644 tests/test_vector_wrappers.py diff --git a/README.md b/README.md index fb5f7885..708bb3a6 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ obs, info = env.reset() next_obs, vector_reward, terminated, truncated, info = env.step(your_agent.act(obs)) # Optionally, you can scalarize the reward function with the LinearReward wrapper -env = mo_gym.LinearReward(env, weight=np.array([0.8, 0.2, 0.2])) +env = mo_gym.wrappers.LinearReward(env, weight=np.array([0.8, 0.2, 0.2])) ``` For details on multi-objective MDP's (MOMDP's) and other MORL definitions, see [A practical guide to multi-objective reinforcement learning and planning](https://link.springer.com/article/10.1007/s10458-022-09552-y). 
diff --git a/docs/wrappers/wrappers.md b/docs/wrappers/wrappers.md index 542e5cca..5d6bd55d 100644 --- a/docs/wrappers/wrappers.md +++ b/docs/wrappers/wrappers.md @@ -11,29 +11,29 @@ A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. ```{eval-rst} -.. autoclass:: mo_gymnasium.LinearReward +.. autoclass:: mo_gymnasium.wrappers.LinearReward ``` ## `MONormalizeReward` ```{eval-rst} -.. autoclass:: mo_gymnasium.MONormalizeReward +.. autoclass:: mo_gymnasium.wrappers.MONormalizeReward ``` ## `MOClipReward` ```{eval-rst} -.. autoclass:: mo_gymnasium.MOClipReward +.. autoclass:: mo_gymnasium.wrappers.MOClipReward ``` ## `MOSyncVectorEnv` ```{eval-rst} -.. autoclass:: mo_gymnasium.MOSyncVectorEnv +.. autoclass:: mo_gymnasium.wrappers.MOSyncVectorEnv ``` ## `MORecordEpisodeStatistics` ```{eval-rst} -.. autoclass:: mo_gymnasium.MORecordEpisodeStatistics +.. autoclass:: mo_gymnasium.wrappers.MORecordEpisodeStatistics ``` diff --git a/mo_gymnasium/__init__.py b/mo_gymnasium/__init__.py index 23201d0c..f49af753 100644 --- a/mo_gymnasium/__init__.py +++ b/mo_gymnasium/__init__.py @@ -4,14 +4,7 @@ import mo_gymnasium.envs # Utils -from mo_gymnasium.utils import ( - LinearReward, - MOClipReward, - MONormalizeReward, - MORecordEpisodeStatistics, - MOSyncVectorEnv, - make, -) +from mo_gymnasium.utils import make __version__ = "1.1.0" diff --git a/mo_gymnasium/utils.py b/mo_gymnasium/utils.py index 31460a0d..a5c41a14 100644 --- a/mo_gymnasium/utils.py +++ b/mo_gymnasium/utils.py @@ -1,14 +1,8 @@ -"""Utilities function such as wrappers.""" +"""Utilities functions.""" -import time -from copy import deepcopy -from typing import Iterator, Tuple, TypeVar +from typing import TypeVar import gymnasium as gym -import numpy as np -from gymnasium.vector import SyncVectorEnv -from gymnasium.wrappers.common import RecordEpisodeStatistics -from gymnasium.wrappers.utils import RunningMeanStd ObsType = TypeVar("ObsType") @@ -26,337 +20,3 @@ def make(env_name: str, disable_env_checker: bool = True, **kwargs) -> gym.Env: """ """Disable env checker, as it requires the reward to be a scalar.""" return gym.make(env_name, disable_env_checker=disable_env_checker, **kwargs) - - -class LinearReward(gym.Wrapper, gym.utils.RecordConstructorArgs): - """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector.""" - - def __init__(self, env: gym.Env, weight: np.ndarray = None): - """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector. - - Args: - env: env to wrap - weight: weight vector to use in the dot product - """ - gym.utils.RecordConstructorArgs.__init__(self, weight=weight) - gym.Wrapper.__init__(self, env) - if weight is None: - weight = np.ones(shape=env.unwrapped.reward_space.shape) - self.set_weight(weight) - - def set_weight(self, weight: np.ndarray): - """Changes weights for the scalarization. - - Args: - weight: new weights to set - Returns: nothing - """ - assert weight.shape == self.env.unwrapped.reward_space.shape, "Reward weight has different shape than reward vector." - self.w = weight - - def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]: - """Steps in the environment. 
- - Args: - action: action to perform - Returns: obs, scalarized_reward, terminated, truncated, info - """ - observation, reward, terminated, truncated, info = self.env.step(action) - scalar_reward = np.dot(reward, self.w) - info["vector_reward"] = reward - info["reward_weights"] = self.w - - return observation, scalar_reward, terminated, truncated, info - - -class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): - """Wrapper to normalize the reward component at index idx. Does not touch other reward components.""" - - def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): - """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance. - - Args: - env (env): The environment to apply the wrapper - idx (int): the index of the reward to normalize - epsilon (float): A stability parameter - gamma (float): The discount factor that is used in the exponential moving average. - """ - gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon) - gym.Wrapper.__init__(self, env) - self.idx = idx - self.num_envs = getattr(env, "num_envs", 1) - self.is_vector_env = getattr(env, "is_vector_env", False) - self.return_rms = RunningMeanStd(shape=()) - self.returns = np.zeros(self.num_envs) - self.gamma = gamma - self.epsilon = epsilon - - def step(self, action: ActType): - """Steps through the environment, normalizing the rewards returned. - - Args: - action: action to perform - Returns: obs, normalized_rewards, terminated, truncated, infos - """ - obs, rews, terminated, truncated, infos = self.env.step(action) - # Extracts the objective value to normalize - to_normalize = rews[self.idx] - if not self.is_vector_env: - to_normalize = np.array([to_normalize]) - self.returns = self.returns * self.gamma + to_normalize - # Defer normalization to gym implementation - to_normalize = self.normalize(to_normalize) - self.returns[terminated] = 0.0 - if not self.is_vector_env: - to_normalize = to_normalize[0] - # Injecting the normalized objective value back into the reward vector - rews[self.idx] = to_normalize - return obs, rews, terminated, truncated, infos - - def normalize(self, rews): - """Normalizes the rewards with the running mean rewards and their variance. - - Args: - rews: rewards - Returns: the normalized reward - """ - self.return_rms.update(self.returns) - return rews / np.sqrt(self.return_rms.var + self.epsilon) - - -class MOClipReward(gym.RewardWrapper, gym.utils.RecordConstructorArgs): - """Clip reward[idx] to [min, max].""" - - def __init__(self, env: gym.Env, idx: int, min_r, max_r): - """Clip reward[idx] to [min, max]. - - Args: - env: environment to wrap - idx: index of the MO reward to clip - min_r: min reward - max_r: max reward - """ - gym.utils.RecordConstructorArgs.__init__(self, idx=idx, min_r=min_r, max_r=max_r) - gym.RewardWrapper.__init__(self, env) - self.idx = idx - self.min_r = min_r - self.max_r = max_r - - def reward(self, reward): - """Clips the reward at the given index. - - Args: - reward: reward to clip. - Returns: the clipped reward. - """ - reward[self.idx] = np.clip(reward[self.idx], self.min_r, self.max_r) - return reward - - -class MOSyncVectorEnv(SyncVectorEnv): - """Vectorized environment that serially runs multiple environments.""" - - def __init__( - self, - env_fns: Iterator[callable], - copy: bool = True, - ): - """Vectorized environment that serially runs multiple environments. 
- - Args: - env_fns: env constructors - copy: If ``True``, then the :meth:`reset` and :meth:`step` methods return a copy of the observations. - """ - SyncVectorEnv.__init__(self, env_fns, copy=copy) - # Just overrides the rewards memory to add the number of objectives - self.reward_space = self.envs[0].unwrapped.reward_space - self._rewards = np.zeros( - ( - self.num_envs, - self.reward_space.shape[0], - ), - dtype=np.float64, - ) - - -class MORecordEpisodeStatistics(RecordEpisodeStatistics, gym.utils.RecordConstructorArgs): - """This wrapper will keep track of cumulative rewards and episode lengths. - - After the completion of an episode, ``info`` will look like this:: - - >>> info = { - ... "episode": { - ... "r": "", - ... "dr": "", - ... "l": "", # contrary to Gymnasium, these are not a numpy array - ... "t": "" - ... }, - ... } - - For a vectorized environments the output will be in the form of (be careful to first wrap the env into vector before applying MORewordStatistics):: - - >>> infos = { - ... "final_observation": "", - ... "_final_observation": "", - ... "final_info": "", - ... "_final_info": "", - ... "episode": { - ... "r": "", - ... "dr": "", - ... "l": "", - ... "t": "" - ... }, - ... "_episode": "" - ... } - """ - - def __init__(self, env: gym.Env, gamma: float = 1.0, deque_size: int = 100): - """This wrapper will keep track of cumulative rewards and episode lengths. - - Args: - env (Env): The environment to apply the wrapper - gamma (float): Discounting factor - deque_size: The size of the buffers :attr:`return_queue` and :attr:`length_queue` - """ - gym.utils.RecordConstructorArgs.__init__(self, gamma=gamma, deque_size=deque_size) - RecordEpisodeStatistics.__init__(self, env, deque_size=deque_size) - # CHANGE: Here we just override the standard implementation to extend to MO - # We also take care of the case where the env is vectorized - self.reward_dim = self.env.unwrapped.reward_space.shape[0] - if self.is_vector_env: - self.rewards_shape = (self.num_envs, self.reward_dim) - else: - self.rewards_shape = (self.reward_dim,) - self.gamma = gamma - - def reset(self, **kwargs): - """Resets the environment using kwargs and resets the episode returns and lengths.""" - obs, info = super().reset(**kwargs) - - # CHANGE: Here we just override the standard implementation to extend to MO - self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) - self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) - - return obs, info - - def step(self, action): - """Steps through the environment, recording the episode statistics.""" - # This is very close the code from the RecordEpisodeStatistics wrapper from gym. - ( - observations, - rewards, - terminations, - truncations, - infos, - ) = self.env.step(action) - assert isinstance( - infos, dict - ), f"`info` dtype is {type(infos)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order." 
- self.episode_returns += rewards - self.episode_lengths += 1 - - # CHANGE: The discounted returns are also computed here - self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( - self.episode_returns.shape - ) - - dones = np.logical_or(terminations, truncations) - num_dones = np.sum(dones) - if num_dones: - if "episode" in infos or "_episode" in infos: - raise ValueError("Attempted to add episode stats when they already exist") - else: - episode_return = np.zeros(self.rewards_shape, dtype=np.float32) - disc_episode_return = np.zeros(self.rewards_shape, dtype=np.float32) - if self.is_vector_env: - for i in range(self.num_envs): - if dones[i]: - # CHANGE: Makes a deepcopy to avoid subsequent mutations - episode_return[i] = deepcopy(self.episode_returns[i]) - disc_episode_return[i] = deepcopy(self.disc_episode_returns[i]) - else: - episode_return = deepcopy(self.episode_returns) - disc_episode_return = deepcopy(self.disc_episode_returns) - - length_eps = np.where(dones, self.episode_lengths, 0) - time_eps = np.where( - dones, - np.round(time.perf_counter() - self.episode_start_times, 6), - 0.0, - ) - - infos["episode"] = { - "r": episode_return, - "dr": disc_episode_return, - "l": length_eps[0] if not self.is_vector_env else length_eps, - "t": time_eps[0] if not self.is_vector_env else time_eps, - } - if self.is_vector_env: - infos["_episode"] = np.where(dones, True, False) - self.return_queue.extend(self.episode_returns[dones]) - self.length_queue.extend(self.episode_lengths[dones]) - self.episode_count += num_dones - self.episode_lengths[dones] = 0 - self.episode_returns[dones] = np.zeros(self.reward_dim, dtype=np.float32) - self.disc_episode_returns[dones] = np.zeros(self.reward_dim, dtype=np.float32) - self.episode_start_times[dones] = time.perf_counter() - return ( - observations, - rewards, - terminations, - truncations, - infos, - ) - - -class MOMaxAndSkipObservation(gym.Wrapper): - """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last observations. - - Note: This wrapper is based on the wrapper from stable-baselines3: https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv - """ - - def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4): - """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames. - - Args: - env (Env): The environment to apply the wrapper - skip: The number of frames to skip - """ - gym.Wrapper.__init__(self, env) - - if not np.issubdtype(type(skip), np.integer): - raise TypeError(f"The skip is expected to be an integer, actual type: {type(skip)}") - if skip < 2: - raise ValueError(f"The skip value needs to be equal or greater than two, actual value: {skip}") - if env.observation_space.shape is None: - raise ValueError("The observation space must have the shape attribute.") - - self._skip = skip - self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype) - - def step(self, action): - """Step the environment with the given action for ``skip`` steps. - - Repeat action, sum reward, and max over last observations. 
- - Args: - action: The action to step through the environment with - Returns: - Max of the last two observations, reward, terminated, truncated, and info from the environment - """ - total_reward = np.zeros(self.env.unwrapped.reward_dim, dtype=np.float32) - terminated = truncated = False - info = {} - for i in range(self._skip): - obs, reward, terminated, truncated, info = self.env.step(action) - done = terminated or truncated - if i == self._skip - 2: - self._obs_buffer[0] = obs - if i == self._skip - 1: - self._obs_buffer[1] = obs - total_reward += reward - if done: - break - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, terminated, truncated, info diff --git a/mo_gymnasium/wrappers/__init__.py b/mo_gymnasium/wrappers/__init__.py new file mode 100644 index 00000000..e77291a4 --- /dev/null +++ b/mo_gymnasium/wrappers/__init__.py @@ -0,0 +1,9 @@ +"""Contains all wrappers (vectors or not).""" + +from mo_gymnasium.wrappers.wrappers import ( + LinearReward, + MOClipReward, + MOMaxAndSkipObservation, + MONormalizeReward, + MORecordEpisodeStatistics, +) diff --git a/mo_gymnasium/wrappers/wrappers.py b/mo_gymnasium/wrappers/wrappers.py new file mode 100644 index 00000000..7e4071f3 --- /dev/null +++ b/mo_gymnasium/wrappers/wrappers.py @@ -0,0 +1,295 @@ +"""Wrappers.""" + +import time +from copy import deepcopy +from typing import Tuple, TypeVar + +import gymnasium as gym +import numpy as np +from gymnasium.wrappers.common import RecordEpisodeStatistics +from gymnasium.wrappers.utils import RunningMeanStd + + +ObsType = TypeVar("ObsType") +ActType = TypeVar("ActType") + + +class LinearReward(gym.Wrapper, gym.utils.RecordConstructorArgs): + """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector.""" + + def __init__(self, env: gym.Env, weight: np.ndarray = None): + """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector. + + Args: + env: env to wrap + weight: weight vector to use in the dot product + """ + gym.utils.RecordConstructorArgs.__init__(self, weight=weight) + gym.Wrapper.__init__(self, env) + if weight is None: + weight = np.ones(shape=env.unwrapped.reward_space.shape) + self.set_weight(weight) + + def set_weight(self, weight: np.ndarray): + """Changes weights for the scalarization. + + Args: + weight: new weights to set + Returns: nothing + """ + assert weight.shape == self.env.unwrapped.reward_space.shape, "Reward weight has different shape than reward vector." + self.w = weight + + def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]: + """Steps in the environment. + + Args: + action: action to perform + Returns: obs, scalarized_reward, terminated, truncated, info + """ + observation, reward, terminated, truncated, info = self.env.step(action) + scalar_reward = np.dot(reward, self.w) + info["vector_reward"] = reward + info["reward_weights"] = self.w + + return observation, scalar_reward, terminated, truncated, info + + +class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): + """Wrapper to normalize the reward component at index idx. Does not touch other reward components. + + This code is heavily inspired on Gymnasium's except that it extracts the reward component at given idx, normalizes it, and reinjects it. + """ + + def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): + """This wrapper will normalize immediate rewards s.t. 
their exponential moving average has a fixed variance. + + Args: + env (env): The environment to apply the wrapper + idx (int): the index of the reward to normalize + epsilon (float): A stability parameter + gamma (float): The discount factor that is used in the exponential moving average. + """ + gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon) + gym.Wrapper.__init__(self, env) + self.idx = idx + self.return_rms = RunningMeanStd(shape=()) + self.discounted_reward: np.array = np.array([0.0]) + self.gamma = gamma + self.epsilon = epsilon + self._update_running_mean = True + + @property + def update_running_mean(self) -> bool: + """Property to freeze/continue the running mean calculation of the reward statistics.""" + return self._update_running_mean + + @update_running_mean.setter + def update_running_mean(self, setting: bool): + """Sets the property to freeze/continue the running mean calculation of the reward statistics.""" + self._update_running_mean = setting + + def step(self, action: ActType): + """Steps through the environment, normalizing the rewards returned. + + Args: + action: action to perform + Returns: obs, normalized_rewards, terminated, truncated, infos + """ + obs, rews, terminated, truncated, infos = self.env.step(action) + # Extracts the objective value to normalize + print("Normalizing reward at index", self.idx) + to_normalize = rews[self.idx] + + self.discounted_reward = self.discounted_reward * self.gamma * (1 - terminated) + float(to_normalize) + if self._update_running_mean: + self.return_rms.update(self.discounted_reward) + + # We don't (reward - self.return_rms.mean) see https://github.com/openai/baselines/issues/538 + normalized_reward = to_normalize / np.sqrt(self.return_rms.var + self.epsilon) + + # Injecting the normalized objective value back into the reward vector + rews[self.idx] = normalized_reward + return obs, rews, terminated, truncated, infos + + +class MOClipReward(gym.RewardWrapper, gym.utils.RecordConstructorArgs): + """Clip reward[idx] to [min, max].""" + + def __init__(self, env: gym.Env, idx: int, min_r, max_r): + """Clip reward[idx] to [min, max]. + + Args: + env: environment to wrap + idx: index of the MO reward to clip + min_r: min reward + max_r: max reward + """ + gym.utils.RecordConstructorArgs.__init__(self, idx=idx, min_r=min_r, max_r=max_r) + gym.RewardWrapper.__init__(self, env) + self.idx = idx + self.min_r = min_r + self.max_r = max_r + + def reward(self, reward): + """Clips the reward at the given index. + + Args: + reward: reward to clip. + Returns: the clipped reward. + """ + reward[self.idx] = np.clip(reward[self.idx], self.min_r, self.max_r) + return reward + + +class MORecordEpisodeStatistics(RecordEpisodeStatistics, gym.utils.RecordConstructorArgs): + """This wrapper will keep track of cumulative rewards and episode lengths. + + After the completion of an episode, ``info`` will look like this:: + + >>> info = { + ... "episode": { + ... "r": "", + ... "dr": "", + ... "l": "", + ... "t": "" + ... }, + ... } + """ + + def __init__( + self, + env: gym.Env, + gamma: float = 1.0, + buffer_length: int = 100, + stats_key: str = "episode", + ): + """This wrapper will keep track of cumulative rewards and episode lengths. 
+ + Args: + env (Env): The environment to apply the wrapper + gamma (float): Discounting factor + buffer_length: The size of the buffers :attr:`return_queue`, :attr:`length_queue` and :attr:`time_queue` + stats_key: The info key for the episode statistics + """ + gym.utils.RecordConstructorArgs.__init__(self, gamma=gamma, buffer_length=buffer_length, stats_key=stats_key) + RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key) + # CHANGE: Here we just override the standard implementation to extend to MO + self.reward_dim = self.env.unwrapped.reward_space.shape[0] + self.rewards_shape = (self.reward_dim,) + self.gamma = gamma + + def step(self, action): + """Steps through the environment, recording the episode statistics.""" + # This is very close the code from the RecordEpisodeStatistics wrapper from Gymnasium. + ( + observation, + rewards, + terminated, + truncated, + info, + ) = self.env.step(action) + assert isinstance( + info, dict + ), f"`info` dtype is {type(info)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order." + self.episode_returns += rewards + self.episode_lengths += 1 + + # CHANGE: The discounted returns are also computed here + self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( + self.episode_returns.shape + ) + + if terminated or truncated: + assert self._stats_key not in info + + episode_time_length = round(time.perf_counter() - self.episode_start_time, 6) + + # Make a deepcopy to void subsequent mutation of the numpy array + episode_returns = deepcopy(self.episode_returns) + disc_episode_returns = deepcopy(self.disc_episode_returns) + + info["episode"] = { + "r": episode_returns, + "dr": disc_episode_returns, + "l": self.episode_lengths, + "t": episode_time_length, + } + + self.time_queue.append(episode_time_length) + self.return_queue.append(episode_returns) + self.length_queue.append(self.episode_lengths) + + self.episode_count += 1 + self.episode_start_time = time.perf_counter() + + return ( + observation, + rewards, + terminated, + truncated, + info, + ) + + def reset(self, **kwargs): + """Resets the environment using kwargs and resets the episode returns and lengths.""" + obs, info = super().reset(**kwargs) + + # CHANGE: Here we just override the standard implementation to extend to MO + self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + + return obs, info + + +class MOMaxAndSkipObservation(gym.Wrapper): + """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last observations. + + Note: This wrapper is based on the wrapper from stable-baselines3: https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv + """ + + def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4): + """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames. 
+ + Args: + env (Env): The environment to apply the wrapper + skip: The number of frames to skip + """ + gym.Wrapper.__init__(self, env) + + if not np.issubdtype(type(skip), np.integer): + raise TypeError(f"The skip is expected to be an integer, actual type: {type(skip)}") + if skip < 2: + raise ValueError(f"The skip value needs to be equal or greater than two, actual value: {skip}") + if env.observation_space.shape is None: + raise ValueError("The observation space must have the shape attribute.") + + self._skip = skip + self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype) + + def step(self, action): + """Step the environment with the given action for ``skip`` steps. + + Repeat action, sum reward, and max over last observations. + + Args: + action: The action to step through the environment with + Returns: + Max of the last two observations, reward, terminated, truncated, and info from the environment + """ + total_reward = np.zeros(self.env.unwrapped.reward_dim, dtype=np.float32) + terminated = truncated = False + info = {} + for i in range(self._skip): + obs, reward, terminated, truncated, info = self.env.step(action) + done = terminated or truncated + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, terminated, truncated, info diff --git a/pyproject.toml b/pyproject.toml index 160f2b53..840ad0cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ 'Topic :: Scientific/Engineering :: Artificial Intelligence', ] dependencies = [ - "gymnasium>=0.28.1,<0.30", + "gymnasium>=0.28.1", "numpy >=1.21.0", "pygame >=2.1.0", "scipy >=1.7.3", diff --git a/tests/test_envs.py b/tests/test_envs.py index c2d395de..16180c8b 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -29,7 +29,7 @@ def test_all_env_api(spec): """Check that all environments pass the environment checker.""" env = mo_gym.make(spec.id) - env = mo_gym.LinearReward(env) + env = mo_gym.wrappers.LinearReward(env) check_env(env, skip_render_check=True) _test_reward_bounds(env.unwrapped) _test_pickle_env(env) @@ -60,7 +60,7 @@ def test_all_env_passive_env_checker(spec): ) def test_gymnasium_equivalence(gym_id, mo_gym_id, num_steps=100, seed=123): env = gym.make(gym_id) - mo_env = mo_gym.LinearReward(mo_gym.make(mo_gym_id)) + mo_env = mo_gym.wrappers.LinearReward(mo_gym.make(mo_gym_id)) # for float rewards, then precision becomes an issue env = gym.wrappers.TransformReward(env, lambda reward: round(reward, 4)) @@ -95,8 +95,8 @@ def test_env_determinism_rollout(env_spec: EnvSpec): env_1 = mo_gym.make(env_spec.id) env_2 = mo_gym.make(env_spec.id) - env_1 = mo_gym.LinearReward(env_1) - env_2 = mo_gym.LinearReward(env_2) + env_1 = mo_gym.wrappers.LinearReward(env_1) + env_2 = mo_gym.wrappers.LinearReward(env_2) initial_obs_1, initial_info_1 = env_1.reset(seed=SEED) initial_obs_2, initial_info_2 = env_2.reset(seed=SEED) diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index 9cf42354..b8bc2697 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -1,11 +1,10 @@ import numpy as np import mo_gymnasium as mo_gym -from mo_gymnasium import ( +from mo_gymnasium.wrappers import ( MOClipReward, MONormalizeReward, MORecordEpisodeStatistics, - 
MOSyncVectorEnv, ) @@ -25,24 +24,31 @@ def test_normalization_wrapper(): norm_treasure_env = MONormalizeReward(env, idx=0) both_norm_env = MONormalizeReward(norm_treasure_env, idx=1) + # No normalization + env.reset(seed=0) + _, rewards, _, _, _ = env.step(1) + np.testing.assert_allclose(rewards, [0.7, -1.0], rtol=0, atol=1e-2) + # Tests for both rewards normalized for i in range(30): go_to_8_3(both_norm_env) - both_norm_env.reset() + both_norm_env.reset(seed=0) _, rewards, _, _, _ = both_norm_env.step(1) # down - np.testing.assert_allclose(rewards, [0.18, -1.24], rtol=0, atol=1e-2) + np.testing.assert_allclose( + rewards, [0.49, -1.24], rtol=0, atol=1e-2 + ) # TODO PR check why we had to change those values @Mark? rewards, _ = go_to_8_3(both_norm_env) - np.testing.assert_allclose(rewards, [2.13, -1.24], rtol=0, atol=1e-2) + np.testing.assert_allclose(rewards, [4.73, -1.24], rtol=0, atol=1e-2) # Tests for only treasure normalized for i in range(30): go_to_8_3(norm_treasure_env) - norm_treasure_env.reset() + norm_treasure_env.reset(seed=0) _, rewards, _, _, _ = norm_treasure_env.step(1) # down # Time rewards are not normalized (-1) - np.testing.assert_allclose(rewards, [0.18, -1.0], rtol=0, atol=1e-2) + np.testing.assert_allclose(rewards, [0.51, -1.0], rtol=0, atol=1e-2) rewards, _ = go_to_8_3(norm_treasure_env) - np.testing.assert_allclose(rewards, [2.13, -1.0], rtol=0, atol=1e-2) + np.testing.assert_allclose(rewards, [5.33, -1.0], rtol=0, atol=1e-2) def test_clip_wrapper(): @@ -66,26 +72,6 @@ def test_clip_wrapper(): np.testing.assert_allclose(rewards, [0.5, -1.0], rtol=0, atol=1e-2) -def test_mo_sync_wrapper(): - def make_env(env_id): - def thunk(): - env = mo_gym.make(env_id) - env = MORecordEpisodeStatistics(env, gamma=0.97) - return env - - return thunk - - num_envs = 3 - envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)]) - - envs.reset() - obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample()) - assert len(obs) == num_envs, "Number of observations do not match the number of envs" - assert len(rewards) == num_envs, "Number of rewards do not match the number of envs" - assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs" - assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs" - - def test_mo_record_ep_statistic(): env = mo_gym.make("deep-sea-treasure-v0") env = MORecordEpisodeStatistics(env, gamma=0.97) @@ -102,33 +88,6 @@ def test_mo_record_ep_statistic(): np.float32(7.48), np.float32(-2.82), ) - assert isinstance(info["episode"]["l"], np.int32) + assert isinstance(info["episode"]["l"], int) assert info["episode"]["l"] == 3 - assert isinstance(info["episode"]["t"], np.float32) - - -def test_mo_record_ep_statistic_vector_env(): - def make_env(env_id): - def thunk(): - env = mo_gym.make(env_id) - return env - - return thunk - - num_envs = 3 - envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)]) - envs = MORecordEpisodeStatistics(envs) - - envs.reset() - terminateds = np.array([False] * num_envs) - info = {} - while not np.any(terminateds): - obs, rewards, terminateds, _, info = envs.step(envs.action_space.sample()) - - assert isinstance(info["episode"]["r"], np.ndarray) - assert isinstance(info["episode"]["dr"], np.ndarray) - # Episode records are vectorized because multiple environments - assert info["episode"]["r"].shape == (num_envs, 2) - assert info["episode"]["dr"].shape == (num_envs, 2) - assert 
isinstance(info["episode"]["l"], np.ndarray) - assert isinstance(info["episode"]["t"], np.ndarray) + assert isinstance(info["episode"]["t"], float) From b2f2b53fb941c6523f1bd2c2fa7a9fc251bd18e3 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 23 May 2024 14:51:59 +0200 Subject: [PATCH 07/21] WIP --- mo_gymnasium/wrappers/vector/__init__.py | 6 + mo_gymnasium/wrappers/vector/wrappers.py | 252 +++++++++++++++++++++++ tests/test_vector_wrappers.py | 57 +++++ 3 files changed, 315 insertions(+) create mode 100644 mo_gymnasium/wrappers/vector/__init__.py create mode 100644 mo_gymnasium/wrappers/vector/wrappers.py diff --git a/mo_gymnasium/wrappers/vector/__init__.py b/mo_gymnasium/wrappers/vector/__init__.py new file mode 100644 index 00000000..60225b17 --- /dev/null +++ b/mo_gymnasium/wrappers/vector/__init__.py @@ -0,0 +1,6 @@ +"""Vector wrappers.""" + +from mo_gymnasium.wrappers.vector.wrappers import ( + MORecordEpisodeStatistics, + MOSyncVectorEnv, +) diff --git a/mo_gymnasium/wrappers/vector/wrappers.py b/mo_gymnasium/wrappers/vector/wrappers.py new file mode 100644 index 00000000..369053db --- /dev/null +++ b/mo_gymnasium/wrappers/vector/wrappers.py @@ -0,0 +1,252 @@ +"""Vector wrappers.""" +import time +from copy import deepcopy +from typing import Any, Iterator + +import gymnasium as gym +import numpy as np +from gymnasium.core import ActType, ObsType +from gymnasium.vector import SyncVectorEnv +from gymnasium.vector.utils import concatenate, iterate +from gymnasium.vector.vector_env import ArrayType, VectorEnv +from gymnasium.wrappers.vector import RecordEpisodeStatistics + + +# class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): +# """Wrapper to normalize the reward component at index idx. Does not touch other reward components.""" +# +# def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): +# """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance. +# +# Args: +# env (env): The environment to apply the wrapper +# idx (int): the index of the reward to normalize +# epsilon (float): A stability parameter +# gamma (float): The discount factor that is used in the exponential moving average. +# """ +# gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon) +# gym.Wrapper.__init__(self, env) +# self.idx = idx +# self.num_envs = getattr(env, "num_envs", 1) +# self.is_vector_env = getattr(env, "is_vector_env", False) +# self.return_rms = RunningMeanStd(shape=()) +# self.returns = np.zeros(self.num_envs) +# self.gamma = gamma +# self.epsilon = epsilon +# +# def step(self, action: ActType): +# """Steps through the environment, normalizing the rewards returned. 
+# +# Args: +# action: action to perform +# Returns: obs, normalized_rewards, terminated, truncated, infos +# """ +# obs, rews, terminated, truncated, infos = self.env.step(action) +# # Extracts the objective value to normalize +# to_normalize = rews[self.idx] +# if not self.is_vector_env: +# to_normalize = np.array([to_normalize]) +# self.returns = self.returns * self.gamma + to_normalize +# # Defer normalization to gym implementation +# to_normalize = self.normalize(to_normalize) +# self.returns[terminated] = 0.0 +# if not self.is_vector_env: +# to_normalize = to_normalize[0] +# # Injecting the normalized objective value back into the reward vector +# rews[self.idx] = to_normalize +# return obs, rews, terminated, truncated, infos +# +# def normalize(self, rews): +# """Normalizes the rewards with the running mean rewards and their variance. +# +# Args: +# rews: rewards +# Returns: the normalized reward +# """ +# self.return_rms.update(self.returns) +# return rews / np.sqrt(self.return_rms.var + self.epsilon) +# +# +class MOSyncVectorEnv(SyncVectorEnv): + """Vectorized environment that serially runs multiple environments.""" + + def __init__( + self, + env_fns: Iterator[callable], + copy: bool = True, + ): + """Vectorized environment that serially runs multiple environments. + + Args: + env_fns: env constructors + copy: If ``True``, then the :meth:`reset` and :meth:`step` methods return a copy of the observations. + """ + SyncVectorEnv.__init__(self, env_fns, copy=copy) + # Just overrides the rewards memory to add the number of objectives + self.reward_space = self.envs[0].unwrapped.reward_space + self._rewards = np.zeros( + ( + self.num_envs, + self.reward_space.shape[0], + ), + dtype=np.float32, + ) + + def step(self, actions: ActType) -> tuple[ObsType, ArrayType, ArrayType, ArrayType, dict[str, Any]]: + """Steps through each of the environments returning the batched results. + + Returns: + The batched environment step results + """ + actions = iterate(self.action_space, actions) + + observations, infos = [], {} + for i, action in enumerate(actions): + if self._autoreset_envs[i]: + env_obs, env_info = self.envs[i].reset() + + self._rewards[i] = np.zeros(self.reward_space.shape[0]) # This overrides Gymnasium's implem + self._terminations[i] = False + self._truncations[i] = False + else: + (env_obs, self._rewards[i], self._terminations[i], self._truncations[i], env_info,) = self.envs[ + i + ].step(action) + + observations.append(env_obs) + infos = self._add_info(infos, env_info, i) + + # Concatenate the observations + self._observations = concatenate(self.single_observation_space, observations, self._observations) + self._autoreset_envs = np.logical_or(self._terminations, self._truncations) + + return ( + deepcopy(self._observations) if self.copy else self._observations, + np.copy(self._rewards), + np.copy(self._terminations), + np.copy(self._truncations), + infos, + ) + + +class MORecordEpisodeStatistics(RecordEpisodeStatistics): + """This wrapper will keep track of cumulative rewards and episode lengths. + + At the end of any episode within the vectorized env, the statistics of the episode + will be added to ``info`` using the key ``episode``, and the ``_episode`` key + is used to indicate the environment index which has a terminated or truncated episode. + + For a vectorized environments the output will be in the form of (be careful to first wrap the env into vector before applying MORewordStatistics):: + + >>> infos = { # doctest: +SKIP + ... "episode": { + ... "r": "", + ... 
"dr": "", + ... "l": "", + ... "t": "" + ... }, + ... "_episode": "" + ... } + + Moreover, the most recent rewards and episode lengths are stored in buffers that can be accessed via + :attr:`wrapped_env.return_queue` and :attr:`wrapped_env.length_queue` respectively. + + Attributes: + return_queue: The cumulative rewards of the last ``deque_size``-many episodes + length_queue: The lengths of the last ``deque_size``-many episodes + """ + + def __init__( + self, + env: VectorEnv, + gamma: float = 1.0, + buffer_length: int = 100, + stats_key: str = "episode", + ): + """This wrapper will keep track of cumulative rewards and episode lengths. + + Args: + env (Env): The environment to apply the wrapper + gamma: The discount factor + buffer_length: The size of the buffers :attr:`return_queue`, :attr:`length_queue` and :attr:`time_queue` + stats_key: The info key to save the data + """ + gym.utils.RecordConstructorArgs.__init__(self, buffer_length=buffer_length, stats_key=stats_key) + RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key) + self.reward_dim = self.env.unwrapped.reward_space.shape[0] + self.rewards_shape = (self.num_envs, self.reward_dim) + self.gamma = gamma + + def reset(self, **kwargs): + """Resets the environment using kwargs and resets the episode returns and lengths.""" + obs, info = super().reset(**kwargs) + + # CHANGE: Here we just override the standard implementation to extend to MO + self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + + return obs, info + + def step(self, actions: ActType) -> tuple[ObsType, ArrayType, ArrayType, ArrayType, dict]: + """Steps through the environment, recording the episode statistics.""" + ( + observations, + rewards, + terminations, + truncations, + infos, + ) = self.env.step(actions) + + assert isinstance( + infos, dict + ), f"`vector.RecordEpisodeStatistics` requires `info` type to be `dict`, its actual type is {type(infos)}. This may be due to usage of other wrappers in the wrong order." 
+ + self.episode_returns[self.prev_dones] = 0 + self.episode_lengths[self.prev_dones] = 0 + self.episode_start_times[self.prev_dones] = time.perf_counter() + self.episode_returns[~self.prev_dones] += rewards[~self.prev_dones] + self.episode_lengths[~self.prev_dones] += 1 + + # CHANGE: The discounted returns are also computed here + self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( + self.episode_returns.shape + ) + + self.prev_dones = dones = np.logical_or(terminations, truncations) + num_dones = np.sum(dones) + if num_dones: + if self._stats_key in infos or f"_{self._stats_key}" in infos: + raise ValueError(f"Attempted to add episode stats when they already exist, info keys: {list(infos.keys())}") + else: + # CHANGE to handle the vectorial reward and do deepcopies + episode_return = np.zeros(self.rewards_shape, dtype=np.float32) + disc_episode_return = np.zeros(self.rewards_shape, dtype=np.float32) + + for i in range(self.num_envs): + if dones[i]: + episode_return[i] = np.copy(self.episode_returns[i]) + disc_episode_return[i] = np.copy(self.disc_episode_returns[i]) + + episode_time_length = np.round(time.perf_counter() - self.episode_start_times, 6) + infos[self._stats_key] = { + "r": np.where(dones, self.episode_returns, np.zeros(self.rewards_shape, dtype=np.float32)), + "dr": np.where(dones, self.disc_episode_returns, np.zeros(self.rewards_shape, dtype=np.float32)), + "l": np.where(dones, self.episode_lengths, 0), + "t": np.where(dones, episode_time_length, 0.0), + } + infos[f"_{self._stats_key}"] = dones + + self.episode_count += num_dones + + for i in np.where(dones): + self.time_queue.extend(episode_time_length[i]) + self.return_queue.extend(self.episode_returns[i]) + self.length_queue.extend(self.episode_lengths[i]) + + return ( + observations, + rewards, + terminations, + truncations, + infos, + ) diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py index e69de29b..e42105af 100644 --- a/tests/test_vector_wrappers.py +++ b/tests/test_vector_wrappers.py @@ -0,0 +1,57 @@ +import numpy as np + +import mo_gymnasium as mo_gym +from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv + + +def test_mo_sync_wrapper(): + def make_env(env_id): + def thunk(): + env = mo_gym.make(env_id) + env = MORecordEpisodeStatistics(env, gamma=0.97) + return env + + return thunk + + num_envs = 3 + envs = MOSyncVectorEnv( + [ + lambda: make_env("deep-sea-treasure-v0"), + lambda: make_env("deep-sea-treasure-v0"), + lambda: make_env("deep-sea-treasure-v0"), + ] + ) + + envs.reset() + obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample()) + assert len(obs) == num_envs, "Number of observations do not match the number of envs" + assert len(rewards) == num_envs, "Number of rewards do not match the number of envs" + assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs" + assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs" + + +def test_mo_record_ep_statistic_vector_env(): + def make_env(env_id): + def thunk(): + env = mo_gym.make(env_id) + return env + + return thunk + + num_envs = 3 + envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)]) + envs = MORecordEpisodeStatistics(envs) + + envs.reset() + terminateds = np.array([False] * num_envs) + info = {} + while not np.any(terminateds): + obs, rewards, terminateds, _, info = envs.step(envs.action_space.sample()) + + assert 
isinstance(info["episode"]["r"], np.ndarray) + assert isinstance(info["episode"]["dr"], np.ndarray) + # Episode records are vectorized because multiple environments + assert info["episode"]["r"].shape == (num_envs, 2) + assert info["episode"]["dr"].shape == (num_envs, 2) + assert isinstance(info["episode"]["l"], np.ndarray) + assert isinstance(info["episode"]["t"], np.ndarray) From 062849aee21da0940e4893053a0e768b1f1a1a12 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 23 May 2024 14:53:12 +0200 Subject: [PATCH 08/21] Rollback Vector env contstructor --- tests/test_vector_wrappers.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py index e42105af..4e574869 100644 --- a/tests/test_vector_wrappers.py +++ b/tests/test_vector_wrappers.py @@ -1,7 +1,10 @@ import numpy as np import mo_gymnasium as mo_gym -from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv +from mo_gymnasium.wrappers.vector import ( + MORecordEpisodeStatistics, + MOSyncVectorEnv, +) def test_mo_sync_wrapper(): @@ -14,13 +17,7 @@ def thunk(): return thunk num_envs = 3 - envs = MOSyncVectorEnv( - [ - lambda: make_env("deep-sea-treasure-v0"), - lambda: make_env("deep-sea-treasure-v0"), - lambda: make_env("deep-sea-treasure-v0"), - ] - ) + envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)]) envs.reset() obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample()) From 7fbaf386e79ec55f960c0857d98197781d33f7b4 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 23 May 2024 15:04:13 +0200 Subject: [PATCH 09/21] Tests are passing --- mo_gymnasium/wrappers/vector/wrappers.py | 10 +++++----- tests/test_vector_wrappers.py | 6 +----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/mo_gymnasium/wrappers/vector/wrappers.py b/mo_gymnasium/wrappers/vector/wrappers.py index 369053db..0493f184 100644 --- a/mo_gymnasium/wrappers/vector/wrappers.py +++ b/mo_gymnasium/wrappers/vector/wrappers.py @@ -1,7 +1,7 @@ """Vector wrappers.""" import time from copy import deepcopy -from typing import Any, Iterator +from typing import Any, Iterator, Tuple, Dict import gymnasium as gym import numpy as np @@ -92,7 +92,7 @@ def __init__( dtype=np.float32, ) - def step(self, actions: ActType) -> tuple[ObsType, ArrayType, ArrayType, ArrayType, dict[str, Any]]: + def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayType, Dict[str, Any]]: """Steps through each of the environments returning the batched results. 
Returns: @@ -187,7 +187,7 @@ def reset(self, **kwargs): return obs, info - def step(self, actions: ActType) -> tuple[ObsType, ArrayType, ArrayType, ArrayType, dict]: + def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayType, Dict[str, Any]]: """Steps through the environment, recording the episode statistics.""" ( observations, @@ -229,8 +229,8 @@ def step(self, actions: ActType) -> tuple[ObsType, ArrayType, ArrayType, ArrayTy episode_time_length = np.round(time.perf_counter() - self.episode_start_times, 6) infos[self._stats_key] = { - "r": np.where(dones, self.episode_returns, np.zeros(self.rewards_shape, dtype=np.float32)), - "dr": np.where(dones, self.disc_episode_returns, np.zeros(self.rewards_shape, dtype=np.float32)), + "r": episode_return, + "dr": disc_episode_return, "l": np.where(dones, self.episode_lengths, 0), "t": np.where(dones, episode_time_length, 0.0), } diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py index 4e574869..6a74e617 100644 --- a/tests/test_vector_wrappers.py +++ b/tests/test_vector_wrappers.py @@ -1,17 +1,13 @@ import numpy as np import mo_gymnasium as mo_gym -from mo_gymnasium.wrappers.vector import ( - MORecordEpisodeStatistics, - MOSyncVectorEnv, -) +from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv def test_mo_sync_wrapper(): def make_env(env_id): def thunk(): env = mo_gym.make(env_id) - env = MORecordEpisodeStatistics(env, gamma=0.97) return env return thunk From 7e9f5b82038bba75d70b55eccd82ce961bd222a6 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 23 May 2024 15:11:44 +0200 Subject: [PATCH 10/21] Remove comments --- mo_gymnasium/wrappers/vector/wrappers.py | 65 +++--------------------- 1 file changed, 8 insertions(+), 57 deletions(-) diff --git a/mo_gymnasium/wrappers/vector/wrappers.py b/mo_gymnasium/wrappers/vector/wrappers.py index 0493f184..832342f0 100644 --- a/mo_gymnasium/wrappers/vector/wrappers.py +++ b/mo_gymnasium/wrappers/vector/wrappers.py @@ -1,7 +1,7 @@ """Vector wrappers.""" import time from copy import deepcopy -from typing import Any, Iterator, Tuple, Dict +from typing import Any, Dict, Iterator, Tuple import gymnasium as gym import numpy as np @@ -12,61 +12,6 @@ from gymnasium.wrappers.vector import RecordEpisodeStatistics -# class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): -# """Wrapper to normalize the reward component at index idx. Does not touch other reward components.""" -# -# def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): -# """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance. -# -# Args: -# env (env): The environment to apply the wrapper -# idx (int): the index of the reward to normalize -# epsilon (float): A stability parameter -# gamma (float): The discount factor that is used in the exponential moving average. -# """ -# gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon) -# gym.Wrapper.__init__(self, env) -# self.idx = idx -# self.num_envs = getattr(env, "num_envs", 1) -# self.is_vector_env = getattr(env, "is_vector_env", False) -# self.return_rms = RunningMeanStd(shape=()) -# self.returns = np.zeros(self.num_envs) -# self.gamma = gamma -# self.epsilon = epsilon -# -# def step(self, action: ActType): -# """Steps through the environment, normalizing the rewards returned. 
-# -# Args: -# action: action to perform -# Returns: obs, normalized_rewards, terminated, truncated, infos -# """ -# obs, rews, terminated, truncated, infos = self.env.step(action) -# # Extracts the objective value to normalize -# to_normalize = rews[self.idx] -# if not self.is_vector_env: -# to_normalize = np.array([to_normalize]) -# self.returns = self.returns * self.gamma + to_normalize -# # Defer normalization to gym implementation -# to_normalize = self.normalize(to_normalize) -# self.returns[terminated] = 0.0 -# if not self.is_vector_env: -# to_normalize = to_normalize[0] -# # Injecting the normalized objective value back into the reward vector -# rews[self.idx] = to_normalize -# return obs, rews, terminated, truncated, infos -# -# def normalize(self, rews): -# """Normalizes the rewards with the running mean rewards and their variance. -# -# Args: -# rews: rewards -# Returns: the normalized reward -# """ -# self.return_rms.update(self.returns) -# return rews / np.sqrt(self.return_rms.var + self.epsilon) -# -# class MOSyncVectorEnv(SyncVectorEnv): """Vectorized environment that serially runs multiple environments.""" @@ -109,7 +54,13 @@ def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayTy self._terminations[i] = False self._truncations[i] = False else: - (env_obs, self._rewards[i], self._terminations[i], self._truncations[i], env_info,) = self.envs[ + ( + env_obs, + self._rewards[i], + self._terminations[i], + self._truncations[i], + env_info, + ) = self.envs[ i ].step(action) From f6914a4339509233cc18a1a8b343ff47d80e097c Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 23 May 2024 15:19:02 +0200 Subject: [PATCH 11/21] Export wrappers --- docs/index.md | 1 + docs/wrappers/vector_wrappers.md | 20 ++++++++++++++++++++ docs/wrappers/wrappers.md | 10 +++++----- mo_gymnasium/__init__.py | 1 + mo_gymnasium/wrappers/__init__.py | 1 + 5 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 docs/wrappers/vector_wrappers.md diff --git a/docs/index.md b/docs/index.md index fb6d56ff..f1d24905 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,6 +11,7 @@ lastpage: introduction/install introduction/api wrappers/wrappers +wrappers/vector_wrappers examples/morl_baselines ``` diff --git a/docs/wrappers/vector_wrappers.md b/docs/wrappers/vector_wrappers.md new file mode 100644 index 00000000..ade24022 --- /dev/null +++ b/docs/wrappers/vector_wrappers.md @@ -0,0 +1,20 @@ +--- +title: "Vector Wrappers" +--- + +# Vector Wrappers + +Similar to the normal wrappers, MO-Gymnasium provides a few wrappers that are specifically designed to work with vectorized environments. They are all available directly from the `mo_gymnasium.wrappers.vector` module. + + +## `MOSyncVectorEnv` + +```{eval-rst} +.. autoclass:: mo_gymnasium.wrappers.vector.MOSyncVectorEnv +``` + +## `MORecordEpisodeStatistics` + +```{eval-rst} +.. autoclass:: mo_gymnasium.wrappers.vector.MORecordEpisodeStatistics +``` diff --git a/docs/wrappers/wrappers.md b/docs/wrappers/wrappers.md index 5d6bd55d..acf2ab56 100644 --- a/docs/wrappers/wrappers.md +++ b/docs/wrappers/wrappers.md @@ -4,7 +4,7 @@ title: "Wrappers" # Wrappers -A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium` module. +A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium.wrappers` module. 
## `LinearReward` @@ -26,14 +26,14 @@ A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. .. autoclass:: mo_gymnasium.wrappers.MOClipReward ``` -## `MOSyncVectorEnv` +## `MORecordEpisodeStatistics` ```{eval-rst} -.. autoclass:: mo_gymnasium.wrappers.MOSyncVectorEnv +.. autoclass:: mo_gymnasium.wrappers.MORecordEpisodeStatistics ``` -## `MORecordEpisodeStatistics` +## `MOMaxAndSkipObservation` ```{eval-rst} -.. autoclass:: mo_gymnasium.wrappers.MORecordEpisodeStatistics +.. autoclass:: mo_gymnasium.wrappers.MOMaxAndSkipObservation ``` diff --git a/mo_gymnasium/__init__.py b/mo_gymnasium/__init__.py index f49af753..a238e5cd 100644 --- a/mo_gymnasium/__init__.py +++ b/mo_gymnasium/__init__.py @@ -2,6 +2,7 @@ # Envs import mo_gymnasium.envs +from mo_gymnasium import wrappers # Utils from mo_gymnasium.utils import make diff --git a/mo_gymnasium/wrappers/__init__.py b/mo_gymnasium/wrappers/__init__.py index e77291a4..274241a0 100644 --- a/mo_gymnasium/wrappers/__init__.py +++ b/mo_gymnasium/wrappers/__init__.py @@ -1,5 +1,6 @@ """Contains all wrappers (vectors or not).""" +from mo_gymnasium.wrappers import vector from mo_gymnasium.wrappers.wrappers import ( LinearReward, MOClipReward, From bbaab1e6e6cc9a66a26a6fd7056e5df00650c847 Mon Sep 17 00:00:00 2001 From: Mark Towers Date: Tue, 28 May 2024 10:49:28 +0100 Subject: [PATCH 12/21] Update to use Gymnasium v1.0.0a1 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 840ad0cf..8b96da9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" requires-python = ">= 3.8" authors = [{ name = "Farama Foundation", email = "contact@farama.org" }] license = { text = "MIT License" } -keywords = ["Reinforcement Learning", "Multi-Objective", "RL", "AI", "gymnasium"] +keywords = ["Reinforcement Learning", "Multi-Objective", "RL", "AI", "Gymnasium"] classifiers = [ "Development Status :: 4 - Beta", # change to `5 - Production/Stable` when ready "License :: OSI Approved :: MIT License", @@ -22,7 +22,7 @@ classifiers = [ 'Topic :: Scientific/Engineering :: Artificial Intelligence', ] dependencies = [ - "gymnasium>=0.28.1", + "gymnasium >=1.0.0a1", "numpy >=1.21.0", "pygame >=2.1.0", "scipy >=1.7.3", From f72773a096e16307736760da1129aafb86fac2ae Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Wed, 7 Aug 2024 10:15:06 +0200 Subject: [PATCH 13/21] Better doc and tests for vector wrappers --- mo_gymnasium/wrappers/vector/wrappers.py | 28 ++++++++++- pyproject.toml | 2 +- tests/test_vector_wrappers.py | 63 ++++++++++++++++++------ 3 files changed, 75 insertions(+), 18 deletions(-) diff --git a/mo_gymnasium/wrappers/vector/wrappers.py b/mo_gymnasium/wrappers/vector/wrappers.py index 832342f0..6028061d 100644 --- a/mo_gymnasium/wrappers/vector/wrappers.py +++ b/mo_gymnasium/wrappers/vector/wrappers.py @@ -1,4 +1,5 @@ """Vector wrappers.""" + import time from copy import deepcopy from typing import Any, Dict, Iterator, Tuple @@ -13,7 +14,29 @@ class MOSyncVectorEnv(SyncVectorEnv): - """Vectorized environment that serially runs multiple environments.""" + """Vectorized environment that serially runs multiple environments. + + Example: + >>> import mo_gymnasium as mo_gym + + >>> envs = mo_gym.wrappers.vector.MOSyncVectorEnv([ + ... lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(4) + ... 
]) + >>> envs + MOSyncVectorEnv(num_envs=4) + >>> obs, infos = envs.reset() + >>> obs + array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype=int32) + >>> _ = envs.action_space.seed(42) + >>> actions = envs.action_space.sample() + >>> obs, rewards, terminateds, truncateds, infos = envs.step([0, 1, 2, 3]) + >>> obs + array([[0, 0], [1, 0], [0, 0], [0, 3]], dtype=int32) + >>> rewards + array([[0., -1.], [0.7, -1.], [0., -1.], [0., -1.]], dtype=float32) + >>> terminateds + array([False, True, False, False]) + """ def __init__( self, @@ -124,6 +147,7 @@ def __init__( """ gym.utils.RecordConstructorArgs.__init__(self, buffer_length=buffer_length, stats_key=stats_key) RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key) + self.disc_episode_returns = None self.reward_dim = self.env.unwrapped.reward_space.shape[0] self.rewards_shape = (self.num_envs, self.reward_dim) self.gamma = gamma @@ -156,12 +180,12 @@ def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayTy self.episode_lengths[self.prev_dones] = 0 self.episode_start_times[self.prev_dones] = time.perf_counter() self.episode_returns[~self.prev_dones] += rewards[~self.prev_dones] - self.episode_lengths[~self.prev_dones] += 1 # CHANGE: The discounted returns are also computed here self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( self.episode_returns.shape ) + self.episode_lengths[~self.prev_dones] += 1 self.prev_dones = dones = np.logical_or(terminations, truncations) num_dones = np.sum(dones) diff --git a/pyproject.toml b/pyproject.toml index 8b96da9e..1b98c66e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ ] dependencies = [ "gymnasium >=1.0.0a1", - "numpy >=1.21.0", + "numpy >=1.21.0,<2.0", "pygame >=2.1.0", "scipy >=1.7.3", "pymoo >=0.6.0", diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py index 6a74e617..1ffb4afd 100644 --- a/tests/test_vector_wrappers.py +++ b/tests/test_vector_wrappers.py @@ -1,3 +1,4 @@ +import gymnasium as gym import numpy as np import mo_gymnasium as mo_gym @@ -5,15 +6,8 @@ def test_mo_sync_wrapper(): - def make_env(env_id): - def thunk(): - env = mo_gym.make(env_id) - return env - - return thunk - num_envs = 3 - envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)]) + envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)]) envs.reset() obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample()) @@ -21,18 +15,39 @@ def thunk(): assert len(rewards) == num_envs, "Number of rewards do not match the number of envs" assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs" assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs" + envs.close() -def test_mo_record_ep_statistic_vector_env(): - def make_env(env_id): - def thunk(): - env = mo_gym.make(env_id) - return env +def test_mo_sync_autoreset(): + num_envs = 2 + envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)]) + + obs, infos = envs.reset() + assert (obs[0] == [0, 0]).all() + assert (obs[1] == [0, 0]).all() + obs, rewards, terminateds, truncateds, infos = envs.step([0, 1]) + assert (obs[0] == [0, 0]).all() + assert (obs[1] == [1, 0]).all() + # Use np assert almost equal to avoid floating point errors + np.testing.assert_almost_equal(rewards[0], np.array([0.0, -1.0], dtype=np.float32), 
decimal=2) + np.testing.assert_almost_equal(rewards[1], np.array([0.7, -1.0], dtype=np.float32), decimal=2) + assert not terminateds[0] + assert terminateds[1] # This one is done + assert not truncateds[0] + assert not truncateds[1] + obs, rewards, terminateds, truncateds, infos = envs.step([0, 1]) + assert (obs[0] == [0, 0]).all() + assert (obs[1] == [0, 0]).all() + assert (rewards[0] == [0.0, -1.0]).all() + assert (rewards[1] == [0.0, 0.0]).all() # Reset step + assert not terminateds[0] + assert not terminateds[1] # Not done anymore + envs.close() - return thunk +def test_mo_record_ep_statistic_vector_env(): num_envs = 3 - envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)]) + envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)]) envs = MORecordEpisodeStatistics(envs) envs.reset() @@ -48,3 +63,21 @@ def thunk(): assert info["episode"]["dr"].shape == (num_envs, 2) assert isinstance(info["episode"]["l"], np.ndarray) assert isinstance(info["episode"]["t"], np.ndarray) + envs.close() + + +def test_gym_wrapper_and_vector(): + # This tests the integration of gym-wrapped envs with MO-Gymnasium vectorized envs + num_envs = 2 + envs = MOSyncVectorEnv( + [lambda: gym.wrappers.NormalizeObservation(mo_gym.make("deep-sea-treasure-v0")) for _ in range(num_envs)] + ) + + envs.reset() + for i in range(30): + obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample()) + assert len(obs) == num_envs, "Number of observations do not match the number of envs" + assert len(rewards) == num_envs, "Number of rewards do not match the number of envs" + assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs" + assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs" + envs.close() From f442ea4b1dcfb7809d360718b3149900f4bab97e Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Wed, 7 Aug 2024 11:24:16 +0200 Subject: [PATCH 14/21] Enhance wrappers doc and tests --- mo_gymnasium/wrappers/wrappers.py | 13 ++++++++++++- tests/test_vector_wrappers.py | 14 ++++++++++---- tests/test_wrappers.py | 14 +++++++------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/mo_gymnasium/wrappers/wrappers.py b/mo_gymnasium/wrappers/wrappers.py index 7e4071f3..b4f8e398 100644 --- a/mo_gymnasium/wrappers/wrappers.py +++ b/mo_gymnasium/wrappers/wrappers.py @@ -59,6 +59,17 @@ class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): """Wrapper to normalize the reward component at index idx. Does not touch other reward components. This code is heavily inspired on Gymnasium's except that it extracts the reward component at given idx, normalizes it, and reinjects it. + + (!) This smoothes the moving average of the reward, which can be useful for training stability. But it does not "normalize" the reward in the sense of making it have a mean of 0 and a standard deviation of 1. + + Example: + >>> import mo_gymnasium as mo_gym + >>> from mo_gymnasium.wrappers import MONormalizeReward + >>> env = mo_gym.make("deep-sea-treasure-v0") + >>> norm_treasure_env = MONormalizeReward(env, idx=0) + >>> both_norm_env = MONormalizeReward(norm_treasure_env, idx=1) + >>> both_norm_env.reset() # This one normalizes both rewards + """ def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): @@ -193,12 +204,12 @@ def step(self, action): info, dict ), f"`info` dtype is {type(info)} while supported dtype is `dict`. 
This may be due to usage of other wrappers in the wrong order." self.episode_returns += rewards - self.episode_lengths += 1 # CHANGE: The discounted returns are also computed here self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( self.episode_returns.shape ) + self.episode_lengths += 1 if terminated or truncated: assert self._stats_key not in info diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py index 1ffb4afd..d57d7567 100644 --- a/tests/test_vector_wrappers.py +++ b/tests/test_vector_wrappers.py @@ -46,22 +46,28 @@ def test_mo_sync_autoreset(): def test_mo_record_ep_statistic_vector_env(): - num_envs = 3 + num_envs = 2 envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)]) - envs = MORecordEpisodeStatistics(envs) + envs = MORecordEpisodeStatistics(envs, gamma=0.97) envs.reset() terminateds = np.array([False] * num_envs) info = {} - while not np.any(terminateds): - obs, rewards, terminateds, _, info = envs.step(envs.action_space.sample()) + obs, rewards, terminateds, _, info = envs.step([0, 3]) + obs, rewards, terminateds, _, info = envs.step([0, 1]) + obs, rewards, terminateds, _, info = envs.step([0, 1]) assert isinstance(info["episode"]["r"], np.ndarray) assert isinstance(info["episode"]["dr"], np.ndarray) # Episode records are vectorized because multiple environments assert info["episode"]["r"].shape == (num_envs, 2) + np.testing.assert_almost_equal(info["episode"]["r"][0], np.array([0.0, 0.0], dtype=np.float32), decimal=2) + np.testing.assert_almost_equal(info["episode"]["r"][1], np.array([8.2, -3.0], dtype=np.float32), decimal=2) assert info["episode"]["dr"].shape == (num_envs, 2) + np.testing.assert_almost_equal(info["episode"]["dr"][0], np.array([0.0, 0.0], dtype=np.float32), decimal=2) + np.testing.assert_almost_equal(info["episode"]["dr"][1], np.array([7.72, -2.91], dtype=np.float32), decimal=2) assert isinstance(info["episode"]["l"], np.ndarray) + np.testing.assert_almost_equal(info["episode"]["l"], np.array([0, 3], dtype=np.float32), decimal=2) assert isinstance(info["episode"]["t"], np.ndarray) envs.close() diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index b8bc2697..4a4023e0 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -20,6 +20,8 @@ def go_to_8_3(env): def test_normalization_wrapper(): + # Watch out that the wrapper does not normalize the rewards to have a mean of 0 and std of 1 + # instead it smoothens the moving average of the rewards env = mo_gym.make("deep-sea-treasure-v0") norm_treasure_env = MONormalizeReward(env, idx=0) both_norm_env = MONormalizeReward(norm_treasure_env, idx=1) @@ -27,18 +29,16 @@ def test_normalization_wrapper(): # No normalization env.reset(seed=0) _, rewards, _, _, _ = env.step(1) - np.testing.assert_allclose(rewards, [0.7, -1.0], rtol=0, atol=1e-2) + np.testing.assert_almost_equal(rewards, [0.7, -1.0], decimal=2) # Tests for both rewards normalized for i in range(30): go_to_8_3(both_norm_env) both_norm_env.reset(seed=0) _, rewards, _, _, _ = both_norm_env.step(1) # down - np.testing.assert_allclose( - rewards, [0.49, -1.24], rtol=0, atol=1e-2 - ) # TODO PR check why we had to change those values @Mark? 
+ np.testing.assert_almost_equal(rewards, [0.5, -1.24], decimal=2) rewards, _ = go_to_8_3(both_norm_env) - np.testing.assert_allclose(rewards, [4.73, -1.24], rtol=0, atol=1e-2) + np.testing.assert_almost_equal(rewards, [4.73, -1.24], decimal=2) # Tests for only treasure normalized for i in range(30): @@ -46,9 +46,9 @@ def test_normalization_wrapper(): norm_treasure_env.reset(seed=0) _, rewards, _, _, _ = norm_treasure_env.step(1) # down # Time rewards are not normalized (-1) - np.testing.assert_allclose(rewards, [0.51, -1.0], rtol=0, atol=1e-2) + np.testing.assert_almost_equal(rewards, [0.51, -1.0], decimal=2) rewards, _ = go_to_8_3(norm_treasure_env) - np.testing.assert_allclose(rewards, [5.33, -1.0], rtol=0, atol=1e-2) + np.testing.assert_almost_equal(rewards, [5.33, -1.0], decimal=2) def test_clip_wrapper(): From 9b9a3ea7054e40be62cb6617107ca265a99db1a9 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Wed, 7 Aug 2024 11:42:57 +0200 Subject: [PATCH 15/21] Remove print --- mo_gymnasium/wrappers/wrappers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mo_gymnasium/wrappers/wrappers.py b/mo_gymnasium/wrappers/wrappers.py index b4f8e398..f7830865 100644 --- a/mo_gymnasium/wrappers/wrappers.py +++ b/mo_gymnasium/wrappers/wrappers.py @@ -109,7 +109,6 @@ def step(self, action: ActType): """ obs, rews, terminated, truncated, infos = self.env.step(action) # Extracts the objective value to normalize - print("Normalizing reward at index", self.idx) to_normalize = rews[self.idx] self.discounted_reward = self.discounted_reward * self.gamma * (1 - terminated) + float(to_normalize) From 98a695e274e058a2fd6ef96289792092496c4f47 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Thu, 8 Aug 2024 08:31:34 +0200 Subject: [PATCH 16/21] Fix test --- mo_gymnasium/__init__.py | 2 +- tests/test_wrappers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mo_gymnasium/__init__.py b/mo_gymnasium/__init__.py index a238e5cd..94fe1c92 100644 --- a/mo_gymnasium/__init__.py +++ b/mo_gymnasium/__init__.py @@ -8,4 +8,4 @@ from mo_gymnasium.utils import make -__version__ = "1.1.0" +__version__ = "1.2.0" diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index 4a4023e0..6d44b6d2 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -85,8 +85,8 @@ def test_mo_record_ep_statistic(): assert info["episode"]["dr"].shape == (2,) assert tuple(info["episode"]["r"]) == (np.float32(8.2), np.float32(-3.0)) assert tuple(np.round(info["episode"]["dr"], 2)) == ( - np.float32(7.48), - np.float32(-2.82), + np.float32(7.72), + np.float32(-2.91), ) assert isinstance(info["episode"]["l"], int) assert info["episode"]["l"] == 3 From 7480c646e562df029397b3ef00096a49e11a49a2 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Fri, 9 Aug 2024 14:39:45 +0200 Subject: [PATCH 17/21] Remove pybullet mo-reacher --- mo_gymnasium/envs/__init__.py | 1 - mo_gymnasium/envs/mujoco/reacher.py | 2 +- mo_gymnasium/envs/reacher/__init__.py | 9 -- mo_gymnasium/envs/reacher/reacher.py | 159 -------------------------- tests/test_envs.py | 3 +- 5 files changed, 2 insertions(+), 172 deletions(-) delete mode 100644 mo_gymnasium/envs/reacher/__init__.py delete mode 100644 mo_gymnasium/envs/reacher/reacher.py diff --git a/mo_gymnasium/envs/__init__.py b/mo_gymnasium/envs/__init__.py index 7e917397..c4846df6 100644 --- a/mo_gymnasium/envs/__init__.py +++ b/mo_gymnasium/envs/__init__.py @@ -10,6 +10,5 @@ import mo_gymnasium.envs.minecart import mo_gymnasium.envs.mountain_car import mo_gymnasium.envs.mujoco -import 
mo_gymnasium.envs.reacher import mo_gymnasium.envs.resource_gathering import mo_gymnasium.envs.water_reservoir diff --git a/mo_gymnasium/envs/mujoco/reacher.py b/mo_gymnasium/envs/mujoco/reacher.py index 01a5bc9d..9596f64c 100644 --- a/mo_gymnasium/envs/mujoco/reacher.py +++ b/mo_gymnasium/envs/mujoco/reacher.py @@ -13,7 +13,7 @@ class MOReacherEnv(ReacherEnv): """ ## Description - Mujoco version of `mo-reacher-v0`, based on [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/). + Multi-objective version of the [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/). ## Observation Space The observation is 6-dimensional and contains: diff --git a/mo_gymnasium/envs/reacher/__init__.py b/mo_gymnasium/envs/reacher/__init__.py deleted file mode 100644 index b752382c..00000000 --- a/mo_gymnasium/envs/reacher/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from gymnasium.envs.registration import register - - -register( - id="mo-reacher-v0", - entry_point="mo_gymnasium.envs.reacher.reacher:ReacherBulletEnv", - max_episode_steps=100, - kwargs={"fixed_initial_state": None}, -) diff --git a/mo_gymnasium/envs/reacher/reacher.py b/mo_gymnasium/envs/reacher/reacher.py deleted file mode 100644 index 13782b58..00000000 --- a/mo_gymnasium/envs/reacher/reacher.py +++ /dev/null @@ -1,159 +0,0 @@ -from typing import Optional - -import gymnasium as gym -import numpy as np -from gymnasium import spaces -from gymnasium.utils import EzPickle, seeding -from pybulletgym.envs.roboschool.envs.env_bases import BaseBulletEnv -from pybulletgym.envs.roboschool.robots.robot_bases import MJCFBasedRobot -from pybulletgym.envs.roboschool.scenes.scene_bases import SingleRobotEmptyScene - - -target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)])) - - -class ReacherBulletEnv(gym.Env, BaseBulletEnv, EzPickle): - metadata = {"render_modes": ["human", "rgb_array"]} - - def __init__( - self, - render_mode: Optional[str] = None, - target=(0.14, 0.0), - fixed_initial_state: Optional[tuple] = (3.14, 0), - ): - EzPickle.__init__(self, render_mode, target, fixed_initial_state) - self.robot = ReacherRobot(target, fixed_initial_state=fixed_initial_state) - self.render_mode = render_mode - BaseBulletEnv.__init__(self, self.robot, render=render_mode == "human") - self._cam_dist = 0.75 - - # self.target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14), (0.22, 0.0), (-0.22, 0.0), (0.0, 0.22), (0.0, -0.22), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)])) - # self.target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)])) - self.target_positions = list( - map( - lambda l: np.array(l), - [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)], - ) - ) - - actions = [-1.0, 0.0, 1.0] - self.action_dict = dict() - for a1 in actions: - for a2 in actions: - self.action_dict[len(self.action_dict)] = (a1, a2) - - self.action_space = spaces.Discrete(9) - self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32) - self.reward_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32) - self.reward_dim = 4 - - def create_single_player_scene(self, bullet_client): - return SingleRobotEmptyScene(bullet_client, gravity=0.0, timestep=0.0165, frame_skip=1) - - def step(self, a): - real_action = self.action_dict[int(a)] - - assert not self.scene.multiplayer - 
self.robot.apply_action(real_action) - self.scene.global_step() - - state = self.robot.calc_state() # sets self.to_target_vec - - """ delta = np.linalg.norm(np.array(self.robot.fingertip.pose().xyz()) - np.array(self.robot.target.pose().xyz())) - reward = 1. - 4. * delta """ - - phi = np.zeros(len(self.target_positions), dtype=np.float32) - for index, target in enumerate(self.target_positions): - delta = np.linalg.norm(np.array(self.robot.fingertip.pose().xyz()[:2]) - target) - phi[index] = 1.0 - 4 * delta # 1 - 4 - - self.HUD(state, real_action, False) - - if self.render_mode == "human": - self._render(mode="human") - - return state, phi, False, False, {} - - def render(self): - if self.render_mode == "human": - self._render(mode="human") - else: - return self._render(mode="rgb_array") - - def camera_adjust(self): - x, y, z = self.robot.fingertip.pose().xyz() - x *= 0.5 - y *= 0.5 - self.camera.move_and_look_at(0.3, 0.3, 0.3, x, y, z) - - def reset(self, seed=None, **kwargs): - self._seed(seed) - if seed is not None: - self._np_random, seed = seeding.np_random(seed) - obs = self._reset() - if self.render_mode == "human": - self._render(mode="human") - return obs, {} - - -class ReacherRobot(MJCFBasedRobot): - TARG_LIMIT = 0.27 - - def __init__(self, target, fixed_initial_state=False): - MJCFBasedRobot.__init__(self, "reacher.xml", "body0", action_dim=2, obs_dim=4) - self.target_pos = target - self.fixed_initial_state = fixed_initial_state - - def robot_specific_reset(self, bullet_client): - self.jdict["target_x"].reset_current_position(target_positions[0][0], 0) - self.jdict["target_y"].reset_current_position(target_positions[0][1], 0) - - """ self.jdict["target2_x"].reset_current_position(target_positions[1][0], 0) - self.jdict["target2_y"].reset_current_position(target_positions[1][1], 0) - self.jdict["target3_x"].reset_current_position(target_positions[2][0], 0) - self.jdict["target3_y"].reset_current_position(target_positions[2][1], 0) - self.jdict["target4_x"].reset_current_position(target_positions[3][0], 0) - self.jdict["target4_y"].reset_current_position(target_positions[3][1], 0) """ - - self.fingertip = self.parts["fingertip"] - self.target = self.parts["target"] - self.central_joint = self.jdict["joint0"] - self.elbow_joint = self.jdict["joint1"] - if self.fixed_initial_state is None: - self.central_joint.reset_current_position(self.np_random.uniform(low=-3.14, high=3.14), 0) - self.elbow_joint.reset_current_position(self.np_random.uniform(low=-3.14 / 2, high=3.14 / 2), 0) - else: - self.central_joint.reset_current_position(0, 0) - self.elbow_joint.reset_current_position(self.fixed_initial_state[0], self.fixed_initial_state[1]) - - def apply_action(self, a): - assert np.isfinite(a).all() - self.central_joint.set_motor_torque(0.05 * float(np.clip(a[0], -1, +1))) - self.elbow_joint.set_motor_torque(0.05 * float(np.clip(a[1], -1, +1))) - - def calc_state(self): - theta, self.theta_dot = self.central_joint.current_relative_position() - self.gamma, self.gamma_dot = self.elbow_joint.current_relative_position() - # target_x, _ = self.jdict["target_x"].current_position() - # target_y, _ = self.jdict["target_y"].current_position() - self.to_target_vec = np.array(self.fingertip.pose().xyz()) - np.array(self.target.pose().xyz()) - return np.array( - [ - np.cos(theta), - np.sin(theta), - self.theta_dot * 0.1, - self.gamma, - self.gamma_dot * 0.1, - ], - dtype=np.float32, - ) - - -if __name__ == "__main__": - env = ReacherBulletEnv() - # env.render(mode='human') - obs = env.reset() - 
print(env.observation_space.contains(obs), obs.dtype, env.observation_space) - while True: - env.step(env.action_space.sample()) - # env.render(mode='human') diff --git a/tests/test_envs.py b/tests/test_envs.py index 16180c8b..e30aa151 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -14,8 +14,7 @@ for env_spec in gym.envs.registry.values(): if type(env_spec.entry_point) is not str: continue - if "highway" in env_spec.entry_point: - continue + # collect MO Gymnasium envs if env_spec.entry_point.split(".")[0] == "mo_gymnasium": all_testing_env_specs.append(env_spec) From 4e39d18881d40248ad5712074acf861ba1383790 Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Mon, 12 Aug 2024 09:47:39 +0200 Subject: [PATCH 18/21] Require highway-env >= 1.9.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1b98c66e..7592a09f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ all = [ "imageio >=2.14.1", "mujoco >=2.2.0", # highway - "highway-env >= 1.8", + "highway-env >= 1.9.1", # box2d "box2d-py ==2.3.5", "pygame ==2.1.3.dev8", From dbddf3a6e7eee03832bb963d304dd88bf13bc832 Mon Sep 17 00:00:00 2001 From: Florian Felten Date: Tue, 13 Aug 2024 11:40:00 +0200 Subject: [PATCH 19/21] test type --- tests/test_wrappers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index 91d4abf5..df2ded4a 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -87,6 +87,6 @@ def test_mo_record_ep_statistic(): np.testing.assert_allclose(info["episode"]["dr"], [7.71538, -2.9109], rtol=0, atol=1e-2) # 0 * 0.97**0 + 0 * 0.97**1 + 8.2 * 0.97**2 == 7.71538 # -1 * 0.97**0 + -1 * 0.97**1 + -1 * 0.97**2 == -2.9109 - assert isinstance(info["episode"]["l"], np.int32) + assert isinstance(info["episode"]["l"], int) assert info["episode"]["l"] == 3 assert isinstance(info["episode"]["t"], float) From fbac985007477d17ec767733cd525bc2bbacfeda Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Fri, 16 Aug 2024 11:57:19 +0200 Subject: [PATCH 20/21] Add Mujoco v5 environments --- mo_gymnasium/envs/mujoco/__init__.py | 38 +++++++++ mo_gymnasium/envs/mujoco/ant_v5.py | 56 +++++++++++++ mo_gymnasium/envs/mujoco/humanoid_v5.py | 37 +++++++++ mo_gymnasium/envs/mujoco/reacher_v5.py | 101 ++++++++++++++++++++++++ mo_gymnasium/envs/mujoco/swimmer_v5.py | 41 ++++++++++ mo_gymnasium/envs/mujoco/walker2d_v5.py | 38 +++++++++ 6 files changed, 311 insertions(+) create mode 100644 mo_gymnasium/envs/mujoco/ant_v5.py create mode 100644 mo_gymnasium/envs/mujoco/humanoid_v5.py create mode 100644 mo_gymnasium/envs/mujoco/reacher_v5.py create mode 100644 mo_gymnasium/envs/mujoco/swimmer_v5.py create mode 100644 mo_gymnasium/envs/mujoco/walker2d_v5.py diff --git a/mo_gymnasium/envs/mujoco/__init__.py b/mo_gymnasium/envs/mujoco/__init__.py index 4415d577..f5d5639e 100644 --- a/mo_gymnasium/envs/mujoco/__init__.py +++ b/mo_gymnasium/envs/mujoco/__init__.py @@ -45,6 +45,12 @@ max_episode_steps=1000, ) +register( + id="mo-walker2d-v5", + entry_point="mo_gymnasium.envs.mujoco.walker2d_v5:MOWalker2dEnv", + max_episode_steps=1000, +) + register( id="mo-ant-v4", entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv", @@ -58,20 +64,52 @@ kwargs={"cost_objective": False}, ) + +register( + id="mo-ant-v5", + entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv", + max_episode_steps=1000, +) + +register( + id="mo-ant-2d-v5", + entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv", + 
max_episode_steps=1000, + kwargs={"cost_objective": False}, +) + register( id="mo-swimmer-v4", entry_point="mo_gymnasium.envs.mujoco.swimmer:MOSwimmerEnv", max_episode_steps=1000, ) +register( + id="mo-swimmer-v5", + entry_point="mo_gymnasium.envs.mujoco.swimmer_v5:MOSwimmerEnv", + max_episode_steps=1000, +) + register( id="mo-humanoid-v4", entry_point="mo_gymnasium.envs.mujoco.humanoid:MOHumanoidEnv", max_episode_steps=1000, ) +register( + id="mo-humanoid-v5", + entry_point="mo_gymnasium.envs.mujoco.humanoid_v5:MOHumanoidEnv", + max_episode_steps=1000, +) + register( id="mo-reacher-v4", entry_point="mo_gymnasium.envs.mujoco.reacher_v4:MOReacherEnv", max_episode_steps=50, ) + +register( + id="mo-reacher-v5", + entry_point="mo_gymnasium.envs.mujoco.reacher_v5:MOReacherEnv", + max_episode_steps=50, +) diff --git a/mo_gymnasium/envs/mujoco/ant_v5.py b/mo_gymnasium/envs/mujoco/ant_v5.py new file mode 100644 index 00000000..701c6658 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/ant_v5.py @@ -0,0 +1,56 @@ +import numpy as np +from gymnasium.envs.mujoco.ant_v5 import AntEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOAntEnv(AntEnv, EzPickle): + """ + ## Description + Multi-objective version of the AntEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/ant/) for more information. + + The original Gymnasium's 'Ant-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-ant-v4', cost_objective=False) + LinearReward(env, weight=np.array([1.0, 0.0])) + + ## Reward Space + The reward is 2- or 3-dimensional: + - 0: x-velocity + - 1: y-velocity + - 2: Control cost of the action + If the cost_objective flag is set to False, the reward is 2-dimensional, and the cost is added to other objectives. + A healthy reward is added to all objectives. + + ## Version History + - v5: Now includes contact forces in the reward and observation. + See https://gymnasium.farama.org/environments/mujoco/ant/#version-history + """ + def __init__(self, cost_objective=True, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, cost_objective, **kwargs) + self.cost_objetive = cost_objective + self.reward_dim = 3 if cost_objective else 2 + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,)) + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + x_velocity = info["x_velocity"] + y_velocity = info["y_velocity"] + cost = info["reward_ctrl"] + contact_cost = info["reward_contact"] + healthy_reward = info["reward_survive"] + + if self.cost_objetive: + cost /= self._ctrl_cost_weight # Ignore the weight in the original AntEnv + contact_cost /= self._contact_cost_weight + vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32) + else: + vec_reward = np.array([x_velocity, y_velocity], dtype=np.float32) + vec_reward += cost + contact_cost + + vec_reward += healthy_reward + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/humanoid_v5.py b/mo_gymnasium/envs/mujoco/humanoid_v5.py new file mode 100644 index 00000000..4cd5bf06 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/humanoid_v5.py @@ -0,0 +1,37 @@ +import numpy as np +from gymnasium.envs.mujoco.humanoid_v5 import HumanoidEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOHumanoidEnv(HumanoidEnv, EzPickle): + """ + ## Description + Multi-objective version of the HumanoidEnv environment. 
+ + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/humanoid/) for more information. + + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for running forward (x-velocity) + - 1: Control cost of the action + + ## Version History: + - v5: Now includes contact forces. See: https://gymnasium.farama.org/environments/mujoco/humanoid/#version-history + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + velocity = info["x_velocity"] + negative_cost = 10 * info["reward_ctrl"] + info["reward_contact"] + vec_reward = np.array([velocity, negative_cost], dtype=np.float32) + + vec_reward += self.healthy_reward # All objectives are penalyzed when the agent falls + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/reacher_v5.py b/mo_gymnasium/envs/mujoco/reacher_v5.py new file mode 100644 index 00000000..79196851 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/reacher_v5.py @@ -0,0 +1,101 @@ +from os import path + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.envs.mujoco.reacher_v5 import ReacherEnv +from gymnasium.spaces import Box, Discrete + + +DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} + + +class MOReacherEnv(ReacherEnv): + """ + ## Description + Multi-objective version of the [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/). + + ## Observation Space + The observation is 6-dimensional and contains: + - sin and cos of the angles of the central and elbow joints + - angular velocity of the central and elbow joints + + ## Action Space + The action space is discrete and contains the 3^2=9 possible actions based on applying positive (+1), negative (-1) or zero (0) torque to each of the two joints. + + ## Reward Space + The reward is 4-dimensional and is defined based on the distance of the tip of the arm and the four target locations. + For each i={1,2,3,4} it is computed as: + ```math + r_i = 1 - 4 * || finger_tip_coord - target_i ||^2 + ``` + + ## Version History: + See https://gymnasium.farama.org/environments/mujoco/reacher/#version-history + """ + + def __init__(self, **kwargs): + utils.EzPickle.__init__(self, **kwargs) + self.observation_space = Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float64) + MujocoEnv.__init__( + self, + path.join(path.dirname(__file__), "assets", "mo_reacher.xml"), + 2, + observation_space=self.observation_space, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + actions = [-1.0, 0.0, 1.0] + self.action_dict = dict() + for a1 in actions: + for a2 in actions: + self.action_dict[len(self.action_dict)] = (a1, a2) + self.action_space = Discrete(9) + # Target goals: x1, y1, x2, y2, ... 
x4, y4 + self.goal = np.array([0.14, 0.0, -0.14, 0.0, 0.0, 0.14, 0.0, -0.14]) + self.reward_space = Box(low=-1.0, high=1.0, shape=(4,)) + self.reward_dim = 4 + + def step(self, a): + real_action = self.action_dict[int(a)] + vec_reward = np.array( + [ + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target1")[:2]), + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target2")[:2]), + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target3")[:2]), + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target4")[:2]), + ], + dtype=np.float32, + ) + + self._step_mujoco_simulation(real_action, self.frame_skip) + if self.render_mode == "human": + self.render() + + ob = self._get_obs() + return ( + ob, + vec_reward, + False, + False, + {}, + ) + + def reset_model(self): + qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos + qpos[:2] = np.array([0, 3.1415 / 2]) # init position + qpos[-len(self.goal) :] = self.goal + qvel = self.init_qvel + self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv) + qvel[-2:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + theta = self.data.qpos.flatten()[:2] + return np.concatenate( + [ + np.cos(theta), + np.sin(theta), + self.data.qvel.flatten()[:2] * 0.1, + ] + ) diff --git a/mo_gymnasium/envs/mujoco/swimmer_v5.py b/mo_gymnasium/envs/mujoco/swimmer_v5.py new file mode 100644 index 00000000..11a160ca --- /dev/null +++ b/mo_gymnasium/envs/mujoco/swimmer_v5.py @@ -0,0 +1,41 @@ +import numpy as np +from gymnasium.envs.mujoco.swimmer_v5 import SwimmerEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOSwimmerEnv(SwimmerEnv, EzPickle): + """ + ## Description + Multi-objective version of the SwimmerEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/swimmer/) for more information. + + The original Gymnasium's 'Swimmer-v4' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-swimmer-v4') + LinearReward(env, weight=np.array([1.0, 1e-4])) + + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for moving forward (x-velocity) + - 1: Control cost of the action + + ## Version History: + See https://gymnasium.farama.org/main/environments/mujoco/swimmer/#version-history + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + velocity = info["x_velocity"] + energy = -np.sum(np.square(action)) + + vec_reward = np.array([velocity, energy], dtype=np.float32) + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/walker2d_v5.py b/mo_gymnasium/envs/mujoco/walker2d_v5.py new file mode 100644 index 00000000..5b036db5 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/walker2d_v5.py @@ -0,0 +1,38 @@ +import numpy as np +from gymnasium.envs.mujoco.walker2d_v5 import Walker2dEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOWalker2dEnv(Walker2dEnv, EzPickle): + """ + ## Description + Multi-objective version of the Walker2dEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/walker2d/) for more information. 
+ + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for running forward (x-velocity) + - 1: Control cost of the action + + # Version History + - See https://gymnasium.farama.org/main/environments/mujoco/walker2d/#version-history + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + velocity = info["x_velocity"] + energy = -np.sum(np.square(action)) + + vec_reward = np.array([velocity, energy], dtype=np.float32) + + vec_reward += self.healthy_reward # All objectives are penalyzed when the agent falls + + return observation, vec_reward, terminated, truncated, info From eab459202ee9b5bcbff8678a24caf6aa5775caab Mon Sep 17 00:00:00 2001 From: Lucas Alegre Date: Fri, 16 Aug 2024 12:00:29 +0200 Subject: [PATCH 21/21] pre-commit --- mo_gymnasium/envs/mujoco/ant_v5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mo_gymnasium/envs/mujoco/ant_v5.py b/mo_gymnasium/envs/mujoco/ant_v5.py index 701c6658..c4e4ad9d 100644 --- a/mo_gymnasium/envs/mujoco/ant_v5.py +++ b/mo_gymnasium/envs/mujoco/ant_v5.py @@ -28,6 +28,7 @@ class MOAntEnv(AntEnv, EzPickle): - v5: Now includes contact forces in the reward and observation. See https://gymnasium.farama.org/environments/mujoco/ant/#version-history """ + def __init__(self, cost_objective=True, **kwargs): super().__init__(**kwargs) EzPickle.__init__(self, cost_objective, **kwargs)
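
As a quick sanity check of the vectorized API after this series, the following is a minimal usage sketch that mirrors tests/test_vector_wrappers.py above. It is an illustration only, not part of the diffs, and it assumes this branch (mo-gymnasium 1.2.0 with gymnasium >= 1.0.0a1) is installed.

```python
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv

num_envs = 2
# Each entry is a thunk that builds a fresh sub-environment.
envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
envs = MORecordEpisodeStatistics(envs, gamma=0.97)

obs, infos = envs.reset(seed=42)
for _ in range(10):
    obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
    # rewards has shape (num_envs, reward_dim); once a sub-episode ends,
    # infos["episode"] holds the vectorized "r", "dr", "l" and "t" entries.
envs.close()
```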
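
Moving `self.episode_lengths += 1` after the discounted-return update (patches 13 and 16) means the reward at step t is discounted by gamma**t, which is exactly what the updated test expectations encode. A small arithmetic check, under the same 0.97 discount and the three-step episode used in the tests:

```python
import numpy as np

gamma = 0.97
# Per-step vector rewards of the three-step episode ending at the (8.2, -3.0) treasure.
step_rewards = np.array([[0.0, -1.0], [0.0, -1.0], [8.2, -1.0]])

r = step_rewards.sum(axis=0)                                      # [8.2, -3.0]   -> info["episode"]["r"]
dr = (gamma ** np.arange(3)[:, None] * step_rewards).sum(axis=0)  # [7.72, -2.91] -> info["episode"]["dr"]
print(r, dr)
```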
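
For the non-vector wrappers, the clarified MONormalizeReward docstring pairs well with a short usage sketch. This mirrors the docstring and test_normalization_wrapper above: each wrapped component is rescaled by a running estimate of the standard deviation of its discounted return, i.e. smoothed rather than standardized to zero mean and unit variance.

```python
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers import MONormalizeReward

env = mo_gym.make("deep-sea-treasure-v0")
env = MONormalizeReward(env, idx=0)  # rescale only the treasure component
env = MONormalizeReward(env, idx=1)  # stack a second wrapper to also rescale the time penalty

obs, info = env.reset(seed=0)
for _ in range(20):
    obs, rewards, terminated, truncated, info = env.step(env.action_space.sample())
    if terminated or truncated:
        obs, info = env.reset()
```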
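
Finally, the new MuJoCo v5 registrations can be collapsed back to a scalar reward with LinearReward, in the spirit of the scalarization notes quoted in the v4 env docstrings. The sketch below assumes the mujoco extra is installed; the weights are the ones given for the swimmer docstring and only approximately recover Gymnasium's single-objective reward (for the other v5 envs, healthy rewards are folded into every objective).

```python
import numpy as np

import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers import LinearReward

# The vector reward of "mo-swimmer-v5" is [forward x-velocity, -sum(action^2)].
env = mo_gym.make("mo-swimmer-v5")
scalar_env = LinearReward(env, weight=np.array([1.0, 1e-4]))

obs, info = scalar_env.reset(seed=0)
obs, reward, terminated, truncated, info = scalar_env.step(scalar_env.action_space.sample())
print(reward)  # scalar: dot product of the vector reward with the weight
```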