diff --git a/.gitignore b/.gitignore index 6473ced97..a98cef38e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ build/ dist/ .idea/ results/ -examples/gym/results/ +examples/gymnasium/results/ diff --git a/.pfnci/run.sh b/.pfnci/run.sh index dc575fec6..37480af36 100644 --- a/.pfnci/run.sh +++ b/.pfnci/run.sh @@ -75,7 +75,7 @@ main() { # pytest does not run with attrs==19.2.0 (https://github.com/pytest-dev/pytest/issues/3280) # NOQA "${PYTHON}" -m pip install \ 'pytest==4.1.1' 'attrs==19.1.0' 'pytest-xdist==1.26.1' \ - 'gym[atari,classic_control]==0.19.0' 'optuna' 'zipp==1.0.0' 'pybullet==2.8.1' 'jupyterlab==2.1.5' 'traitlets==5.1.1' 'pyglet==1.5.27' + 'gymnasium[atari,classic_control]==0.19.0' 'optuna' 'zipp==1.0.0' 'pybullet==2.8.1' 'jupyterlab==2.1.5' 'traitlets==5.1.1' git config --global user.email "you@example.com" git config --global user.name "Your Name" diff --git a/README.md b/README.md index 1a88055c3..d2840f7e6 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Refer to [Installation](http://pfrl.readthedocs.io/en/latest/install.html) for m ## Getting started -You can try [PFRL Quickstart Guide](examples/quickstart/quickstart.ipynb) first, or check the [examples](examples) ready for Atari 2600 and Open AI Gym. +You can try [PFRL Quickstart Guide](examples/quickstart/quickstart.ipynb) first, or check the [examples](examples) ready for Atari 2600 and Farama Foundation's gymnasium. For more information, you can refer to [PFRL's documentation](http://pfrl.readthedocs.io/en/latest/index.html). @@ -64,9 +64,9 @@ Following algorithms have been implemented in PFRL: - [ACER (Actor-Critic with Experience Replay)](https://arxiv.org/abs/1611.01224) - examples: [[atari]](examples/atari/train_acer_ale.py) - [Categorical DQN](https://arxiv.org/abs/1707.06887) - - examples: [[atari]](examples/atari/train_categorical_dqn_ale.py) [[general gym]](examples/gym/train_categorical_dqn_gym.py) + - examples: [[atari]](examples/atari/train_categorical_dqn_ale.py) [[general gymnasium]](examples/gymnasium/train_categorical_dqn_gymnasium.py) - [DQN (Deep Q-Network)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) (including [Double DQN](https://arxiv.org/abs/1509.06461), [Persistent Advantage Learning (PAL)](https://arxiv.org/abs/1512.04860), Double PAL, [Dynamic Policy Programming (DPP)](http://www.jmlr.org/papers/volume13/azar12a/azar12a.pdf)) - - examples: [[atari reproduction]](examples/atari/reproduction/dqn) [[atari]](examples/atari/train_dqn_ale.py) [[atari (batched)]](examples/atari/train_dqn_batch_ale.py) [[flickering atari]](examples/atari/train_drqn_ale.py) [[general gym]](examples/gym/train_dqn_gym.py) + - examples: [[atari reproduction]](examples/atari/reproduction/dqn) [[atari]](examples/atari/train_dqn_ale.py) [[atari (batched)]](examples/atari/train_dqn_batch_ale.py) [[flickering atari]](examples/atari/train_drqn_ale.py) [[general gymnasium]](examples/gymnasium/train_dqn_gymnasium.py) - [DDPG (Deep Deterministic Policy Gradients)](https://arxiv.org/abs/1509.02971) (including [SVG(0)](https://arxiv.org/abs/1510.09142)) - examples: [[mujoco reproduction]](examples/mujoco/reproduction/ddpg) - [IQN (Implicit Quantile Networks)](https://arxiv.org/abs/1806.06923) @@ -76,7 +76,7 @@ Following algorithms have been implemented in PFRL: - [Rainbow](https://arxiv.org/abs/1710.02298) - examples: [[atari reproduction]](examples/atari/reproduction/rainbow) [[Slime volleyball]](examples/slimevolley/) - 
[REINFORCE](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) - - examples: [[general gym]](examples/gym/train_reinforce_gym.py) + - examples: [[general gymnasium]](examples/gymnasium/train_reinforce_gymnasium.py) - [SAC (Soft Actor-Critic)](https://arxiv.org/abs/1812.05905) - examples: [[mujoco reproduction]](examples/mujoco/reproduction/soft_actor_critic) [[Atlas walk]](examples/atlas/) - [TRPO (Trust Region Policy Optimization)](https://arxiv.org/abs/1502.05477) with [GAE (Generalized Advantage Estimation)](https://arxiv.org/abs/1506.02438) @@ -92,14 +92,14 @@ Following useful techniques have been also implemented in PFRL: - [Dueling Network](https://arxiv.org/abs/1511.06581) - examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py) - [Normalized Advantage Function](https://arxiv.org/abs/1603.00748) - - examples: [[DQN]](examples/gym/train_dqn_gym.py) (for continuous-action envs only) + - examples: [[DQN]](examples/gymnasium/train_dqn_gymnasium.py) (for continuous-action envs only) - [Deep Recurrent Q-Network](https://arxiv.org/abs/1507.06527) - examples: [[DQN]](examples/atari/train_drqn_ale.py) ## Environments -Environments that support the subset of OpenAI Gym's interface (`reset` and `step` methods) can be used. +Environments that support the subset of Farama Foundation's gymnasium's interface (`reset` and `step` methods) can be used. ## Contributing diff --git a/examples/README.md b/examples/README.md index f8fc3c4b6..4b97fc16c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -3,7 +3,7 @@ - `atari`: examples for general Atari games - `atari/reproduction`: examples with benchmark scores for reproducing published results on Atari - `atlas`: training an Atlas robot to walk -- `gym`: examples for OpenAI Gym environments +- `gymnasium`: examples for OpenAI gymnasium environments - `grasping`: examples for a Bullet-based robotic grasping environment - `mujoco/reproduction`: examples with benchmark scores for reproducing published results on MuJoCo tasks - `quickstart`: a quickstart guide of PFRL diff --git a/examples/atari/train_acer_ale.py b/examples/atari/train_acer_ale.py index 9cdfa5945..d95cb1cca 100644 --- a/examples/atari/train_acer_ale.py +++ b/examples/atari/train_acer_ale.py @@ -4,8 +4,8 @@ # Prevent numpy from using multiple threads os.environ["OMP_NUM_THREADS"] = "1" -import gym # NOQA:E402 -import gym.wrappers # NOQA:E402 +import gymnasium # NOQA:E402 +import gymnasium.wrappers # NOQA:E402 import numpy as np # NOQA:E402 from torch import nn # NOQA:E402 @@ -91,7 +91,7 @@ def main(): args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) - n_actions = gym.make(args.env).action_space.n + n_actions = gymnasium.make(args.env).action_space.n input_to_hidden = nn.Sequential( nn.Conv2d(4, 16, 8, stride=4), diff --git a/examples/atari/train_drqn_ale.py b/examples/atari/train_drqn_ale.py index ccbefa699..a0425784d 100644 --- a/examples/atari/train_drqn_ale.py +++ b/examples/atari/train_drqn_ale.py @@ -11,8 +11,8 @@ """ import argparse -import gym -import gym.wrappers +import gymnasium +import gymnasium.wrappers import numpy as np import torch from torch import nn @@ -193,7 +193,7 @@ def make_env(test): # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: - env = gym.wrappers.Monitor( + env = gymnasium.wrappers.Monitor( env, args.outdir, 
mode="evaluation" if test else "training" ) if args.render: diff --git a/examples/atari/train_ppo_ale.py b/examples/atari/train_ppo_ale.py index 80bac591f..dd48244fd 100644 --- a/examples/atari/train_ppo_ale.py +++ b/examples/atari/train_ppo_ale.py @@ -1,4 +1,4 @@ -"""An example of training PPO against OpenAI Gym Atari Envs. +"""An example of training PPO against OpenAI gymnasium Atari Envs. This script is an example of training a PPO agent on Atari envs. @@ -25,7 +25,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--env", type=str, default="BreakoutNoFrameskip-v4", help="Gym Env ID." + "--env", type=str, default="BreakoutNoFrameskip-v4", help="gymnasium Env ID." ) parser.add_argument( "--gpu", type=int, default=0, help="GPU device ID. Set to -1 to use CPUs only." diff --git a/examples/atlas/train_soft_actor_critic_atlas.py b/examples/atlas/train_soft_actor_critic_atlas.py index 1d35d6e82..76d147279 100644 --- a/examples/atlas/train_soft_actor_critic_atlas.py +++ b/examples/atlas/train_soft_actor_critic_atlas.py @@ -4,8 +4,8 @@ import logging import sys -import gym -import gym.wrappers +import gymnasium +import gymnasium.wrappers import numpy as np import torch from torch import distributions, nn @@ -17,16 +17,16 @@ def make_env(args, seed, test): if args.env.startswith("Roboschool"): - # Check gym version because roboschool does not work with gym>=0.15.6 + # Check gymnasium version because roboschool does not work with gymnasium>=0.15.6 from distutils.version import StrictVersion - gym_version = StrictVersion(gym.__version__) - if gym_version >= StrictVersion("0.15.6"): - raise RuntimeError("roboschool does not work with gym>=0.15.6") + gymnasium_version = StrictVersion(gymnasium.__version__) + if gymnasium_version >= StrictVersion("0.15.6"): + raise RuntimeError("roboschool does not work with gymnasium>=0.15.6") import roboschool # NOQA - env = gym.make(args.env) + env = gymnasium.make(args.env) # Unwrap TimiLimit wrapper - assert isinstance(env, gym.wrappers.TimeLimit) + assert isinstance(env, gymnasium.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs env_seed = 2**32 - 1 - seed if test else seed @@ -59,7 +59,7 @@ def main(): "--env", type=str, default="RoboschoolAtlasForwardWalk-v1", - help="OpenAI Gym env to perform algorithm on.", + help="OpenAI gymnasium env to perform algorithm on.", ) parser.add_argument( "--num-envs", type=int, default=4, help="Number of envs run in parallel." 
diff --git a/examples/grasping/train_dqn_batch_grasping.py b/examples/grasping/train_dqn_batch_grasping.py index 0274a0530..e4fa96024 100644 --- a/examples/grasping/train_dqn_batch_grasping.py +++ b/examples/grasping/train_dqn_batch_grasping.py @@ -2,8 +2,8 @@ import functools import os -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import numpy as np import torch from torch import nn @@ -13,7 +13,7 @@ from pfrl.q_functions import DiscreteActionValueHead -class CastAction(gym.ActionWrapper): +class CastAction(gymnasium.ActionWrapper): """Cast actions to a given type.""" def __init__(self, env, type_): @@ -24,14 +24,14 @@ def action(self, action): return self.type_(action) -class TransposeObservation(gym.ObservationWrapper): +class TransposeObservation(gymnasium.ObservationWrapper): """Transpose observations.""" def __init__(self, env, axes): super().__init__(env) self._axes = axes - assert isinstance(env.observation_space, gym.spaces.Box) - self.observation_space = gym.spaces.Box( + assert isinstance(env.observation_space, gymnasium.spaces.Box) + self.observation_space = gymnasium.spaces.Box( low=env.observation_space.low.transpose(*self._axes), high=env.observation_space.high.transpose(*self._axes), dtype=env.observation_space.dtype, @@ -41,7 +41,7 @@ def observation(self, observation): return observation.transpose(*self._axes) -class ObserveElapsedSteps(gym.Wrapper): +class ObserveElapsedSteps(gymnasium.Wrapper): """Observe the number of elapsed steps in an episode. A new observation will be a tuple of an original observation and an integer @@ -52,10 +52,10 @@ def __init__(self, env, max_steps): super().__init__(env) self._max_steps = max_steps self._elapsed_steps = 0 - self.observation_space = gym.spaces.Tuple( + self.observation_space = gymnasium.spaces.Tuple( ( env.observation_space, - gym.spaces.Discrete(self._max_steps + 1), + gymnasium.spaces.Discrete(self._max_steps + 1), ) ) @@ -64,13 +64,13 @@ def reset(self): return self.env.reset(), self._elapsed_steps def step(self, action): - observation, reward, done, info = self.env.step(action) + observation, reward, terminated, truncated, info = self.env.step(action) self._elapsed_steps += 1 assert self._elapsed_steps <= self._max_steps - return (observation, self._elapsed_steps), reward, done, info + return (observation, self._elapsed_steps), reward, terminated, truncated, info -class RecordMovie(gym.Wrapper): +class RecordMovie(gymnasium.Wrapper): """Record MP4 videos using pybullet's logging API.""" def __init__(self, env, dirname): @@ -87,7 +87,7 @@ def reset(self): pybullet.STATE_LOGGING_VIDEO_MP4, os.path.join(self._dirname, "{}.mp4".format(self._episode_idx)), ) - return obs + return obs, {} class GraspingQFunction(nn.Module): @@ -243,7 +243,7 @@ def main(): max_episode_steps = 8 def make_env(idx, test): - from pybullet_envs.bullet.kuka_diverse_object_gym_env import ( # NOQA + from pybullet_envs.bullet.kuka_diverse_object_gymnasium_env import ( # NOQA KukaDiverseObjectEnv, ) @@ -263,7 +263,7 @@ def make_env(idx, test): # Disable file caching to keep memory usage small env._p.setPhysicsEngineParameter(enableFileCaching=False) assert env.observation_space is None - env.observation_space = gym.spaces.Box( + env.observation_space = gymnasium.spaces.Box( low=0, high=255, shape=(84, 84, 3), dtype=np.uint8 ) # (84, 84, 3) -> (3, 84, 84) diff --git a/examples/gym/README.md b/examples/gym/README.md deleted file mode 100644 index 0e46abf0d..000000000 --- a/examples/gym/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# 
Examples for OpenAI Gym environments - -- `train_categorical_dqn_gym.py`: CategoricalDQN for discrete action action spaces -- `train_dqn_gym.py`: DQN for both discrete action and continuous action spaces -- `train_reinforce_gym.py`: REINFORCE for both discrete action and continuous action spaces (only for episodic envs) - -## How to run - -``` -python train_categorical_dqn_gym.py [options] -python train_dqn_gym.py [options] -python train_reinforce_gym.py [options] -``` - -Specify `--help` or read code for options. diff --git a/examples/gymnasium/README.md b/examples/gymnasium/README.md new file mode 100644 index 000000000..b17585519 --- /dev/null +++ b/examples/gymnasium/README.md @@ -0,0 +1,15 @@ +# Examples for OpenAI gymnasium environments + +- `train_categorical_dqn_gymnasium.py`: CategoricalDQN for discrete action action spaces +- `train_dqn_gymnasium.py`: DQN for both discrete action and continuous action spaces +- `train_reinforce_gymnasium.py`: REINFORCE for both discrete action and continuous action spaces (only for episodic envs) + +## How to run + +``` +python train_categorical_dqn_gymnasium.py [options] +python train_dqn_gymnasium.py [options] +python train_reinforce_gymnasium.py [options] +``` + +Specify `--help` or read code for options. diff --git a/examples/gym/train_categorical_dqn_gym.py b/examples/gymnasium/train_categorical_dqn_gym.py similarity index 95% rename from examples/gym/train_categorical_dqn_gym.py rename to examples/gymnasium/train_categorical_dqn_gym.py index 7c7105189..ac07557c7 100644 --- a/examples/gym/train_categorical_dqn_gym.py +++ b/examples/gymnasium/train_categorical_dqn_gym.py @@ -1,16 +1,16 @@ -"""An example of training Categorical DQN against OpenAI Gym Envs. +"""An example of training Categorical DQN against OpenAI gymnasium Envs. This script is an example of training a CategoricalDQN agent against OpenAI -Gym envs. Only discrete spaces are supported. +gymnasium envs. Only discrete spaces are supported. To solve CartPole-v0, run: - python train_categorical_dqn_gym.py --env CartPole-v0 + python train_categorical_dqn_gymnasium.py --env CartPole-v0 """ import argparse import sys -import gym +import gymnasium import torch import pfrl @@ -66,7 +66,7 @@ def main(): print("Output files are saved in {}".format(args.outdir)) def make_env(test): - env = gym.make(args.env) + env = gymnasium.make(args.env) env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 diff --git a/examples/gym/train_dqn_gym.py b/examples/gymnasium/train_dqn_gym.py similarity index 96% rename from examples/gym/train_dqn_gym.py rename to examples/gymnasium/train_dqn_gym.py index 7a310965f..b4a5c22a2 100644 --- a/examples/gym/train_dqn_gym.py +++ b/examples/gymnasium/train_dqn_gym.py @@ -1,24 +1,24 @@ -"""An example of training DQN against OpenAI Gym Envs. +"""An example of training DQN against OpenAI gymnasium Envs. -This script is an example of training a DQN agent against OpenAI Gym envs. +This script is an example of training a DQN agent against OpenAI gymnasium envs. Both discrete and continuous action spaces are supported. For continuous action spaces, A NAF (Normalized Advantage Function) is used to approximate Q-values. 
-To solve CartPole-v0, run: - python train_dqn_gym.py --env CartPole-v0 +To solve CartPole-v1, run: + python train_dqn_gymnasium.py --env CartPole-v1 -To solve Pendulum-v0, run: - python train_dqn_gym.py --env Pendulum-v0 +To solve Pendulum-v1, run: + python train_dqn_gymnasium.py --env Pendulum-v1 """ import argparse import os import sys -import gym +import gymnasium import numpy as np import torch.optim as optim -from gym import spaces +from gymnasium import spaces import pfrl from pfrl import experiments, explorers @@ -42,7 +42,7 @@ def main(): " If it does not exist, it will be created." ), ) - parser.add_argument("--env", type=str, default="Pendulum-v0") + parser.add_argument("--env", type=str, default="Pendulum-v1") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--final-exploration-steps", type=int, default=10**4) @@ -100,7 +100,7 @@ def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(idx=0, test=False): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed diff --git a/examples/gym/train_reinforce_gym.py b/examples/gymnasium/train_reinforce_gym.py similarity index 93% rename from examples/gym/train_reinforce_gym.py rename to examples/gymnasium/train_reinforce_gym.py index f2c9eaa61..c82ed51e0 100644 --- a/examples/gym/train_reinforce_gym.py +++ b/examples/gymnasium/train_reinforce_gym.py @@ -1,18 +1,18 @@ -"""An example of training a REINFORCE agent against OpenAI Gym envs. +"""An example of training a REINFORCE agent against OpenAI gymnasium envs. -This script is an example of training a REINFORCE agent against OpenAI Gym +This script is an example of training a REINFORCE agent against OpenAI gymnasium envs. Both discrete and continuous action spaces are supported. To solve CartPole-v0, run: - python train_reinforce_gym.py + python train_reinforce_gymnasium.py To solve InvertedPendulum-v1, run: - python train_reinforce_gym.py --env InvertedPendulum-v1 + python train_reinforce_gymnasium.py --env InvertedPendulum-v1 """ import argparse -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import torch from torch import nn @@ -59,7 +59,7 @@ def main(): args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) @@ -83,7 +83,7 @@ def make_env(test): obs_size = obs_space.low.size hidden_size = 200 # Switch policy types accordingly to action space types - if isinstance(action_space, gym.spaces.Box): + if isinstance(action_space, gymnasium.spaces.Box): model = nn.Sequential( nn.Linear(obs_size, hidden_size), nn.LeakyReLU(0.2), diff --git a/examples/mujoco/reproduction/ddpg/README.md b/examples/mujoco/reproduction/ddpg/README.md index bdc824806..4821f7abc 100644 --- a/examples/mujoco/reproduction/ddpg/README.md +++ b/examples/mujoco/reproduction/ddpg/README.md @@ -1,6 +1,6 @@ # DDPG on MuJoCo benchmarks -This example trains a DDPG agent ([Continuous Control with Deep Reinforcement Learning](https://arxiv.org/abs/1509.02971)) on MuJoCo benchmarks from OpenAI Gym. 
+This example trains a DDPG agent ([Continuous Control with Deep Reinforcement Learning](https://arxiv.org/abs/1509.02971)) on MuJoCo benchmarks from OpenAI gymnasium. We follow the training and evaluation settings of [Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477), which provides thorough, highly tuned benchmark results. diff --git a/examples/mujoco/reproduction/ddpg/train_ddpg.py b/examples/mujoco/reproduction/ddpg/train_ddpg.py index 397d231a6..41932d354 100644 --- a/examples/mujoco/reproduction/ddpg/train_ddpg.py +++ b/examples/mujoco/reproduction/ddpg/train_ddpg.py @@ -1,4 +1,4 @@ -"""A training script of DDPG on OpenAI Gym Mujoco environments. +"""A training script of DDPG on OpenAI gymnasium Mujoco environments. This script follows the settings of http://arxiv.org/abs/1802.09477 as much as possible. @@ -8,8 +8,8 @@ import logging import sys -import gym -import gym.wrappers +import gymnasium +import gymnasium.wrappers import numpy as np import torch from torch import nn @@ -36,7 +36,7 @@ def main(): "--env", type=str, default="Hopper-v2", - help="OpenAI Gym MuJoCo env to perform algorithm on.", + help="OpenAI gymnasium MuJoCo env to perform algorithm on.", ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument( @@ -81,7 +81,7 @@ def main(): "--pretrained-type", type=str, default="best", choices=["best", "final"] ) parser.add_argument( - "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor." + "--monitor", action="store_true", help="Wrap env with gymnasium.wrappers.Monitor." ) parser.add_argument( "--log-level", type=int, default=logging.INFO, help="Level of the root logger." @@ -97,9 +97,9 @@ def main(): utils.set_random_seed(args.seed) def make_env(test): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Unwrap TimeLimit wrapper - assert isinstance(env, gym.wrappers.TimeLimit) + assert isinstance(env, gymnasium.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed diff --git a/examples/mujoco/reproduction/ppo/README.md b/examples/mujoco/reproduction/ppo/README.md index 7170455c4..ad1129aaf 100644 --- a/examples/mujoco/reproduction/ppo/README.md +++ b/examples/mujoco/reproduction/ppo/README.md @@ -1,6 +1,6 @@ # PPO on MuJoCo benchmarks -This example trains a PPO agent ([Proximal Policy Optimization Algorithms](http://arxiv.org/abs/1707.06347)) on MuJoCo benchmarks from OpenAI Gym. +This example trains a PPO agent ([Proximal Policy Optimization Algorithms](http://arxiv.org/abs/1707.06347)) on MuJoCo benchmarks from OpenAI gymnasium. We follow the training and evaluation settings of [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560), which provides thorough, highly tuned benchmark results. @@ -37,7 +37,7 @@ To view the full list of options, either view the code or run the example with t ## Known differences - While the original paper initialized weights by normal distribution (https://github.com/Breakend/baselines/blob/50ffe01d254221db75cdb5c2ba0ab51a6da06b0a/baselines/ppo1/mlp_policy.py#L28), we use orthogonal initialization as the latest openai/baselines does (https://github.com/openai/baselines/blob/9b68103b737ac46bc201dfb3121cfa5df2127e53/baselines/a2c/utils.py#L61). 
-- We used version v2 of the environments whereas the original results were reported for version v1, however this doesn't seem to introduce significant differences: https://github.com/openai/gym/pull/834 +- We used version v2 of the environments whereas the original results were reported for version v1, however this doesn't seem to introduce significant differences: https://github.com/openai/gymnasium/pull/834 ## Results diff --git a/examples/mujoco/reproduction/ppo/train_ppo.py b/examples/mujoco/reproduction/ppo/train_ppo.py index a42d8f0af..991de8aec 100644 --- a/examples/mujoco/reproduction/ppo/train_ppo.py +++ b/examples/mujoco/reproduction/ppo/train_ppo.py @@ -1,4 +1,4 @@ -"""A training script of PPO on OpenAI Gym Mujoco environments. +"""A training script of PPO on OpenAI gymnasium Mujoco environments. This script follows the settings of https://arxiv.org/abs/1709.06560 as much as possible. @@ -6,8 +6,8 @@ import argparse import functools -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import numpy as np import torch from torch import nn @@ -28,7 +28,7 @@ def main(): "--env", type=str, default="Hopper-v2", - help="OpenAI Gym MuJoCo env to perform algorithm on.", + help="OpenAI gymnasium MuJoCo env to perform algorithm on.", ) parser.add_argument( "--num-envs", type=int, default=1, help="Number of envs run in parallel." @@ -75,7 +75,7 @@ def main(): "--log-level", type=int, default=logging.INFO, help="Level of the root logger." ) parser.add_argument( - "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor." + "--monitor", action="store_true", help="Wrap env with gymnasium.wrappers.Monitor." ) parser.add_argument( "--log-interval", @@ -112,7 +112,7 @@ def main(): args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed @@ -134,14 +134,14 @@ def make_batch_env(test): ) # Only for getting timesteps, and obs-action spaces - sample_env = gym.make(args.env) + sample_env = gymnasium.make(args.env) timestep_limit = sample_env.spec.max_episode_steps obs_space = sample_env.observation_space action_space = sample_env.action_space print("Observation space:", obs_space) print("Action space:", action_space) - assert isinstance(action_space, gym.spaces.Box) + assert isinstance(action_space, gymnasium.spaces.Box) # Normalize observations based on their empirical mean and variance obs_normalizer = pfrl.nn.EmpiricalNormalization( diff --git a/examples/mujoco/reproduction/soft_actor_critic/README.md b/examples/mujoco/reproduction/soft_actor_critic/README.md index 319fdd0c0..da7dd4fde 100644 --- a/examples/mujoco/reproduction/soft_actor_critic/README.md +++ b/examples/mujoco/reproduction/soft_actor_critic/README.md @@ -1,6 +1,6 @@ # Soft Actor-Critic (SAC) on MuJoCo benchmarks -This example trains a SAC agent ([Soft Actor-Critic Algorithms and Applications](https://arxiv.org/abs/1812.05905)) on MuJoCo benchmarks from OpenAI Gym. +This example trains a SAC agent ([Soft Actor-Critic Algorithms and Applications](https://arxiv.org/abs/1812.05905)) on MuJoCo benchmarks from OpenAI gymnasium. 
## Requirements diff --git a/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py b/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py index 851785682..577ca881a 100644 --- a/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py +++ b/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py @@ -1,4 +1,4 @@ -"""A training script of Soft Actor-Critic on OpenAI Gym Mujoco environments. +"""A training script of Soft Actor-Critic on OpenAI gymnasium Mujoco environments. This script follows the settings of https://arxiv.org/abs/1812.05905 as much as possible. @@ -9,8 +9,8 @@ import sys from distutils.version import LooseVersion -import gym -import gym.wrappers +import gymnasium +import gymnasium.wrappers import numpy as np import torch from torch import distributions, nn @@ -35,7 +35,7 @@ def main(): "--env", type=str, default="Hopper-v2", - help="OpenAI Gym MuJoCo env to perform algorithm on.", + help="OpenAI gymnasium MuJoCo env to perform algorithm on.", ) parser.add_argument( "--num-envs", type=int, default=1, help="Number of envs run in parallel." @@ -83,7 +83,7 @@ def main(): "--pretrained-type", type=str, default="best", choices=["best", "final"] ) parser.add_argument( - "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor." + "--monitor", action="store_true", help="Wrap env with gymnasium.wrappers.Monitor." ) parser.add_argument( "--log-interval", @@ -117,9 +117,9 @@ def main(): assert process_seeds.max() < 2**32 def make_env(process_idx, test): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Unwrap TimiLimit wrapper - assert isinstance(env, gym.wrappers.TimeLimit) + assert isinstance(env, gymnasium.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) @@ -130,7 +130,7 @@ def make_env(process_idx, test): # Normalize action space to [-1, 1]^n env = pfrl.wrappers.NormalizeActionSpace(env) if args.monitor: - env = gym.wrappers.Monitor(env, args.outdir) + env = gymnasium.wrappers.Monitor(env, args.outdir) if args.render: env = pfrl.wrappers.Render(env) return env diff --git a/examples/mujoco/reproduction/td3/README.md b/examples/mujoco/reproduction/td3/README.md index a9503b03c..81c2748d9 100644 --- a/examples/mujoco/reproduction/td3/README.md +++ b/examples/mujoco/reproduction/td3/README.md @@ -1,6 +1,6 @@ # TD3 on MuJoCo benchmarks -This example trains a TD3 agent ([Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477)) on MuJoCo benchmarks from OpenAI Gym. +This example trains a TD3 agent ([Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477)) on MuJoCo benchmarks from OpenAI gymnasium. ## Requirements @@ -55,7 +55,7 @@ Each evaluation reports average return over 10 episodes without exploration nois Maximum evaluation scores, averaged over 10 trials (+/- standard deviation), are reported for each environment. Reported scores are taken from the "TD3" column of Table 1 of [Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477). -Although the original paper used v1 versions of MuJoCo envs, we used v2 as v1 are not supported by recent versions of OpenAI Gym. +Although the original paper used v1 versions of MuJoCo envs, we used v2 as v1 are not supported by recent versions of OpenAI gymnasium. 
| Environment | PFRL Score | Reported Score | | ------------------------- |:---------------------:|:---------------------:| @@ -73,7 +73,7 @@ Although the original paper used v1 versions of MuJoCo envs, we used v2 as v1 ar Average return of last 10 evaluation scores, averaged over 10 trials, are reported for each environment. Reported scores are taken from the "TD3" row of Table 2 of [Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477). -Although the original paper used v1 versions of MuJoCo envs, we used v2 as v1 are not supported by recent versions of OpenAI Gym. +Although the original paper used v1 versions of MuJoCo envs, we used v2 as v1 are not supported by recent versions of OpenAI gymnasium. | Environment | PFRL Score | Reported Score | | ------------------------- |:------------:|:--------------:| diff --git a/examples/mujoco/reproduction/td3/train_td3.py b/examples/mujoco/reproduction/td3/train_td3.py index e9ad62259..021388051 100644 --- a/examples/mujoco/reproduction/td3/train_td3.py +++ b/examples/mujoco/reproduction/td3/train_td3.py @@ -1,4 +1,4 @@ -"""A training script of TD3 on OpenAI Gym Mujoco environments. +"""A training script of TD3 on OpenAI gymnasium Mujoco environments. This script follows the settings of http://arxiv.org/abs/1802.09477 as much as possible. @@ -8,8 +8,8 @@ import logging import sys -import gym -import gym.wrappers +import gymnasium +import gymnasium.wrappers import numpy as np import torch from torch import nn @@ -33,7 +33,7 @@ def main(): "--env", type=str, default="Hopper-v2", - help="OpenAI Gym MuJoCo env to perform algorithm on.", + help="OpenAI gymnasium MuJoCo env to perform algorithm on.", ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument( @@ -78,7 +78,7 @@ def main(): "--pretrained-type", type=str, default="best", choices=["best", "final"] ) parser.add_argument( - "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor." + "--monitor", action="store_true", help="Wrap env with gymnasium.wrappers.Monitor." ) parser.add_argument( "--log-level", type=int, default=logging.INFO, help="Level of the root logger." @@ -94,9 +94,9 @@ def main(): utils.set_random_seed(args.seed) def make_env(test): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Unwrap TimeLimit wrapper - assert isinstance(env, gym.wrappers.TimeLimit) + assert isinstance(env, gymnasium.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed diff --git a/examples/mujoco/reproduction/trpo/README.md b/examples/mujoco/reproduction/trpo/README.md index 1841ee7e4..b2b176ece 100644 --- a/examples/mujoco/reproduction/trpo/README.md +++ b/examples/mujoco/reproduction/trpo/README.md @@ -1,6 +1,6 @@ # TRPO on MuJoCo benchmarks -This example trains a TRPO agent ([Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477)) on MuJoCo benchmarks from OpenAI Gym. +This example trains a TRPO agent ([Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477)) on MuJoCo benchmarks from OpenAI gymnasium. We follow the training and evaluation settings of [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560), which provides thorough, highly tuned benchmark results. 
@@ -37,7 +37,7 @@ To view the full list of options, either view the code or run the example with t ## Known differences -- We used version v2 of the environments whereas the original results were reported for version v1, however this doesn't seem to introduce significant differences: https://github.com/openai/gym/pull/834 +- We used version v2 of the environments whereas the original results were reported for version v1, however this doesn't seem to introduce significant differences: https://github.com/openai/gymnasium/pull/834 ## Results diff --git a/examples/mujoco/reproduction/trpo/train_trpo.py b/examples/mujoco/reproduction/trpo/train_trpo.py index 339a4955d..f11a0a331 100644 --- a/examples/mujoco/reproduction/trpo/train_trpo.py +++ b/examples/mujoco/reproduction/trpo/train_trpo.py @@ -1,4 +1,4 @@ -"""A training script of TRPO on OpenAI Gym Mujoco environments. +"""A training script of TRPO on OpenAI gymnasium Mujoco environments. This script follows the settings of https://arxiv.org/abs/1709.06560 as much as possible. @@ -6,9 +6,9 @@ import argparse import logging -import gym -import gym.spaces -import gym.wrappers +import gymnasium +import gymnasium.spaces +import gymnasium.wrappers import torch from torch import nn @@ -20,7 +20,7 @@ def main(): parser.add_argument( "--gpu", type=int, default=0, help="GPU device ID. Set to -1 to use CPUs only." ) - parser.add_argument("--env", type=str, default="Hopper-v2", help="Gym Env ID") + parser.add_argument("--env", type=str, default="Hopper-v2", help="gymnasium Env ID") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument( "--outdir", @@ -81,7 +81,7 @@ def main(): "--monitor", action="store_true", help=( - "Monitor the env by gym.wrappers.Monitor." + "Monitor the env by gymnasium.wrappers.Monitor." " Videos and additional log will be saved." 
), ) @@ -95,14 +95,14 @@ def main(): args.outdir = pfrl.experiments.prepare_output_dir(args, args.outdir) def make_env(test): - env = gym.make(args.env) + env = gymnasium.make(args.env) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: - env = gym.wrappers.Monitor(env, args.outdir) + env = gymnasium.wrappers.Monitor(env, args.outdir) if args.render: env = pfrl.wrappers.Render(env) return env @@ -114,7 +114,7 @@ def make_env(test): print("Observation space:", obs_space) print("Action space:", action_space) - assert isinstance(obs_space, gym.spaces.Box) + assert isinstance(obs_space, gymnasium.spaces.Box) # Normalize observations based on their empirical mean and variance obs_normalizer = pfrl.nn.EmpiricalNormalization( diff --git a/examples/optuna/optuna_dqn_obs1d.py b/examples/optuna/optuna_dqn_obs1d.py index c21e70e8d..c1cd44011 100644 --- a/examples/optuna/optuna_dqn_obs1d.py +++ b/examples/optuna/optuna_dqn_obs1d.py @@ -14,7 +14,7 @@ import os import random -import gym +import gymnasium as gym import torch.optim as optim try: @@ -244,7 +244,7 @@ def main(): "--env", type=str, default="LunarLander-v2", - help="OpenAI Gym Environment ID.", + help="OpenAI gym Environment ID.", ) parser.add_argument( "--outdir", diff --git a/examples/quickstart/quickstart.ipynb b/examples/quickstart/quickstart.ipynb index b31d0fe2e..d139c0ef7 100644 --- a/examples/quickstart/quickstart.ipynb +++ b/examples/quickstart/quickstart.ipynb @@ -15,7 +15,7 @@ "\n", "If you have already installed PFRL, let's begin!\n", "\n", - "First, you need to import necessary modules. The module name of PFRL is `pfrl`. Let's import `torch`, `gym`, and `numpy` as well since they are used later." + "First, you need to import necessary modules. The module name of PFRL is `pfrl`. Let's import `torch`, `gymnasium`, and `numpy` as well since they are used later." ] }, { @@ -27,7 +27,7 @@ "import pfrl\n", "import torch\n", "import torch.nn\n", - "import gym\n", + "import gymnasium\n", "import numpy" ] }, @@ -35,13 +35,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "PFRL can be used for any problems if they are modeled as \"environments\". [OpenAI Gym](https://github.com/openai/gym) provides various kinds of benchmark environments and defines the common interface among them. PFRL uses a subset of the interface. Specifically, an environment must define its observation space and action space and have at least two methods: `reset` and `step`.\n", + "PFRL can be used for any problems if they are modeled as \"environments\". [OpenAI gymnasium](https://github.com/openai/gymnasium) provides various kinds of benchmark environments and defines the common interface among them. PFRL uses a subset of the interface. 
Specifically, an environment must define its observation space and action space and have at least two methods: `reset` and `step`.\n", "\n", "- `env.reset` will reset the environment to the initial state and return the initial observation.\n", - "- `env.step` will execute a given action, move to the next state and return four values:\n", + "- `env.step` will execute a given action, move to the next state and return five values:\n", " - a next observation\n", " - a scalar reward\n", " - a boolean value indicating whether the current state is terminal or not\n", + " - a boolean value indicating whether the episode has been truncated or not\n", " - additional information\n", "- `env.render` will render the current state. (optional)\n", "\n", @@ -73,7 +74,7 @@ } ], "source": [ - "env = gym.make('CartPole-v0')\n", + "env = gymnasium.make('CartPole-v0')\n", "print('observation space:', env.observation_space)\n", "print('action space:', env.action_space)\n", "\n", @@ -81,10 +82,11 @@ "print('initial observation:', obs)\n", "\n", "action = env.action_space.sample()\n", - "obs, r, done, info = env.step(action)\n", + "obs, r, terminated, truncated, info = env.step(action)\n", "print('next observation:', obs)\n", "print('reward:', r)\n", - "print('done:', done)\n", + "print('terminated:', terminated)\n", + "print('terminated:', truncated)\n", "print('info:', info)\n", "\n", "# Uncomment to open a GUI window rendering the current state of the environment\n", @@ -315,11 +317,11 @@ " # Uncomment to watch the behavior in a GUI window\n", " # env.render()\n", " action = agent.act(obs)\n", - " obs, reward, done, _ = env.step(action)\n", + " obs, reward, terminated, _, _ = env.step(action)\n", " R += reward\n", " t += 1\n", " reset = t == max_episode_len\n", - " agent.observe(obs, reward, done, reset)\n", + " agent.observe(obs, reward, terminated, reset)\n", " if done or reset:\n", " break\n", " if i % 10 == 0:\n", @@ -373,11 +375,11 @@ " # Uncomment to watch the behavior in a GUI window\n", " # env.render()\n", " action = agent.act(obs)\n", - " obs, r, done, _ = env.step(action)\n", + " obs, r, terminated, _, _ = env.step(action)\n", " R += r\n", " t += 1\n", " reset = t == 200\n", - " agent.observe(obs, r, done, reset)\n", + " agent.observe(obs, r, terminated, reset)\n", " if done or reset:\n", " break\n", " print('evaluation episode:', i, 'R:', R)" diff --git a/examples/slimevolley/README.md b/examples/slimevolley/README.md index a3a4eac8f..b70b73032 100644 --- a/examples/slimevolley/README.md +++ b/examples/slimevolley/README.md @@ -37,7 +37,7 @@ python examples/slimevolley/train_rainbow.py --demo --render --load > i) % 2 for i in range(self.orig_action_space.n)] @@ -129,10 +129,10 @@ def main(): def make_env(test): if "SlimeVolley" in args.env: - # You need to install slimevolleygym - import slimevolleygym # NOQA + # You need to install slimevolleygymnasium + import slimevolleygymnasium # NOQA - env = gym.make(args.env) + env = gymnasium.make(args.env) # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env.seed(int(env_seed)) @@ -142,7 +142,7 @@ def make_env(test): ) if args.render: env = pfrl.wrappers.Render(env) - if isinstance(env.action_space, gym.spaces.MultiBinary): + if isinstance(env.action_space, gymnasium.spaces.MultiBinary): env = MultiBinaryAsDiscreteAction(env) return env diff --git a/examples_tests/gym/test_categorical_dqn.sh b/examples_tests/gym/test_categorical_dqn.sh index db8c8505f..28181fb16 100644 --- 
a/examples_tests/gym/test_categorical_dqn.sh +++ b/examples_tests/gym/test_categorical_dqn.sh @@ -6,7 +6,7 @@ outdir=$(mktemp -d) gpu="$1" -# gym/categorical_dqn -python examples/gym/train_categorical_dqn_gym.py --steps 100 --replay-start-size 50 --outdir $outdir/gym/categorical_dqn --gpu $gpu -model=$(find $outdir/gym/categorical_dqn -name "*_finish") -python examples/gym/train_categorical_dqn_gym.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu +# gymnasium/categorical_dqn +python examples/gymnasium/train_categorical_dqn_gymnasium.py --steps 100 --replay-start-size 50 --outdir $outdir/gymnasium/categorical_dqn --gpu $gpu +model=$(find $outdir/gymnasium/categorical_dqn -name "*_finish") +python examples/gymnasium/train_categorical_dqn_gymnasium.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu diff --git a/examples_tests/gym/test_dqn.sh b/examples_tests/gym/test_dqn.sh index c4452538c..fca628ddf 100644 --- a/examples_tests/gym/test_dqn.sh +++ b/examples_tests/gym/test_dqn.sh @@ -6,7 +6,7 @@ outdir=$(mktemp -d) gpu="$1" -# gym/dqn -python examples/gym/train_dqn_gym.py --steps 100 --replay-start-size 50 --outdir $outdir/gym/dqn --gpu $gpu -model=$(find $outdir/gym/dqn -name "*_finish") -python examples/gym/train_dqn_gym.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu +# gymnasium/dqn +python examples/gymnasium/train_dqn_gymnasium.py --steps 100 --replay-start-size 50 --outdir $outdir/gymnasium/dqn --gpu $gpu +model=$(find $outdir/gymnasium/dqn -name "*_finish") +python examples/gymnasium/train_dqn_gymnasium.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu diff --git a/examples_tests/gym/test_reinforce.sh b/examples_tests/gym/test_reinforce.sh index 77a36bc89..f5a8d1e86 100644 --- a/examples_tests/gym/test_reinforce.sh +++ b/examples_tests/gym/test_reinforce.sh @@ -6,7 +6,7 @@ outdir=$(mktemp -d) gpu="$1" -# gym/reinforce -python examples/gym/train_reinforce_gym.py --steps 100 --batchsize 1 --outdir $outdir/gym/reinforce --gpu $gpu -model=$(find $outdir/gym/reinforce -name "*_finish") -python examples/gym/train_reinforce_gym.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu +# gymnasium/reinforce +python examples/gymnasium/train_reinforce_gymnasium.py --steps 100 --batchsize 1 --outdir $outdir/gymnasium/reinforce --gpu $gpu +model=$(find $outdir/gymnasium/reinforce -name "*_finish") +python examples/gymnasium/train_reinforce_gymnasium.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu diff --git a/examples_tests/slimevolley/test_rainbow.sh b/examples_tests/slimevolley/test_rainbow.sh index 605f19b08..e2c48c133 100644 --- a/examples_tests/slimevolley/test_rainbow.sh +++ b/examples_tests/slimevolley/test_rainbow.sh @@ -7,7 +7,7 @@ outdir=$(mktemp -d) gpu="$1" # slimevolley/rainbow -# Use CartPole-v0 to test without installing slimevolleygym +# Use CartPole-v0 to test without installing slimevolleygymnasium python examples/slimevolley/train_rainbow.py --gpu $gpu --steps 100 --outdir $outdir/slimevolley/rainbow --env CartPole-v0 model=$(find $outdir/slimevolley/rainbow -name "*_finish") python examples/slimevolley/train_rainbow.py --demo --load $model --eval-n-episodes 1 --outdir $outdir/temp --gpu $gpu --env CartPole-v0 diff --git a/pfrl/envs/abc.py b/pfrl/envs/abc.py index 29b7b8e29..53e5591fe 100644 --- a/pfrl/envs/abc.py +++ b/pfrl/envs/abc.py @@ -1,5 +1,6 @@ import numpy as np -from gym import spaces +import gymnasium as gym +from gymnasium import 
spaces from pfrl import env @@ -123,7 +124,7 @@ def reset(self): self._offset = np.random.randint(self.n_max_offset + 1) else: self._offset = 0 - return self.observe() + return self.observe(), {} def step(self, action): if isinstance(self.action_space, spaces.Box): diff --git a/pfrl/envs/multiprocess_vector_env.py b/pfrl/envs/multiprocess_vector_env.py index a993e1940..540552f17 100644 --- a/pfrl/envs/multiprocess_vector_env.py +++ b/pfrl/envs/multiprocess_vector_env.py @@ -16,11 +16,11 @@ def worker(remote, env_fn): while True: cmd, data = remote.recv() if cmd == "step": - ob, reward, done, info = env.step(data) - remote.send((ob, reward, done, info)) + ob, reward, terminated, truncated, info = env.step(data) + remote.send((ob, reward, terminated, truncated, info)) elif cmd == "reset": - ob = env.reset() - remote.send(ob) + ob, info = env.reset() + remote.send((ob, info)) elif cmd == "close": remote.close() break @@ -41,7 +41,7 @@ class MultiprocessVectorEnv(pfrl.env.VectorEnv): Args: env_fns (list of callable): List of callables, each of which - returns gym.Env that is run in its own subprocess. + returns gymnasium.Env that is run in its own subprocess. """ def __init__(self, env_fns): @@ -83,8 +83,8 @@ def step(self, actions): for remote, action in zip(self.remotes, actions): remote.send(("step", action)) results = [remote.recv() for remote in self.remotes] - self.last_obs, rews, dones, infos = zip(*results) - return self.last_obs, rews, dones, infos + self.last_obs, rews, terminateds, truncateds, infos = zip(*results) + return self.last_obs, rews, terminateds, truncateds, infos def reset(self, mask=None): self._assert_not_closed() @@ -94,12 +94,13 @@ def reset(self, mask=None): if not m: remote.send(("reset", None)) - obs = [ - remote.recv() if not m else o + results = [ + remote.recv() if not m else (o, {}) for m, remote, o in zip(mask, self.remotes, self.last_obs) ] + obs, info = zip(*results) self.last_obs = obs - return obs + return obs, info def close(self): self._assert_not_closed() diff --git a/pfrl/envs/serial_vector_env.py b/pfrl/envs/serial_vector_env.py index 73104adfe..7c2416fe6 100644 --- a/pfrl/envs/serial_vector_env.py +++ b/pfrl/envs/serial_vector_env.py @@ -10,7 +10,7 @@ class SerialVectorEnv(pfrl.env.VectorEnv): use MultiprocessVectorEnv if possible. Args: - env_fns (list of gym.Env): List of gym.Env. + env_fns (list of gymnasium.Env): List of gymnasium.Env. 
""" def __init__(self, envs): @@ -22,8 +22,8 @@ def __init__(self, envs): def step(self, actions): results = [env.step(a) for env, a in zip(self.envs, actions)] - self.last_obs, rews, dones, infos = zip(*results) - return self.last_obs, rews, dones, infos + self.last_obs, rews, terminations, truncations, infos = zip(*results) + return self.last_obs, rews, terminations, truncations, infos def reset(self, mask=None): if mask is None: @@ -33,7 +33,7 @@ def reset(self, mask=None): for m, env, o in zip(mask, self.envs, self.last_obs) ] self.last_obs = obs - return obs + return obs, {} def seed(self, seeds): for env, seed in zip(self.envs, seeds): diff --git a/pfrl/experiments/evaluator.py b/pfrl/experiments/evaluator.py index 75691784c..4b0afbede 100644 --- a/pfrl/experiments/evaluator.py +++ b/pfrl/experiments/evaluator.py @@ -8,7 +8,6 @@ import pfrl - def _run_episodes( env, agent, @@ -23,24 +22,24 @@ def _run_episodes( logger = logger or logging.getLogger(__name__) scores = [] lengths = [] - terminate = False + terminated = False timestep = 0 reset = True - while not terminate: + while not terminated: if reset: - obs = env.reset() - done = False + obs, info = env.reset() + terminated = False test_r = 0 episode_len = 0 info = {} a = agent.act(obs) - obs, r, done, info = env.step(a) + obs, r, terminated, truncated, info = env.step(a) test_r += r episode_len += 1 timestep += 1 - reset = done or episode_len == max_episode_len or info.get("needs_reset", False) - agent.observe(obs, r, done, reset) + reset = terminated or episode_len == max_episode_len or info.get("needs_reset", False) or truncated + agent.observe(obs, r, terminated, reset) if reset: logger.info( "evaluation episode %s length:%s R:%s", len(scores), episode_len, test_r @@ -50,9 +49,9 @@ def _run_episodes( scores.append(float(test_r)) lengths.append(float(episode_len)) if n_steps is None: - terminate = len(scores) >= n_episodes + terminated = len(scores) >= n_episodes else: - terminate = timestep >= n_steps + terminated = timestep >= n_steps # If all steps were used for a single unfinished episode if len(scores) == 0: scores.append(float(test_r)) @@ -120,7 +119,7 @@ def _batch_run_episodes( episode_r = np.zeros(num_envs, dtype=np.float64) episode_len = np.zeros(num_envs, dtype="i") - obss = env.reset() + obss, infos = env.reset() rs = np.zeros(num_envs, dtype="f") termination_conditions = False @@ -130,7 +129,7 @@ def _batch_run_episodes( actions = agent.batch_act(obss) timestep += 1 # o_{t+1}, r_{t+1} - obss, rs, dones, infos = env.step(actions) + obss, rs, terminations, truncations, infos = env.step(actions) episode_r += rs episode_len += 1 # Compute mask for done and reset @@ -139,11 +138,11 @@ def _batch_run_episodes( else: resets = episode_len == max_episode_len resets = np.logical_or( - resets, [info.get("needs_reset", False) for info in infos] + resets, [info.get("needs_reset", False) or truncated for truncated, info in zip(truncations, infos)] ) # Make mask. 0 if done/reset, 1 if pass - end = np.logical_or(resets, dones) + end = np.logical_or(resets, terminations) not_end = np.logical_not(end) for index in range(len(end)): @@ -199,12 +198,12 @@ def _batch_run_episodes( resets.fill(True) # Agent observes the consequences. 
- agent.batch_observe(obss, rs, dones, resets) + agent.batch_observe(obss, rs, terminations, resets) if termination_conditions: break else: - obss = env.reset(not_end) + obss, infos = env.reset(not_end) for i, (epi_len, epi_ret) in enumerate( zip(eval_episode_lens, eval_episode_returns) diff --git a/pfrl/experiments/train_agent.py b/pfrl/experiments/train_agent.py index c28e71b35..81321b9ac 100644 --- a/pfrl/experiments/train_agent.py +++ b/pfrl/experiments/train_agent.py @@ -41,7 +41,7 @@ def train_agent( episode_idx = 0 # o_0, r_0 - obs = env.reset() + obs , info = env.reset() t = step_offset if hasattr(agent, "t"): @@ -54,17 +54,17 @@ def train_agent( # a_t action = agent.act(obs) # o_{t+1}, r_{t+1} - obs, r, done, info = env.step(action) + obs, r, terminated, truncated, info = env.step(action) t += 1 episode_r += r episode_len += 1 - reset = episode_len == max_episode_len or info.get("needs_reset", False) - agent.observe(obs, r, done, reset) + reset = episode_len == max_episode_len or info.get("needs_reset", False) or truncated + agent.observe(obs, r, terminated, reset) for hook in step_hooks: hook(env, agent, t) - episode_end = done or reset or t == steps + episode_end = terminated or reset or t == steps if episode_end: logger.info( @@ -96,7 +96,7 @@ def train_agent( # Start a new episode episode_r = 0 episode_len = 0 - obs = env.reset() + obs, info = env.reset() if checkpoint_freq and t % checkpoint_freq == 0: save_agent(agent, t, outdir, logger, suffix="_checkpoint") diff --git a/pfrl/experiments/train_agent_async.py b/pfrl/experiments/train_agent_async.py index d8b3b4057..9e5971523 100644 --- a/pfrl/experiments/train_agent_async.py +++ b/pfrl/experiments/train_agent_async.py @@ -58,7 +58,7 @@ def save_model(): global_t = 0 local_t = 0 global_episodes = 0 - obs = env.reset() + obs, info = env.reset() episode_len = 0 successful = False @@ -66,12 +66,12 @@ def save_model(): # a_t a = agent.act(obs) # o_{t+1}, r_{t+1} - obs, r, done, info = env.step(a) + obs, r, terminated, truncated, info = env.step(a) local_t += 1 episode_r += r episode_len += 1 - reset = episode_len == max_episode_len or info.get("needs_reset", False) - agent.observe(obs, r, done, reset) + reset = episode_len == max_episode_len or info.get("needs_reset", False) or truncated + agent.observe(obs, r, terminated, reset) # Get and increment the global counter with counter.get_lock(): @@ -81,7 +81,7 @@ def save_model(): for hook in global_step_hooks: hook(env, agent, global_t) - if done or reset or global_t >= steps or stop_event.is_set(): + if terminated or reset or global_t >= steps or stop_event.is_set(): if process_idx == 0: logger.info( "outdir:%s global_step:%s local_step:%s R:%s", @@ -119,7 +119,7 @@ def save_model(): # Start a new episode episode_r = 0 episode_len = 0 - obs = env.reset() + obs, info = env.reset() if process_idx == 0 and exception_event.is_set(): logger.exception("An exception detected, exiting") diff --git a/pfrl/experiments/train_agent_batch.py b/pfrl/experiments/train_agent_batch.py index add7cda81..8826830ef 100644 --- a/pfrl/experiments/train_agent_batch.py +++ b/pfrl/experiments/train_agent_batch.py @@ -54,7 +54,7 @@ def train_agent_batch( episode_len = np.zeros(num_envs, dtype="i") # o_0, r_0 - obss = env.reset() + obss, infos = env.reset() t = step_offset if hasattr(agent, "t"): @@ -66,7 +66,7 @@ def train_agent_batch( # a_t actions = agent.batch_act(obss) # o_{t+1}, r_{t+1} - obss, rs, dones, infos = env.step(actions) + obss, rs, terminations, truncations, infos = env.step(actions) 
episode_r += rs episode_len += 1 @@ -76,13 +76,13 @@ def train_agent_batch( else: resets = episode_len == max_episode_len resets = np.logical_or( - resets, [info.get("needs_reset", False) for info in infos] + resets, [info.get("needs_reset", False) or truncation for truncation, info in zip(truncations, infos)] ) # Agent observes the consequences - agent.batch_observe(obss, rs, dones, resets) + agent.batch_observe(obss, rs, terminations, resets) - # Make mask. 0 if done/reset, 1 if pass - end = np.logical_or(resets, dones) + # Make mask. 0 if termination/reset, 1 if pass + end = np.logical_or(resets, terminations) not_end = np.logical_not(end) # For episodes that ends, do the following: @@ -138,7 +138,7 @@ def train_agent_batch( # Start new episodes if needed episode_r[end] = 0 episode_len[end] = 0 - obss = env.reset(not_end) + obss, infos = env.reset(not_end) except (Exception, KeyboardInterrupt): # Save the current model before being killed diff --git a/pfrl/utils/env_modifiers.py b/pfrl/utils/env_modifiers.py index a605b7b71..2c8b94259 100644 --- a/pfrl/utils/env_modifiers.py +++ b/pfrl/utils/env_modifiers.py @@ -24,11 +24,11 @@ def make_timestep_limited(env, timestep_limit): old_reset = env.reset def step(action): - observation, reward, done, info = old_step(action) + observation, reward, done, truncated, info = old_step(action) if t[0] >= timestep_limit: done = True t[0] += 1 - return observation, reward, done, info + return observation, reward, done, truncated, info def reset(): t[0] = 1 @@ -51,9 +51,9 @@ def make_reward_filtered(env, reward_filter): old_step = env.step def step(action): - observation, reward, done, info = old_step(action) + observation, reward, done, truncated, info = old_step(action) reward = reward_filter(reward) - return observation, reward, done, info + return observation, reward, done, truncated, info env.step = step @@ -73,10 +73,10 @@ def make_action_repeated(env, n_times): def step(action): r_total = 0 for _ in range(n_times): - obs, r, done, info = old_step(action) + obs, r, done, truncated, info = old_step(action) r_total += r if done: break - return obs, r_total, done, info + return obs, r_total, done, truncated, info env.step = step diff --git a/pfrl/utils/pretrained_models.py b/pfrl/utils/pretrained_models.py index 3c7e64d02..37e7bc5a0 100644 --- a/pfrl/utils/pretrained_models.py +++ b/pfrl/utils/pretrained_models.py @@ -162,7 +162,7 @@ def download_model(alg, env, model_type="best"): Args: alg (string): URL to download from. - env (string): Gym Environment name. + env (string): gymnasium Environment name. model_type (string): Either `best` or `final`. Returns: str: Path to the downloaded file. 
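The wrapper changes just below drop PFRL's own `ContinuingTimeLimit` and `Monitor` re-exports; a hedged sketch of the closest built-in Gymnasium stand-ins follows (the wrapper names are from Gymnasium's public API, while the env ID, episode limit, and video folder are only illustrative):

```python
import gymnasium
import gymnasium.wrappers

# render_mode="rgb_array" is needed so frames can be captured for video.
env = gymnasium.make("CartPole-v1", render_mode="rgb_array")

# Time limits: Gymnasium's TimeLimit signals expiry via truncated=True
# instead of setting info["needs_reset"] as ContinuingTimeLimit did.
env = gymnasium.wrappers.TimeLimit(env, max_episode_steps=200)

# Video recording: RecordVideo is the usual replacement for the old
# gym.wrappers.Monitor; here every episode is written to ./videos.
env = gymnasium.wrappers.RecordVideo(
    env, video_folder="videos", episode_trigger=lambda episode_id: True
)
```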
diff --git a/pfrl/wrappers/__init__.py b/pfrl/wrappers/__init__.py index 0f3e99258..ae26a4db5 100644 --- a/pfrl/wrappers/__init__.py +++ b/pfrl/wrappers/__init__.py @@ -1,7 +1,5 @@ from pfrl.wrappers.cast_observation import CastObservation # NOQA from pfrl.wrappers.cast_observation import CastObservationToFloat32 # NOQA -from pfrl.wrappers.continuing_time_limit import ContinuingTimeLimit # NOQA -from pfrl.wrappers.monitor import Monitor # NOQA from pfrl.wrappers.normalize_action_space import NormalizeActionSpace # NOQA from pfrl.wrappers.randomize_action import RandomizeAction # NOQA from pfrl.wrappers.render import Render # NOQA diff --git a/pfrl/wrappers/atari_wrappers.py b/pfrl/wrappers/atari_wrappers.py index 2a4977952..02a821f7e 100644 --- a/pfrl/wrappers/atari_wrappers.py +++ b/pfrl/wrappers/atari_wrappers.py @@ -4,13 +4,14 @@ from collections import deque -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces from packaging import version import pfrl + try: import cv2 @@ -45,10 +46,10 @@ def reset(self, **kwargs): assert noops > 0 obs = None for _ in range(noops): - obs, _, done, info = self.env.step(self.noop_action) - if done or info.get("needs_reset", False): - obs = self.env.reset(**kwargs) - return obs + obs, _, done, truncated, info = self.env.step(self.noop_action) + if done or info.get("needs_reset", False) or truncated: + obs, info = self.env.reset(**kwargs) + return obs, info def step(self, ac): return self.env.step(ac) @@ -63,13 +64,13 @@ def __init__(self, env): def reset(self, **kwargs): self.env.reset(**kwargs) - obs, _, done, info = self.env.step(1) - if done or info.get("needs_reset", False): + obs, _, done, truncated, info = self.env.step(1) + if done or info.get("needs_reset", False) or truncated: self.env.reset(**kwargs) - obs, _, done, info = self.env.step(2) - if done or info.get("needs_reset", False): + obs, _, done, truncated, info = self.env.step(2) + if done or info.get("needs_reset", False) or truncated: self.env.reset(**kwargs) - return obs + return obs, {} def step(self, ac): return self.env.step(ac) @@ -86,8 +87,8 @@ def __init__(self, env): self.needs_real_reset = True def step(self, action): - obs, reward, done, info = self.env.step(action) - self.needs_real_reset = done or info.get("needs_reset", False) + obs, reward, terminated, truncated, info = self.env.step(action) + self.needs_real_reset = terminated or info.get("needs_reset", False) or truncated # check current lives, make loss of life terminal, # then update lives to handle bonus lives lives = self.env.unwrapped.ale.lives() @@ -96,9 +97,9 @@ def step(self, action): # frames # so its important to keep lives > 0, so that we only reset once # the environment advertises done. - done = True + terminated = True self.lives = lives - return obs, reward, done, info + return obs, reward, terminated, truncated, info def reset(self, **kwargs): """Reset only when lives are exhausted. @@ -107,12 +108,12 @@ def reset(self, **kwargs): and the learner need not know about any of this behind-the-scenes.
""" if self.needs_real_reset: - obs = self.env.reset(**kwargs) + obs, info = self.env.reset(**kwargs) else: # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) + obs, _, _, _, info = self.env.step(0) self.lives = self.env.unwrapped.ale.lives() - return obs + return obs, info class MaxAndSkipEnv(gym.Wrapper): @@ -128,19 +129,19 @@ def step(self, action): total_reward = 0.0 done = None for i in range(self._skip): - obs, reward, done, info = self.env.step(action) + obs, reward, done, truncated, info = self.env.step(action) if i == self._skip - 2: self._obs_buffer[0] = obs if i == self._skip - 1: self._obs_buffer[1] = obs total_reward += reward - if done or info.get("needs_reset", False): + if done or info.get("needs_reset", False) or truncated: break # Note that the observation on the done=True frame # doesn't matter max_frame = self._obs_buffer.max(axis=0) - return max_frame, total_reward, done, info + return max_frame, total_reward, done, truncated, info def reset(self, **kwargs): return self.env.reset(**kwargs) @@ -207,15 +208,15 @@ def __init__(self, env, k, channel_order="hwc"): ) def reset(self): - ob = self.env.reset() + ob, info = self.env.reset() for _ in range(self.k): self.frames.append(ob) - return self._get_ob() + return self._get_ob(), info def step(self, action): - ob, reward, done, info = self.env.step(action) + ob, reward, done, truncated, info = self.env.step(action) self.frames.append(ob) - return self._get_ob(), reward, done, info + return self._get_ob(), reward, done, truncated, info def _get_ob(self): assert len(self.frames) == self.k @@ -286,13 +287,11 @@ def observation(self, observation): def make_atari(env_id, max_frames=30 * 60 * 60): - env = gym.make(env_id) + env = gym.make(env_id, + repeat_action_probability=0.0, + full_action_space=False, frameskip=1, + max_num_frames_per_episode=max_frames) assert "NoFrameskip" in env.spec.id - assert isinstance(env, gym.wrappers.TimeLimit) - # Unwrap TimeLimit wrapper because we use our own time limits - env = env.env - if max_frames: - env = pfrl.wrappers.ContinuingTimeLimit(env, max_episode_steps=max_frames) env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) return env diff --git a/pfrl/wrappers/cast_observation.py b/pfrl/wrappers/cast_observation.py index 4519e6fd4..2fc853243 100644 --- a/pfrl/wrappers/cast_observation.py +++ b/pfrl/wrappers/cast_observation.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import numpy as np diff --git a/pfrl/wrappers/continuing_time_limit.py b/pfrl/wrappers/continuing_time_limit.py deleted file mode 100644 index 04d7bec4f..000000000 --- a/pfrl/wrappers/continuing_time_limit.py +++ /dev/null @@ -1,41 +0,0 @@ -import gym - - -class ContinuingTimeLimit(gym.Wrapper): - """TimeLimit wrapper for continuing environments. - - This is similar gym.wrappers.TimeLimit, which sets a time limit for - each episode, except that done=False is returned and that - info['needs_reset'] is set to True when past the limit. - - Code that calls env.step is responsible for checking the info dict, the - fourth returned value, and resetting the env if it has the 'needs_reset' - key and its value is True. - - Args: - env (gym.Env): Env to wrap. - max_episode_steps (int): Maximum number of timesteps during an episode, - after which the env needs a reset. 
- """ - - def __init__(self, env, max_episode_steps): - super(ContinuingTimeLimit, self).__init__(env) - self._max_episode_steps = max_episode_steps - - self._elapsed_steps = None - - def step(self, action): - assert ( - self._elapsed_steps is not None - ), "Cannot call env.step() before calling reset()" - observation, reward, done, info = self.env.step(action) - self._elapsed_steps += 1 - - if self._max_episode_steps <= self._elapsed_steps: - info["needs_reset"] = True - - return observation, reward, done, info - - def reset(self): - self._elapsed_steps = 0 - return self.env.reset() diff --git a/pfrl/wrappers/gym_wrapper.py b/pfrl/wrappers/gym_wrapper.py new file mode 100644 index 000000000..728cb40c6 --- /dev/null +++ b/pfrl/wrappers/gym_wrapper.py @@ -0,0 +1,16 @@ +import gymnasium + + +class GymWrapper(gymnasium.Env): + def __init__(self, gym_env): + """A Gymnasium environment that wraps OpenAI gym environments.""" + super(GymWrapper, self).__init__() + self.env = gym_env + + def reset(self, **kwargs): + obs = self.env.reset() + return obs, {} + + def step(self, action): + obs, reward, done, info = self.env.step(action) + return obs, reward, done, False, info diff --git a/pfrl/wrappers/monitor.py b/pfrl/wrappers/monitor.py deleted file mode 100644 index 4e8e842da..000000000 --- a/pfrl/wrappers/monitor.py +++ /dev/null @@ -1,113 +0,0 @@ -import time -from logging import getLogger - -try: - from gym.wrappers import Monitor as _GymMonitor -except ImportError: - - class _Stub: - def __init__(self, *args, **kwargs): - raise RuntimeError("Monitor is not available in this version of gym") - - class _GymMonitor(_Stub): # type: ignore - pass - - class _GymStatsRecorder(_Stub): - pass - -else: - from gym.wrappers.monitoring.stats_recorder import StatsRecorder as _GymStatsRecorder # type: ignore # isort: skip # noqa: E501 - - -class Monitor(_GymMonitor): - """`Monitor` with PFRL's `ContinuingTimeLimit` support. - - `Agent` in PFRL might reset the env even when `done=False` - if `ContinuingTimeLimit` returns `info['needs_reset']=True`, - which is not expected for `gym.Monitor`. - - For details, see - https://github.com/openai/gym/blob/master/gym/wrappers/monitor.py - """ - - def _start( - self, - directory, - video_callable=None, - force=False, - resume=False, - write_upon_reset=False, - uid=None, - mode=None, - ): - if self.env_semantics_autoreset: - raise NotImplementedError( - "Detect 'semantics.autoreset=True' in `env.metadata`, " - "which means the env is from deprecated OpenAI Universe." - ) - ret = super()._start( - directory=directory, - video_callable=video_callable, - force=force, - resume=resume, - write_upon_reset=write_upon_reset, - uid=uid, - mode=mode, - ) - env_id = self.stats_recorder.env_id - self.stats_recorder = _StatsRecorder( - directory, - "{}.episode_batch.{}".format(self.file_prefix, self.file_infix), - autoreset=False, - env_id=env_id, - ) - if mode is not None: - self._set_mode(mode) - return ret - - -class _StatsRecorder(_GymStatsRecorder): - """`StatsRecorder` with PFRL's `ContinuingTimeLimit` support. 
- - For details, see - https://github.com/openai/gym/blob/master/gym/wrappers/monitoring/stats_recorder.py - """ - - def __init__( - self, - directory, - file_prefix, - autoreset=False, - env_id=None, - logger=getLogger(__name__), - ): - super().__init__(directory, file_prefix, autoreset=autoreset, env_id=env_id) - self._save_completed = True - self.logger = logger - - def before_reset(self): - assert not self.closed - - if self.done is not None and not self.done and self.steps > 0: - self.logger.debug( - "Tried to reset the env which is not done=True. " - "StatsRecorder completes the last episode." - ) - self.save_complete() - - self.done = False - if self.initial_reset_timestamp is None: - self.initial_reset_timestamp = time.time() - - def after_step(self, observation, reward, done, info): - self._save_completed = False - return super().after_step(observation, reward, done, info) - - def save_complete(self): - if not self._save_completed: - super().save_complete() - self._save_completed = True - - def close(self): - self.save_complete() - super().close() diff --git a/pfrl/wrappers/normalize_action_space.py b/pfrl/wrappers/normalize_action_space.py index dbf0ed24f..3e485c91f 100644 --- a/pfrl/wrappers/normalize_action_space.py +++ b/pfrl/wrappers/normalize_action_space.py @@ -1,15 +1,15 @@ -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import numpy as np -class NormalizeActionSpace(gym.ActionWrapper): +class NormalizeActionSpace(gymnasium.ActionWrapper): """Normalize a Box action space to [-1, 1]^n.""" def __init__(self, env): super().__init__(env) - assert isinstance(env.action_space, gym.spaces.Box) - self.action_space = gym.spaces.Box( + assert isinstance(env.action_space, gymnasium.spaces.Box) + self.action_space = gymnasium.spaces.Box( low=-np.ones_like(env.action_space.low), high=np.ones_like(env.action_space.low), ) diff --git a/pfrl/wrappers/randomize_action.py b/pfrl/wrappers/randomize_action.py index 9390f33bf..d9485aa8c 100644 --- a/pfrl/wrappers/randomize_action.py +++ b/pfrl/wrappers/randomize_action.py @@ -1,21 +1,21 @@ -import gym +import gymnasium import numpy as np -class RandomizeAction(gym.ActionWrapper): +class RandomizeAction(gymnasium.ActionWrapper): """Apply a random action instead of the one sent by the agent. This wrapper can be used to make a stochastic env. The common use is for evaluation in Atari environments, where actions are replaced with random ones with a low probability. - Only gym.spaces.Discrete is supported as an action space. + Only gymnasium.spaces.Discrete is supported as an action space. For exploration during training, use explorers like pfrl.explorers.ConstantEpsilonGreedy instead of this wrapper. Args: - env (gym.Env): Env to wrap. + env (gymnasium.Env): Env to wrap. random_fraction (float): Fraction of actions that will be replaced with a random action. It must be in [0, 1]. 
""" @@ -24,17 +24,19 @@ def __init__(self, env, random_fraction): super().__init__(env) assert 0 <= random_fraction <= 1 assert isinstance( - env.action_space, gym.spaces.Discrete - ), "RandomizeAction supports only gym.spaces.Discrete as an action space" + env.action_space, gymnasium.spaces.Discrete + ), "RandomizeAction supports only gymnasium.spaces.Discrete as an action space" self._random_fraction = random_fraction - self._np_random = np.random.RandomState() + self._rng = np.random.RandomState() def action(self, action): - if self._np_random.rand() < self._random_fraction: - return self._np_random.randint(self.env.action_space.n) + if self._rng.rand() < self._random_fraction: + return self._rng.randint(self.env.action_space.n) else: return action - def seed(self, seed): - super().seed(seed) - self._np_random.seed(seed) + def reset(self, **kwargs): + if 'seed' in kwargs: + self._rng = np.random.RandomState(kwargs['seed']) + return self.env.reset(**kwargs) + diff --git a/pfrl/wrappers/render.py b/pfrl/wrappers/render.py index 6dc0c0384..83dede7aa 100644 --- a/pfrl/wrappers/render.py +++ b/pfrl/wrappers/render.py @@ -1,11 +1,11 @@ -import gym +import gymnasium -class Render(gym.Wrapper): +class Render(gymnasium.Wrapper): """Render env by calling its render method. Args: - env (gym.Env): Env to wrap. + env (gymnasium.Env): Env to wrap. **kwargs: Keyword arguments passed to the render method. """ @@ -16,7 +16,7 @@ def __init__(self, env, **kwargs): def reset(self, **kwargs): ret = self.env.reset(**kwargs) self.env.render(**self._kwargs) - return ret + return ret, {} def step(self, action): ret = self.env.step(action) diff --git a/pfrl/wrappers/scale_reward.py b/pfrl/wrappers/scale_reward.py index 784616da5..d34a238d3 100644 --- a/pfrl/wrappers/scale_reward.py +++ b/pfrl/wrappers/scale_reward.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym class ScaleReward(gym.RewardWrapper): diff --git a/pfrl/wrappers/vector_frame_stack.py b/pfrl/wrappers/vector_frame_stack.py index 5596f5b87..1165b7e18 100644 --- a/pfrl/wrappers/vector_frame_stack.py +++ b/pfrl/wrappers/vector_frame_stack.py @@ -1,14 +1,14 @@ from collections import deque import numpy as np -from gym import spaces +from gymnasium import spaces from pfrl.env import VectorEnv from pfrl.wrappers.atari_wrappers import LazyFrames class VectorEnvWrapper(VectorEnv): - """VectorEnv analog to gym.Wrapper.""" + """VectorEnv analog to gymnasium.Wrapper.""" def __init__(self, env): self.env = env @@ -88,13 +88,13 @@ def reset(self, mask=None): if not m: for _ in range(self.k): frames.append(ob) - return self._get_ob() + return self._get_ob(), {} def step(self, action): - batch_ob, reward, done, info = self.env.step(action) + batch_ob, reward, terminated, _, info = self.env.step(action) for frames, ob in zip(self.frames, batch_ob): frames.append(ob) - return self._get_ob(), reward, done, info + return self._get_ob(), reward, terminated, info def _get_ob(self): assert len(self.frames) == self.env.num_envs diff --git a/requirements.txt b/requirements.txt index 45b6e8b0b..2ac56ecd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.3.0 -gym>=0.9.7 +gymnasium>=0.9.7 numpy>=1.10.4 filelock pillow diff --git a/setup.cfg b/setup.cfg index 808dfd412..84f1c2234 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ [mypy-torch.*] ignore_missing_imports = True -[mypy-gym.*] +[mypy-gymnasium.*] ignore_missing_imports = True [mypy-numpy.*] diff --git a/setup.py b/setup.py index 47a5e4ef2..e037cf196 100644 --- a/setup.py +++ 
b/setup.py @@ -4,8 +4,8 @@ install_requires = [ 'torch>=1.3.0', - 'gym>=0.9.7', - 'numpy>=1.10.4', + 'gymnasium[atari]', + 'numpy>=1.11.0', 'pillow', 'filelock', ] diff --git a/tests/envs_tests/test_vector_envs.py b/tests/envs_tests/test_vector_envs.py index 768c09cc1..7a89e9984 100644 --- a/tests/envs_tests/test_vector_envs.py +++ b/tests/envs_tests/test_vector_envs.py @@ -1,4 +1,4 @@ -import gym +import gymnasium import numpy as np import pytest @@ -21,16 +21,16 @@ def setUp(self, num_envs, env_id, random_seed_offset, vector_env_to_test): # Init VectorEnv to test if self.vector_env_to_test == "SerialVectorEnv": self.vec_env = pfrl.envs.SerialVectorEnv( - [gym.make(self.env_id) for _ in range(self.num_envs)] + [gymnasium.make(self.env_id) for _ in range(self.num_envs)] ) elif self.vector_env_to_test == "MultiprocessVectorEnv": self.vec_env = pfrl.envs.MultiprocessVectorEnv( - [(lambda: gym.make(self.env_id)) for _ in range(self.num_envs)] + [(lambda: gymnasium.make(self.env_id)) for _ in range(self.num_envs)] ) else: assert False # Init envs to compare against - self.envs = [gym.make(self.env_id) for _ in range(self.num_envs)] + self.envs = [gymnasium.make(self.env_id) for _ in range(self.num_envs)] def teardown_method(self): # Delete so that all the subprocesses are joined @@ -59,14 +59,15 @@ def test_seed_reset_and_step(self): # step actions = [env.action_space.sample() for env in self.envs] - real_obss, real_rewards, real_dones, real_infos = zip( + real_obss, real_rewards, real_terminations, real_truncations, real_infos = zip( *[env.step(action) for env, action in zip(self.envs, actions)] ) - obss, rewards, dones, infos = self.vec_env.step(actions) + obss, rewards, terminations, truncations, infos = self.vec_env.step(actions) np.testing.assert_allclose(obss, real_obss) assert rewards == real_rewards - assert dones == real_dones + assert terminations == real_terminations assert infos == real_infos + assert truncations == real_truncations # reset with full mask should have no effect mask = np.ones(self.num_envs) diff --git a/tests/experiments_tests/test_evaluator.py b/tests/experiments_tests/test_evaluator.py index 6c0d96b23..2f6f82791 100644 --- a/tests/experiments_tests/test_evaluator.py +++ b/tests/experiments_tests/test_evaluator.py @@ -21,8 +21,8 @@ def test_evaluator_evaluate_if_necessary(save_best_so_far_agent, n_steps, n_epis agent.get_statistics.return_value = [] env = mock.Mock() - env.reset.return_value = "obs" - env.step.return_value = ("obs", 0, True, {}) + env.reset.return_value = "obs", {} + env.step.return_value = ("obs", 0, True, False, {}) env.get_statistics.return_value = [] evaluation_hook = mock.create_autospec( @@ -88,7 +88,7 @@ def test_evaluator_evaluate_if_necessary(save_best_so_far_agent, n_steps, n_epis assert agent.save.call_count == 0 # Third evaluation with a better score - env.step.return_value = ("obs", 1, True, {}) + env.step.return_value = ("obs", 1, True, False, {}) agent_evaluator.evaluate_if_necessary(t=9, episodes=9) assert agent.act.call_count == 3 * value assert agent.observe.call_count == 3 * value @@ -110,8 +110,8 @@ def test_async_evaluator_evaluate_if_necessary(save_best_so_far_agent, n_episode agent.get_statistics.return_value = [] env = mock.Mock() - env.reset.return_value = "obs" - env.step.return_value = ("obs", 0, True, {}) + env.reset.return_value = "obs", {} + env.step.return_value = ("obs", 0, True, False, {}) env.get_statistics.return_value = [] evaluation_hook = mock.create_autospec( @@ -158,7 +158,7 @@ def 
test_async_evaluator_evaluate_if_necessary(save_best_so_far_agent, n_episode assert agent.save.call_count == 0 # Third evaluation with a better score - env.step.return_value = ("obs", 1, True, {}) + env.step.return_value = ("obs", 1, True, False, {}) agent_evaluator.evaluate_if_necessary(t=9, episodes=9, env=env, agent=agent) assert agent.act.call_count == 3 * n_episodes assert agent.observe.call_count == 3 * n_episodes @@ -179,12 +179,12 @@ def test_run_evaluation_episodes_with_n_steps(n_episodes, n_steps): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0.1, False, {}), - (("state", 2), 0.2, False, {}), - (("state", 3), 0.3, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0.1, False, False, {}), + (("state", 2), 0.2, False, False, {}), + (("state", 3), 0.3, False, True, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] if n_episodes: @@ -226,12 +226,12 @@ def test_needs_reset(self): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), 0, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), 0, False, True, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] scores, lengths = evaluator.run_evaluation_episodes( env, agent, n_steps=None, n_episodes=2 @@ -261,11 +261,11 @@ def make_env(idx): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0.1, False, {}), - (("state", 3), 0.2, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0.1, False, False, {}), + (("state", 3), 0.2, False, False, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), (("state", 7), 1, True, {}), ] else: @@ -274,11 +274,11 @@ def make_env(idx): # Third episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 2), ("state", 4)] env.step.side_effect = [ - (("state", 1), 2, False, {"needs_reset": True}), - (("state", 3), 3, False, {"needs_reset": True}), - (("state", 5), -0.6, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 2, False, False, {"needs_reset": True}), + (("state", 3), 3, False, False, {"needs_reset": True}), + (("state", 5), -0.6, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] return env @@ -326,12 +326,12 @@ def make_env(idx): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), 0, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 
3), 0, False, False, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] else: # First episode: 0 -> 1 (reset) @@ -339,11 +339,11 @@ def make_env(idx): # Third episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 2), ("state", 4)] env.step.side_effect = [ - (("state", 1), 2, False, {"needs_reset": True}), - (("state", 3), 3, False, {"needs_reset": True}), - (("state", 5), -0.6, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 2, False, False, {"needs_reset": True}), + (("state", 3), 3, False, False, {"needs_reset": True}), + (("state", 5), -0.6, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] return env diff --git a/tests/experiments_tests/test_train_agent.py b/tests/experiments_tests/test_train_agent.py index a83315339..d60249f57 100644 --- a/tests/experiments_tests/test_train_agent.py +++ b/tests/experiments_tests/test_train_agent.py @@ -16,11 +16,11 @@ def test(self): # Reaches the terminal state after five actions env.reset.side_effect = [("state", 0)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), -0.5, False, {}), - (("state", 4), 0, False, {}), - (("state", 5), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), -0.5, False, False, {}), + (("state", 4), 0, False, False, {}), + (("state", 5), 1, True, False, {}), ] hook = mock.Mock() @@ -57,12 +57,12 @@ def test_needs_reset(self): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), 0, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), 0, False, False, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False,{}), + (("state", 7), 1, True, False, {}), ] hook = mock.Mock() @@ -141,11 +141,11 @@ def test_eval_during_episode(eval_during_episode): # Two episodes env.reset.side_effect = [("state", 0)] * 2 env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), -0.5, True, {}), - (("state", 4), 0, False, {}), - (("state", 5), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), -0.5, True, False, {}), + (("state", 4), 0, False, False, {}), + (("state", 5), 1, True, False, {}), ] evaluator = mock.Mock() diff --git a/tests/experiments_tests/test_train_agent_async.py b/tests/experiments_tests/test_train_agent_async.py index 4d023269d..8437ffc01 100644 --- a/tests/experiments_tests/test_train_agent_async.py +++ b/tests/experiments_tests/test_train_agent_async.py @@ -26,16 +26,16 @@ def _make_env(process_idx, test): if max_episode_len is None: # Episodic env that terminates after 5 actions env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), -0.5, False, {}), - (("state", 4), 0, False, {}), - (("state", 5), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), -0.5, False, False, {}), + (("state", 4), 0, False, False, {}), + (("state", 5), 1, True, False, 
{}), ] * 1000 else: # Continuing env env.step.side_effect = [ - (("state", 1), 0, False, {}), + (("state", 1), 0, False,False, {}), ] * 1000 return env @@ -154,12 +154,12 @@ def test_needs_reset(self): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), 0, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), 0, False, False, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] counter = mp.Value("i", 0) diff --git a/tests/experiments_tests/test_train_agent_batch.py b/tests/experiments_tests/test_train_agent_batch.py index 2c9d19c00..1e318c05b 100644 --- a/tests/experiments_tests/test_train_agent_batch.py +++ b/tests/experiments_tests/test_train_agent_batch.py @@ -24,16 +24,16 @@ def make_env(): if max_episode_len is None: # Episodic env that terminates after 5 actions env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), -0.5, False, {}), - (("state", 4), 0, False, {}), - (("state", 5), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), -0.5, False, False, {}), + (("state", 4), 0, False, False, {}), + (("state", 5), 1, True, False, {}), ] * 1000 else: # Continuing env env.step.side_effect = [ - (("state", 1), 0, False, {}), + (("state", 1), 0, False, False, {}), ] * 1000 return env @@ -193,12 +193,12 @@ def make_env(idx): # Second episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 0, False, {}), - (("state", 3), 0, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 0, False, False, {}), + (("state", 3), 0, False, False, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] else: # First episode: 0 -> 1 (reset) @@ -206,11 +206,11 @@ def make_env(idx): # Third episode: 4 -> 5 -> 6 -> 7 (done) env.reset.side_effect = [("state", 0), ("state", 2), ("state", 4)] env.step.side_effect = [ - (("state", 1), 0, False, {"needs_reset": True}), - (("state", 3), 0, False, {"needs_reset": True}), - (("state", 5), -0.5, False, {}), - (("state", 6), 0, False, {}), - (("state", 7), 1, True, {}), + (("state", 1), 0, False, False, {"needs_reset": True}), + (("state", 3), 0, False, False, {"needs_reset": True}), + (("state", 5), -0.5, False, False, {}), + (("state", 6), 0, False, False, {}), + (("state", 7), 1, True, False, {}), ] return env diff --git a/tests/wrappers_tests/test_atari_wrappers.py b/tests/wrappers_tests/test_atari_wrappers.py index 04a21b573..f0be506e0 100644 --- a/tests/wrappers_tests/test_atari_wrappers.py +++ b/tests/wrappers_tests/test_atari_wrappers.py @@ -4,8 +4,8 @@ from unittest import mock -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import numpy as np import pytest @@ -45,8 +45,8 @@ def dtyped_rand(): ) for _ in range(steps) ] - env.action_space = gym.spaces.Discrete(2) - env.observation_space = gym.spaces.Box( + 
env.action_space = gymnasium.spaces.Discrete(2) + env.observation_space = gymnasium.spaces.Box( low=low, high=high, shape=(1, 84, 84), dtype=dtype ) return env @@ -73,8 +73,8 @@ def dtyped_rand(): for _ in range(steps - 1): action = env.action_space.sample() fs_action = fs_env.action_space.sample() - obs, r, done, info = env.step(action) - fs_obs, fs_r, fs_done, fs_info = fs_env.step(fs_action) + obs, r, done, _, info = env.step(action) + fs_obs, fs_r, fs_done, _, fs_info = fs_env.step(fs_action) assert isinstance(fs_obs, LazyFrames) np.testing.assert_allclose( obs.take(indices=0, axis=fs_env.stack_axis), @@ -116,8 +116,8 @@ def dtyped_rand(): ) for _ in range(steps) ] - env.action_space = gym.spaces.Discrete(2) - env.observation_space = gym.spaces.Box( + env.action_space = gymnasium.spaces.Discrete(2) + env.observation_space = gymnasium.spaces.Box( low=low, high=high, shape=(1, 84, 84), dtype=dtype ) return env @@ -140,8 +140,8 @@ def dtyped_rand(): for _ in range(steps - 1): action = env.action_space.sample() s_action = s_env.action_space.sample() - obs, r, done, info = env.step(action) - s_obs, s_r, s_done, s_info = s_env.step(s_action) + obs, r, terminated, _, info = env.step(action) + s_obs, s_r, s_terminated, _, s_info = s_env.step(s_action) np.testing.assert_allclose(np.array(obs) / s_env.scale, s_obs) assert r == s_r - assert done == s_done + assert terminated == s_terminated diff --git a/tests/wrappers_tests/test_cast_observation.py b/tests/wrappers_tests/test_cast_observation.py index f6fac6269..5f925fb39 100644 --- a/tests/wrappers_tests/test_cast_observation.py +++ b/tests/wrappers_tests/test_cast_observation.py @@ -1,4 +1,4 @@ -import gym +import gymnasium import numpy as np import pytest @@ -8,7 +8,7 @@ @pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) def test_cast_observation(env_id, dtype): - env = pfrl.wrappers.CastObservation(gym.make(env_id), dtype=dtype) + env = pfrl.wrappers.CastObservation(gymnasium.make(env_id), dtype=dtype) rtol = 1e-3 if dtype == np.float16 else 1e-7 obs = env.reset() @@ -16,7 +16,7 @@ def test_cast_observation(env_id, dtype): assert obs.dtype == dtype np.testing.assert_allclose(env.original_observation, obs, rtol=rtol) - obs, r, done, info = env.step(env.action_space.sample()) + obs, r, done, _, info = env.step(env.action_space.sample()) assert env.original_observation.dtype == np.float64 assert obs.dtype == dtype @@ -25,14 +25,14 @@ def test_cast_observation(env_id, dtype): @pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) def test_cast_observation_to_float32(env_id): - env = pfrl.wrappers.CastObservationToFloat32(gym.make(env_id)) + env = pfrl.wrappers.CastObservationToFloat32(gymnasium.make(env_id)) obs = env.reset() assert env.original_observation.dtype == np.float64 assert obs.dtype == np.float32 np.testing.assert_allclose(env.original_observation, obs) - obs, r, done, info = env.step(env.action_space.sample()) + obs, r, done, _, info = env.step(env.action_space.sample()) assert env.original_observation.dtype == np.float64 assert obs.dtype == np.float32 np.testing.assert_allclose(env.original_observation, obs) diff --git a/tests/wrappers_tests/test_continuing_time_limit.py b/tests/wrappers_tests/test_continuing_time_limit.py deleted file mode 100644 index 9a20d93c5..000000000 --- a/tests/wrappers_tests/test_continuing_time_limit.py +++ /dev/null @@ -1,31 +0,0 @@ -from unittest import mock - -import pytest - -import pfrl - - 
-@pytest.mark.parametrize("max_episode_steps", [1, 2, 3]) -def test_continuing_time_limit(max_episode_steps): - env = mock.Mock() - env.reset.side_effect = ["state"] * 2 - # Since info dicts are modified by the wapper, each step call needs to - # return a new info dict. - env.step.side_effect = [("state", 0, False, {}) for _ in range(6)] - env = pfrl.wrappers.ContinuingTimeLimit(env, max_episode_steps=max_episode_steps) - - env.reset() - for t in range(2): - _, _, done, info = env.step(0) - if t + 1 >= max_episode_steps: - assert info["needs_reset"] - else: - assert not info.get("needs_reset", False) - - env.reset() - for t in range(4): - _, _, done, info = env.step(0) - if t + 1 >= max_episode_steps: - assert info["needs_reset"] - else: - assert not info.get("needs_reset", False) diff --git a/tests/wrappers_tests/test_monitor.py b/tests/wrappers_tests/test_monitor.py deleted file mode 100644 index ba65e9cc9..000000000 --- a/tests/wrappers_tests/test_monitor.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import shutil -import tempfile - -import gym -import pytest -from gym.wrappers import TimeLimit - -import pfrl - - -@pytest.mark.parametrize("n_episodes", [1, 2, 3, 4]) -def test_monitor(n_episodes): - steps = 15 - - env = gym.make("CartPole-v1") - # unwrap default TimeLimit and wrap with new one to simulate done=True - # at step 5 - assert isinstance(env, TimeLimit) - env = env.env # unwrap - env = TimeLimit(env, max_episode_steps=5) # wrap - - tmpdir = tempfile.mkdtemp() - try: - env = pfrl.wrappers.Monitor( - env, directory=tmpdir, video_callable=lambda episode_id: True - ) - episode_idx = 0 - episode_len = 0 - t = 0 - _ = env.reset() - while True: - _, _, done, info = env.step(env.action_space.sample()) - episode_len += 1 - t += 1 - if episode_idx == 1 and episode_len >= 3: - info["needs_reset"] = True # simulate ContinuingTimeLimit - if done or info.get("needs_reset", False) or t == steps: - if episode_idx + 1 == n_episodes or t == steps: - break - env.reset() - episode_idx += 1 - episode_len = 0 - # `env.close()` is called when `env` is gabage-collected - # (or explicitly deleted/closed). 
- del env - # check if videos & meta files were generated - files = os.listdir(tmpdir) - mp4s = [f for f in files if f.endswith(".mp4")] - metas = [f for f in files if f.endswith(".meta.json")] - stats = [f for f in files if f.endswith(".stats.json")] - manifests = [f for f in files if f.endswith(".manifest.json")] - assert len(mp4s) == n_episodes - assert len(metas) == n_episodes - assert len(stats) == 1 - assert len(manifests) == 1 - - finally: - shutil.rmtree(tmpdir) diff --git a/tests/wrappers_tests/test_randomize_action.py b/tests/wrappers_tests/test_randomize_action.py index b9f7a6c13..d0c204a70 100644 --- a/tests/wrappers_tests/test_randomize_action.py +++ b/tests/wrappers_tests/test_randomize_action.py @@ -1,20 +1,21 @@ -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import numpy as np import pytest import pfrl -class ActionRecordingEnv(gym.Env): - observation_space = gym.spaces.Box(low=-1, high=1, shape=(1,)) - action_space = gym.spaces.Discrete(3) +class ActionRecordingEnv(gymnasium.Env): + + observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) + action_space = gymnasium.spaces.Discrete(3) def __init__(self): self.past_actions = [] def reset(self): - return self.observation_space.sample() + return self.observation_space.sample(), {} def step(self, action): self.past_actions.append(action) diff --git a/tests/wrappers_tests/test_render.py b/tests/wrappers_tests/test_render.py index 64c347370..4e555b37a 100644 --- a/tests/wrappers_tests/test_render.py +++ b/tests/wrappers_tests/test_render.py @@ -21,39 +21,41 @@ def test_render(render_kwargs): ("state", 3), ] orig_env.step.side_effect = [ - (("state", 1), 0, False, {}), - (("state", 2), 1, True, {}), + (("state", 1), 0, False, False, {}), + (("state", 2), 1, True, False, {}), ] env = pfrl.wrappers.Render(orig_env, **render_kwargs) # Not called env.render yet assert orig_env.render.call_count == 0 - obs = env.reset() + obs, _ = env.reset() assert obs == ("state", 0) # Called once assert orig_env.render.call_count == 1 - obs, reward, done, info = env.step(0) + obs, reward, terminated, truncated, info = env.step(0) assert obs == ("state", 1) assert reward == 0 - assert not done + assert not terminated + assert not truncated assert info == {} # Called twice assert orig_env.render.call_count == 2 - obs, reward, done, info = env.step(0) + obs, reward, terminated, truncated, info = env.step(0) assert obs == ("state", 2) assert reward == 1 - assert done + assert terminated + assert not truncated assert info == {} # Called thrice assert orig_env.render.call_count == 3 - obs = env.reset() + obs, _ = env.reset() assert obs == ("state", 3) # Called four times diff --git a/tests/wrappers_tests/test_scale_reward.py b/tests/wrappers_tests/test_scale_reward.py index 027287461..4bb95f720 100644 --- a/tests/wrappers_tests/test_scale_reward.py +++ b/tests/wrappers_tests/test_scale_reward.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import numpy as np import pytest diff --git a/tests/wrappers_tests/test_vector_frame_stack.py b/tests/wrappers_tests/test_vector_frame_stack.py index 1a236a1ff..e739c1ed8 100644 --- a/tests/wrappers_tests/test_vector_frame_stack.py +++ b/tests/wrappers_tests/test_vector_frame_stack.py @@ -2,8 +2,8 @@ import unittest from unittest import mock -import gym -import gym.spaces +import gymnasium +import gymnasium.spaces import numpy as np import pytest @@ -43,8 +43,8 @@ def make_env(idx): ) for _ in range(steps) ] - env.action_space = gym.spaces.Discrete(2) - 
env.observation_space = gym.spaces.Box( + env.action_space = gymnasium.spaces.Discrete(2) + env.observation_space = gymnasium.spaces.Box( low=0, high=255, shape=(1, 84, 84), dtype=np.uint8 ) return env @@ -72,8 +72,8 @@ def make_env(idx): assert fs_env.action_space == vfs_env.action_space assert fs_env.observation_space == vfs_env.observation_space - fs_obs = fs_env.reset() - vfs_obs = vfs_env.reset() + fs_obs, _ = fs_env.reset() + vfs_obs, _ = vfs_env.reset() # Same LazyFrames observations for env_idx in range(num_envs): @@ -84,8 +84,8 @@ def make_env(idx): ) batch_action = [0] * num_envs - fs_new_obs, fs_r, fs_done, _ = fs_env.step(batch_action) - vfs_new_obs, vfs_r, vfs_done, _ = vfs_env.step(batch_action) + fs_new_obs, fs_r, fs_done, _, _ = fs_env.step(batch_action) + vfs_new_obs, vfs_r, vfs_done, _, _ = vfs_env.step(batch_action) # Same LazyFrames observations, but those from fs_env are copies # while those from vfs_env are references. @@ -105,8 +105,8 @@ def make_env(idx): for _ in range(steps - 1): fs_env.reset(mask=np.logical_not(fs_done)) vfs_env.reset(mask=np.logical_not(vfs_done)) - fs_obs, fs_r, fs_done, _ = fs_env.step(batch_action) - vfs_obs, vfs_r, vfs_done, _ = vfs_env.step(batch_action) + fs_obs, fs_r, fs_terminated, _, _ = fs_env.step(batch_action) + vfs_obs, vfs_r, vfs_terminated, _, _ = vfs_env.step(batch_action) for env_idx in range(num_envs): assert isinstance(fs_new_obs[env_idx], LazyFrames) assert isinstance(vfs_new_obs[env_idx], LazyFrames) @@ -114,4 +114,4 @@ def make_env(idx): np.asarray(fs_new_obs[env_idx]), np.asarray(vfs_new_obs[env_idx]) ) np.testing.assert_allclose(fs_r, vfs_r) - np.testing.assert_allclose(fs_done, vfs_done) + np.testing.assert_allclose(fs_terminated, vfs_terminated)
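The new pfrl/wrappers/gym_wrapper.py above bridges environments that still speak the legacy four-tuple OpenAI Gym API into the Gymnasium-style API used everywhere else in this patch. A short usage sketch, assuming a legacy gym install and an illustrative env id; note that the wrapper does not copy action_space/observation_space, so they are read from the wrapped env here:

import gym  # legacy OpenAI Gym (< 0.26), assumed to be installed alongside gymnasium
from pfrl.wrappers.gym_wrapper import GymWrapper

legacy_env = gym.make("CartPole-v1")   # old API: reset() -> obs, step() -> 4-tuple
env = GymWrapper(legacy_env)

obs, info = env.reset()                        # Gymnasium-style (obs, info)
action = legacy_env.action_space.sample()      # spaces still live on the wrapped env
obs, reward, terminated, truncated, info = env.step(action)
assert truncated is False                      # the wrapper never reports truncation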