diff --git a/gymnasium_robotics/envs/maze/ant_maze_v5.py b/gymnasium_robotics/envs/maze/ant_maze_v5.py index ec314c95..edddb120 100644 --- a/gymnasium_robotics/envs/maze/ant_maze_v5.py +++ b/gymnasium_robotics/envs/maze/ant_maze_v5.py @@ -26,187 +26,6 @@ class AntMazeEnv(MazeEnv, EzPickle): - """ - ### Description - - This environment was refactored from the [D4RL](https://github.com/Farama-Foundation/D4RL) repository, introduced by Justin Fu, Aviral Kumar, Ofir Nachum, George Tucker, and Sergey Levine - in ["D4RL: Datasets for Deep Data-Driven Reinforcement Learning"](https://arxiv.org/abs/2004.07219). - - The tasks found in the `AntMaze` environments are the same as the ones in the `PointMaze` environments. However, in this case the agent is the Ant quadruped from the main [Gymnaisum](https://gymnasium.farama.org/environments/mujoco/ant/) repository. - The control frequency of the ant is of `f = 20 Hz`. Each simulation timestep is of `dt=0.01` and the ant robot repeats the same action for 5 simulation steps. - - ### Maze Variations - - #### Maze size - The map variations for the mazes are the same as for `PointMaze`. The ant environments with fixed goal and reset locations are the following: - - * `AntMaze_UMaze-v5` - * `AntMaze_BigMaze-v5` - * `AntMaze_HardestMaze-v5` - - #### Diverse goal mazes - The environments with fixed reset position for the ant and randomly selected goals, also known as diverse goal, are: - - * `AntMaze_BigMaze_DG-v5` - * `AntMaze_HardestMaze_DG-v5` - - #### Diverse goal and reset mazes - - Finally, the environments that select the reset and goal locations randomly are: - - * `AntMaze_BigMaze_DGR-v5` - * `AntMaze_HardestMaze_DGR-v5` - - #### Custom maze - Also, any of the `AntMaze` environments can be initialized with a custom maze map by setting the `maze_map` argument like follows: - - ```python - import gymnasium as gym - import gymnasium_robotics - - gym.register_envs(gymnasium_robotics) - - example_map = [[1, 1, 1, 1, 1], - [1, C, 0, C, 1], - [1, 1, 1, 1, 1]] - - env = gym.make('AntMaze_UMaze-v5', maze_map=example_map) - ``` - - ### Action Space - The action space is the action space of [Gymnasium/MuJoCo/Ant](https://gymnasium.farama.org/environments/mujoco/ant/#action-space): - - The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. 
- - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | - | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | - | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | - | 2 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | - | 3 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | - | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | - - ### Observation Space - The observation is a `goal-aware observation space`. It consists of a dictionary with information about the robot's position and goal. The dictionary consists of the following 3 keys: - - * `observation`: Observations consist of positional values of different body parts of the ant, followed by the velocities of those individual parts (their derivatives) with all - the positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the ant's torso. These values are included in the `achieved_goal` key of the observation. - However, by default, an observation is a `ndarray` with shape `(111,)` if the external contact forces are included with the `use_contact_forces` arguments. 
Otherwise, the shape will be `(27, )` - The elements of the array correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - - The remaining 14*6 = 84 elements of the observation are contact forces (external forces - force x, y, z and torque x, y, z) applied to the center of mass of each of the links. 
The 14 links are: the ground link, - the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces. These elements are included only if at the environments initialization the argument `use_contact_forces` is set to `True`. - - * `desired_goal`: this key represents the final goal to be achieved. In this environment it is a 2-dimensional `ndarray`, `(2,)`, that consists of the two cartesian coordinates of the desired final ant torso position `[x,y]`. The elements of the array are the following: - - | Num | Observation | Min | Max | Site Name (in corresponding XML file) |Unit | - |-----|------------------------ |--------|--------|---------------------------------------|--------------| - | 0 | Final goal x coordinate | -Inf | Inf | target | position (m) | - | 1 | Final goal y coordinate | -Inf | Inf | target | position (m) | - - * `achieved_goal`: this key represents the current state of the ant's torso, as if it would have achieved a goal. This is useful for goal orientated learning algorithms such as those that use [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495) (HER). - The value is an `ndarray` with shape `(2,)`. The elements of the array are the following: - - | Num | Observation | Min | Max | Site Name (in corresponding XML file) |Unit | - |-----|------------------------------------------------|--------|--------|---------------------------------------|--------------| - | 0 | Current goal ant position in the x coordinate | -Inf | Inf | torso | position (m) | - | 1 | Current goal ant position in the y coordinate | -Inf | Inf | torso | position (m) | - - ### Rewards - - The reward can be initialized as `sparse` or `dense`: - - *sparse*: the returned reward can have two values: `0` if the ant hasn't reached its final target position, and `1` if the ant is in the final target position (the ant is considered to have reached the goal if the Euclidean distance between both is lower than 0.5 m). - - *dense*: the returned reward is the negative Euclidean distance between the achieved goal position and the desired goal. - - To initialize this environment with one of the mentioned reward functions the type of reward must be specified in the id string when the environment is initialized. For `sparse` reward the id is the default of the environment, `AntMaze_UMaze-v5`. However, for `dense` - reward the id must be modified to `AntMaze_UMazeDense-v5` and initialized as follows: - - ```python - import gymnasium as gym - import gymnasium_robotics - - gym.register_envs(gymnasium_robotics) - - env = gym.make('AntMaze_UMaze-v5') - ``` - - ### Starting State - The goal and initial placement of the ant in the maze follows the same structure for all environments. A discrete cell `(i,j)` is selected for the goal and agent's initial position as previously menitoned in the **Maze** section. - Then this cell index is converted to its cell center as an `(x,y)` continuous Cartesian coordinates in the MuJoCo simulation. Finally, a sampled noise from a uniform distribution with range `[-0.25,0.25]m` is added to the - cell's center x and y coordinates. This allows to create a richer goal distribution. - - The goal and initial position of the agent can also be specified by the user when the episode is reset. This is done by passing the dictionary argument `options` to the gymnasium reset() function. This dictionary expects one or both of - the following keys: - - * `goal_cell`: `numpy.ndarray, shape=(2,0), type=int` - Specifies the desired `(i,j)` cell location of the goal. 
A uniform sampled noise will be added to the continuous coordinates of the center of the cell. - * `reset_cell`: `numpy.ndarray, shape=(2,0), type=int` - Specifies the desired `(i,j)` cell location of the reset initial agent position. A uniform sampled noise will be added to the continuous coordinates of the center of the cell. - - ### Episode End - * `truncated` - The episode will be `truncated` when the duration reaches a total of `max_episode_steps`. - * `terminated` - The task can be set to be continuing with the `continuing_task` argument. In this case the episode will never terminate, instead the goal location is randomly selected again. If the task is set not to be continuing the - episode will be terminated when the Euclidean distance to the goal is less or equal to 0.5. - - ### Arguments - * `maze_map` - Optional argument to initialize the environment with a custom maze map. - * `continuing_task` - If set to `True` the episode won't be terminated when reaching the goal, instead a new goal location will be generated (unless `reset_target` argument is `True`). If `False` the environment is terminated when the ant reaches the final goal. - * `reset_target` - If set to `True` and the argument `continuing_task` is also `True`, when the ant reaches the target goal the location of the goal will be kept the same and no new goal location will be generated. If `False` a new goal will be generated when reached. - * `xml_file` - Optional argument to Path of robot model. - * Optionally any other [Gymnasium/MuJoCo/Ant](https://gymnasium.farama.org/environments/mujoco/ant/#arguments/) argument such `ctrl_cost_weight`. - - Note that, the maximum number of timesteps before the episode is `truncated` can be increased or decreased by specifying the `max_episode_steps` argument at initialization. For example, - to increase the total number of timesteps to 100 make the environment as follows: - - ```python - import gymnasium as gym - import gymnasium_robotics - - gym.register_envs(gymnasium_robotics) - - env = gym.make('AntMaze_UMaze-v5', max_episode_steps=100) - ``` - - ### Version History - - v5: Is now based on `Gymnasium/MuJoCoAnt-v5/`, and inherits all features from it such as the `xml_file` argument for the loading of third party model. - - v4: Refactor compute_terminated in MazeEnv into a pure function compute_terminated and a new function update_goal which resets the goal position. Ant bug fix: Reward is now computed before reset (i.e. sparse reward is not always zero). Maze bug fix: Ant can no longer reset within the goal radius 0.45 due to maze_size_scaling factor missing in MazeEnv. info['success'] key added. - - v3: refactor version of the D4RL environment, also create dependency on newest [mujoco python bindings](https://mujoco.readthedocs.io/en/latest/python.html) maintained by the MuJoCo team in Deepmind. - - v2 & v1: legacy versions in the [D4RL](https://github.com/Farama-Foundation/D4RL). - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium_robotics/envs/maze/ant_maze_v6.py b/gymnasium_robotics/envs/maze/ant_maze_v6.py new file mode 100644 index 00000000..4e415574 --- /dev/null +++ b/gymnasium_robotics/envs/maze/ant_maze_v6.py @@ -0,0 +1,339 @@ +"""A maze environment with the Gymnasium Ant agent (https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/mujoco/ant_v5.py). 
+
+The code is inspired by the D4RL repository hosted on GitHub (https://github.com/Farama-Foundation/D4RL), published in the paper
+'D4RL: Datasets for Deep Data-Driven Reinforcement Learning' by Justin Fu, Aviral Kumar, Ofir Nachum, George Tucker, Sergey Levine.
+
+Original Author of the code: Justin Fu
+
+The modifications made involve reusing the code in Gymnasium for the Ant environment and in `point_maze/maze_env.py`.
+The new code also follows the Gymnasium API and the Multi-goal API.
+
+This project is covered by the Apache 2.0 License.
+"""
+
+import sys
+from os import path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+from gymnasium import spaces
+from gymnasium.envs.mujoco.ant_v5 import AntEnv
+from gymnasium.utils.ezpickle import EzPickle
+
+from gymnasium_robotics.envs.maze.maps import U_MAZE
+from gymnasium_robotics.envs.maze.maze_v6 import MazeEnv
+from gymnasium_robotics.utils.mujoco_utils import MujocoModelNames
+
+
+class AntMazeEnv(MazeEnv, EzPickle):
+    """
+    ### Description
+
+    This environment was refactored from the [D4RL](https://github.com/Farama-Foundation/D4RL) repository, introduced by Justin Fu, Aviral Kumar, Ofir Nachum, George Tucker, and Sergey Levine
+    in ["D4RL: Datasets for Deep Data-Driven Reinforcement Learning"](https://arxiv.org/abs/2004.07219).
+
+    The tasks found in the `AntMaze` environments are the same as the ones in the `PointMaze` environments. However, in this case the agent is the Ant quadruped from the main [Gymnasium](https://gymnasium.farama.org/environments/mujoco/ant/) repository.
+    The control frequency of the ant is `f = 20 Hz`: each simulation timestep lasts `dt = 0.01` s and the ant robot repeats the same action for 5 simulation steps.
+
+    ### Maze Variations
+
+    #### Maze size
+    The map variations for the mazes are the same as for `PointMaze`. The ant environments with fixed goal and reset locations are the following:
+
+    * `AntMaze_UMaze-v5`
+    * `AntMaze_BigMaze-v5`
+    * `AntMaze_HardestMaze-v5`
+
+    #### Diverse goal mazes
+    The environments with a fixed reset position for the ant and randomly selected goals, also known as diverse goal, are:
+
+    * `AntMaze_BigMaze_DG-v5`
+    * `AntMaze_HardestMaze_DG-v5`
+
+    #### Diverse goal and reset mazes
+
+    Finally, the environments that select the reset and goal locations randomly are:
+
+    * `AntMaze_BigMaze_DGR-v5`
+    * `AntMaze_HardestMaze_DGR-v5`
+
+    #### Custom maze
+    Additionally, any of the `AntMaze` environments can be initialized with a custom maze map by setting the `maze_map` argument as follows:
+
+    ```python
+    import gymnasium as gym
+    import gymnasium_robotics
+
+    gym.register_envs(gymnasium_robotics)
+
+    example_map = [[1, 1, 1, 1, 1],
+                   [1, "c", 0, "c", 1],
+                   [1, 1, 1, 1, 1]]
+
+    env = gym.make('AntMaze_UMaze-v5', maze_map=example_map)
+    ```
+
+    ### Action Space
+    The action space is the action space of [Gymnasium/MuJoCo/Ant](https://gymnasium.farama.org/environments/mujoco/ant/#action-space):
+
+    The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
+ + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | + | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | + | 2 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | + | 3 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | + | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | + + ### Observation Space + The observation is a `goal-aware observation space`. It consists of a dictionary with information about the robot's position and goal. The dictionary consists of the following 3 keys: + + * `observation`: Observations consist of positional values of different body parts of the ant, followed by the velocities of those individual parts (their derivatives) with all + the positions ordered before all the velocities. + + By default, observations do not include the x- and y-coordinates of the ant's torso. These values are included in the `achieved_goal` key of the observation. + However, by default, an observation is a `ndarray` with shape `(111,)` if the external contact forces are included with the `use_contact_forces` arguments. 
Otherwise, the shape will be `(27, )` + The elements of the array correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + + The remaining 14*6 = 84 elements of the observation are contact forces (external forces - force x, y, z and torque x, y, z) applied to the center of mass of each of the links. 
The 14 links are: the ground link,
+    the torso link, and 3 links for each leg (1 + 1 + 12), each with 6 external force/torque components. These elements are included only if the argument `use_contact_forces` is set to `True` at environment initialization.
+
+    * `desired_goal`: this key represents the final goal to be achieved. In this environment it is a 2-dimensional `ndarray`, `(2,)`, that consists of the two Cartesian coordinates of the desired final ant torso position `[x,y]`. The elements of the array are the following:
+
+    | Num | Observation             | Min    | Max    | Site Name (in corresponding XML file) | Unit         |
+    |-----|-------------------------|--------|--------|---------------------------------------|--------------|
+    | 0   | Final goal x coordinate | -Inf   | Inf    | target                                | position (m) |
+    | 1   | Final goal y coordinate | -Inf   | Inf    | target                                | position (m) |
+
+    * `achieved_goal`: this key represents the current state of the ant's torso, as if it had achieved a goal. This is useful for goal-oriented learning algorithms such as those that use [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495) (HER).
+    The value is an `ndarray` with shape `(2,)`. The elements of the array are the following:
+
+    | Num | Observation                                    | Min    | Max    | Site Name (in corresponding XML file) | Unit         |
+    |-----|------------------------------------------------|--------|--------|---------------------------------------|--------------|
+    | 0   | Current goal ant position in the x coordinate  | -Inf   | Inf    | torso                                 | position (m) |
+    | 1   | Current goal ant position in the y coordinate  | -Inf   | Inf    | torso                                 | position (m) |
+
+    ### Rewards
+
+    The reward can be initialized as `sparse` or `dense`:
+    - *sparse*: the returned reward can have two values: `0` if the ant hasn't reached its final target position, and `1` if the ant is in the final target position (the ant is considered to have reached the goal if the Euclidean distance between both is lower than 0.5 m).
+    - *dense*: the returned reward is the negative Euclidean distance between the achieved goal position and the desired goal.
+
+    To initialize this environment with one of the mentioned reward functions, the type of reward must be specified in the id string when the environment is initialized. For `sparse` reward the id is the default of the environment, `AntMaze_UMaze-v5`. However, for `dense`
+    reward the id must be modified to `AntMaze_UMazeDense-v5` and the environment initialized as follows:
+
+    ```python
+    import gymnasium as gym
+    import gymnasium_robotics
+
+    gym.register_envs(gymnasium_robotics)
+
+    env = gym.make('AntMaze_UMazeDense-v5')
+    ```
+
+    ### Starting State
+    The goal and initial placement of the ant in the maze follow the same structure for all environments. A discrete cell `(i,j)` is selected for the goal and the agent's initial position as previously mentioned in the **Maze** section.
+    Then this cell index is converted to its cell center as `(x,y)` continuous Cartesian coordinates in the MuJoCo simulation. Finally, noise sampled from a uniform distribution with range `[-0.25,0.25] m` is added to the
+    cell's center x and y coordinates. This creates a richer goal distribution.
+
+    The goal and initial position of the agent can also be specified by the user when the episode is reset. This is done by passing the dictionary argument `options` to the Gymnasium `reset()` function. This dictionary expects one or both of
+    the following keys:
+
+    * `goal_cell`: `numpy.ndarray, shape=(2,), type=int` - Specifies the desired `(i,j)` cell location of the goal. Uniformly sampled noise will be added to the continuous coordinates of the center of the cell.
+    * `reset_cell`: `numpy.ndarray, shape=(2,), type=int` - Specifies the desired `(i,j)` cell location of the agent's initial (reset) position. Uniformly sampled noise will be added to the continuous coordinates of the center of the cell.
+
+    ### Episode End
+    * `truncated` - The episode will be `truncated` when the duration reaches a total of `max_episode_steps`.
+    * `terminated` - The task can be set to be continuing with the `continuing_task` argument. In this case the episode never terminates; instead, when the ant reaches the goal, a new goal location may be generated (see `reset_target`). If the task is set not to be continuing, the
+    episode is terminated when the Euclidean distance to the goal is less than or equal to 0.5.
+
+    ### Arguments
+    * `maze_map` - Optional argument to initialize the environment with a custom maze map.
+    * `continuing_task` - If set to `True`, the episode won't be terminated when reaching the goal; instead, a new goal location will be generated when the goal is reached (if the `reset_target` argument is `True`). If `False`, the environment is terminated when the ant reaches the final goal.
+    * `reset_target` - If set to `True` and the argument `continuing_task` is also `True`, a new goal location will be generated when the ant reaches the current goal. If `False`, the goal location is kept the same when reached.
+    * `xml_file` - Optional path to the MJCF (xml) file of the robot model.
+    * Optionally any other [Gymnasium/MuJoCo/Ant](https://gymnasium.farama.org/environments/mujoco/ant/#arguments/) argument such as `ctrl_cost_weight`.
+
+    Note that the maximum number of timesteps before the episode is `truncated` can be increased or decreased by specifying the `max_episode_steps` argument at initialization. For example,
+    to increase the total number of timesteps to 100 make the environment as follows:
+
+    ```python
+    import gymnasium as gym
+    import gymnasium_robotics
+
+    gym.register_envs(gymnasium_robotics)
+
+    env = gym.make('AntMaze_UMaze-v5', max_episode_steps=100)
+    ```
+
+    ### Version History
+    - v6: No changes thus far.
+    - v5: Now based on `Gymnasium/MuJoCo/Ant-v5`, and inherits all of its features, such as the `xml_file` argument for loading third-party models.
+    - v4: Refactored `compute_terminated` in `MazeEnv` into a pure function `compute_terminated` and a new function `update_goal` which resets the goal position. Ant bug fix: the reward is now computed before reset (i.e. the sparse reward is not always zero). Maze bug fix: the ant can no longer reset within the goal radius 0.45 due to the `maze_size_scaling` factor missing in `MazeEnv`. The `info['success']` key was added.
+    - v3: Refactored version of the D4RL environment; also creates a dependency on the newest [mujoco python bindings](https://mujoco.readthedocs.io/en/latest/python.html) maintained by the MuJoCo team at DeepMind.
+    - v2 & v1: legacy versions in [D4RL](https://github.com/Farama-Foundation/D4RL).
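+
+    As a short usage sketch tying the arguments above together (the specific argument values here are illustrative, not defaults):
+
+    ```python
+    import gymnasium as gym
+    import gymnasium_robotics
+
+    gym.register_envs(gymnasium_robotics)
+
+    # Episodic variant: the episode terminates once the ant reaches the goal,
+    # and is truncated after at most 300 steps.
+    env = gym.make('AntMaze_UMaze-v5', continuing_task=False, max_episode_steps=300)
+
+    obs, info = env.reset(seed=0)
+    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
+    ```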
+ """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": 50, + } + + def __init__( + self, + render_mode: Optional[str] = None, + maze_map: List[List[Union[str, int]]] = U_MAZE, + reward_type: str = "sparse", + continuing_task: bool = True, + reset_target: bool = False, + xml_file: Union[str, None] = None, + **kwargs, + ): + if xml_file is None: + # Get the ant.xml path from the Gymnasium package + ant_xml_file_path = path.join( + path.dirname(sys.modules[AntEnv.__module__].__file__), "assets/ant.xml" + ) + else: + ant_xml_file_path = xml_file + super().__init__( + agent_xml_path=ant_xml_file_path, + maze_map=maze_map, + maze_size_scaling=4, + maze_height=0.5, + reward_type=reward_type, + continuing_task=continuing_task, + reset_target=reset_target, + **kwargs, + ) + # Create the MuJoCo environment, include position observation of the Ant for GoalEnv + self.ant_env = AntEnv( + xml_file=self.tmp_xml_file_path, + exclude_current_positions_from_observation=False, + render_mode=render_mode, + reset_noise_scale=0.0, + **kwargs, + ) + self._model_names = MujocoModelNames(self.ant_env.model) + self.target_site_id = self._model_names.site_name2id["target"] + + self.action_space = self.ant_env.action_space + obs_shape: tuple = self.ant_env.observation_space.shape + self.observation_space = spaces.Dict( + dict( + observation=spaces.Box( + -np.inf, np.inf, shape=(obs_shape[0] - 2,), dtype="float64" + ), + achieved_goal=spaces.Box(-np.inf, np.inf, shape=(2,), dtype="float64"), + desired_goal=spaces.Box(-np.inf, np.inf, shape=(2,), dtype="float64"), + ) + ) + + self.render_mode = render_mode + EzPickle.__init__( + self, + render_mode, + maze_map, + reward_type, + continuing_task, + reset_target, + **kwargs, + ) + + def reset(self, *, seed: Optional[int] = None, **kwargs): + super().reset(seed=seed, **kwargs) + + self.ant_env.init_qpos[:2] = self.reset_pos + + obs, info = self.ant_env.reset(seed=seed) + obs_dict = self._get_obs(obs) + info["success"] = bool( + np.linalg.norm(obs_dict["achieved_goal"] - self.goal) <= 0.45 + ) + + return obs_dict, info + + def step(self, action): + ant_obs, _, _, _, info = self.ant_env.step(action) + obs = self._get_obs(ant_obs) + + reward = self.compute_reward(obs["achieved_goal"], self.goal, info) + terminated = self.compute_terminated(obs["achieved_goal"], self.goal, info) + truncated = self.compute_truncated(obs["achieved_goal"], self.goal, info) + info["success"] = bool(np.linalg.norm(obs["achieved_goal"] - self.goal) <= 0.45) + + if self.render_mode == "human": + self.render() + + # Update the goal position if necessary + self.update_goal(obs["achieved_goal"]) + + return obs, reward, terminated, truncated, info + + def _get_obs(self, ant_obs: np.ndarray) -> Dict[str, np.ndarray]: + achieved_goal = ant_obs[:2] + observation = ant_obs[2:] + + return { + "observation": observation.copy(), + "achieved_goal": achieved_goal.copy(), + "desired_goal": self.goal.copy(), + } + + def update_target_site_pos(self): + self.ant_env.model.site_pos[self.target_site_id] = np.append( + self.goal, self.maze.maze_height / 2 * self.maze.maze_size_scaling + ) + + def render(self): + return self.ant_env.render() + + def close(self): + super().close() + self.ant_env.close() + + @property + def model(self): + return self.ant_env.model + + @property + def data(self): + return self.ant_env.data diff --git a/gymnasium_robotics/envs/maze/maze_v6.py b/gymnasium_robotics/envs/maze/maze_v6.py new file mode 100644 index 00000000..0c2d394e --- 
/dev/null
+++ b/gymnasium_robotics/envs/maze/maze_v6.py
@@ -0,0 +1,422 @@
+"""A maze environment with Gymnasium API for the Gymnasium-Robotics PointMaze environments.
+
+The code is inspired by the D4RL repository hosted on GitHub (https://github.com/Farama-Foundation/D4RL), published in the paper
+'D4RL: Datasets for Deep Data-Driven Reinforcement Learning' by Justin Fu, Aviral Kumar, Ofir Nachum, George Tucker, Sergey Levine.
+
+Original Author of the code: Justin Fu
+
+The modifications made involve organizing the code into different files (`maps.py`, `maze_env.py`, `point_env.py`, and `point_maze_env.py`), as well as adding support for the Gymnasium API.
+
+This project is covered by the Apache 2.0 License.
+"""
+
+import math
+import tempfile
+import time
+import xml.etree.ElementTree as ET
+from os import path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from gymnasium_robotics.core import GoalEnv
+from gymnasium_robotics.envs.maze.maps import COMBINED, GOAL, RESET, U_MAZE
+
+
+class Maze:
+    r"""This class creates and holds information about the maze in the MuJoCo simulation.
+
+    The accessible attributes are the following:
+    - :attr:`maze_map` - The maze discrete data structure.
+    - :attr:`maze_size_scaling` - The maze scaling for the continuous coordinates in the MuJoCo simulation.
+    - :attr:`maze_height` - The height of the walls in the MuJoCo simulation.
+    - :attr:`unique_goal_locations` - All the `(i,j)` possible cell indices for goal locations.
+    - :attr:`unique_reset_locations` - All the `(i,j)` possible cell indices for agent initialization locations.
+    - :attr:`combined_locations` - All the `(i,j)` possible cell indices for goal and agent initialization locations.
+    - :attr:`map_length` - The number of rows `i` in the maze map.
+    - :attr:`map_width` - The number of columns `j` in the maze map.
+    - :attr:`x_map_center` - The x coordinate of the map's center.
+    - :attr:`y_map_center` - The y coordinate of the map's center.
+
+    The Maze class also presents methods to convert between cell indices and `(x,y)` coordinates in the MuJoCo simulation:
+    - :meth:`cell_rowcol_to_xy` - Convert from `(i,j)` to `(x,y)`.
+    - :meth:`cell_xy_to_rowcol` - Convert from `(x,y)` to `(i,j)`.
+
+    ### Version History
+    * v4: Refactor compute_terminated into a pure function compute_terminated and a new function update_goal which resets the goal position. Bug fix: missing maze_size_scaling factor added in generate_reset_pos() -- only affects AntMaze.
+    * v3: refactor version of the D4RL environment, also create dependency on newest [mujoco python bindings](https://mujoco.readthedocs.io/en/latest/python.html) maintained by the MuJoCo team in Deepmind.
+    * v2 & v1: legacy versions in the [D4RL](https://github.com/Farama-Foundation/D4RL).
+    """
+
+    def __init__(
+        self,
+        maze_map: List[List[Union[str, int]]],
+        maze_size_scaling: float,
+        maze_height: float,
+    ):
+
+        self._maze_map = maze_map
+        self._maze_size_scaling = maze_size_scaling
+        self._maze_height = maze_height
+
+        self._unique_goal_locations = []
+        self._unique_reset_locations = []
+        self._combined_locations = []
+
+        # Get the center cell Cartesian position of the maze. This will be the origin.
+        self._map_length = len(maze_map)
+        self._map_width = len(maze_map[0])
+        self._x_map_center = self.map_width / 2 * maze_size_scaling
+        self._y_map_center = self.map_length / 2 * maze_size_scaling
+
+    @property
+    def maze_map(self) -> List[List[Union[str, int]]]:
+        """Returns the list[list] data structure of the maze."""
+        return self._maze_map
+
+    @property
+    def maze_size_scaling(self) -> float:
+        """Returns the scaling value used to integrate the maze
+        encoding in the MuJoCo simulation.
+        """
+        return self._maze_size_scaling
+
+    @property
+    def maze_height(self) -> float:
+        """Returns the un-scaled height of the walls in the MuJoCo
+        simulation.
+        """
+        return self._maze_height
+
+    @property
+    def unique_goal_locations(self) -> List[np.ndarray]:
+        """Returns all the possible goal locations in discrete cell
+        coordinates (i,j).
+        """
+        return self._unique_goal_locations
+
+    @property
+    def unique_reset_locations(self) -> List[np.ndarray]:
+        """Returns all the possible reset locations for the agent in
+        discrete cell coordinates (i,j).
+        """
+        return self._unique_reset_locations
+
+    @property
+    def combined_locations(self) -> List[np.ndarray]:
+        """Returns all the possible goal/reset locations in discrete cell
+        coordinates (i,j).
+        """
+        return self._combined_locations
+
+    @property
+    def map_length(self) -> int:
+        """Returns the length of the maze in number of discrete vertical cells
+        or number of rows i.
+        """
+        return self._map_length
+
+    @property
+    def map_width(self) -> int:
+        """Returns the width of the maze in number of discrete horizontal cells
+        or number of columns j.
+        """
+        return self._map_width
+
+    @property
+    def x_map_center(self) -> float:
+        """Returns the x coordinate of the center of the maze in the MuJoCo simulation."""
+        return self._x_map_center
+
+    @property
+    def y_map_center(self) -> float:
+        """Returns the y coordinate of the center of the maze in the MuJoCo simulation."""
+        return self._y_map_center
+
+    def cell_rowcol_to_xy(self, rowcol_pos: np.ndarray) -> np.ndarray:
+        """Converts a cell index `(i,j)` to x and y coordinates in the MuJoCo simulation."""
+        x = (rowcol_pos[1] + 0.5) * self.maze_size_scaling - self.x_map_center
+        y = self.y_map_center - (rowcol_pos[0] + 0.5) * self.maze_size_scaling
+
+        return np.array([x, y])
+
+    def cell_xy_to_rowcol(self, xy_pos: np.ndarray) -> np.ndarray:
+        """Converts x and y coordinates in the MuJoCo simulation to a cell index `(i,j)`."""
+        i = math.floor((self.y_map_center - xy_pos[1]) / self.maze_size_scaling)
+        j = math.floor((xy_pos[0] + self.x_map_center) / self.maze_size_scaling)
+        return np.array([i, j])
+
+    @classmethod
+    def make_maze(
+        cls,
+        agent_xml_path: str,
+        maze_map: list,
+        maze_size_scaling: float,
+        maze_height: float,
+    ):
+        """Class method that returns an instance of Maze with the decoded maze information and the temporary
+        path to the new MJCF (xml) file for the MuJoCo simulation.
+
+        Args:
+            agent_xml_path (str): path to the MJCF (xml) file of the agent model; the maze geometry is added to this model.
+            maze_map (list[list[str,int]]): the discrete maze map encoding walls, empty cells, and reset/goal/combined cells.
+            maze_size_scaling (float): scaling factor applied to the maze cells in the continuous MuJoCo simulation.
+            maze_height (float): un-scaled height of the maze walls.
+
+        Returns:
+            Maze: the Maze instance holding the decoded maze information.
+            str: the path to the temporary MJCF (xml) file of the new model with the included maze.
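+
+        Example (a sketch of how `MazeEnv` below calls this method; the agent XML path is illustrative)::
+
+            maze, tmp_xml_path = Maze.make_maze(
+                "path/to/ant.xml", U_MAZE, maze_size_scaling=4.0, maze_height=0.5
+            )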
+ """ + tree = ET.parse(agent_xml_path) + worldbody = tree.find(".//worldbody") + + maze = cls(maze_map, maze_size_scaling, maze_height) + empty_locations = [] + for i in range(maze.map_length): + for j in range(maze.map_width): + struct = maze_map[i][j] + # Store cell locations in simulation global Cartesian coordinates + x = (j + 0.5) * maze_size_scaling - maze.x_map_center + y = maze.y_map_center - (i + 0.5) * maze_size_scaling + if struct == 1: # Unmovable block. + # Offset all coordinates so that maze is centered. + ET.SubElement( + worldbody, + "geom", + name=f"block_{i}_{j}", + pos=f"{x} {y} {maze_height / 2 * maze_size_scaling}", + size=f"{0.5 * maze_size_scaling} {0.5 * maze_size_scaling} {maze_height / 2 * maze_size_scaling}", + type="box", + material="", + contype="1", + conaffinity="1", + rgba="0.7 0.5 0.3 1.0", + ) + + elif struct == RESET: + maze._unique_reset_locations.append(np.array([x, y])) + elif struct == GOAL: + maze._unique_goal_locations.append(np.array([x, y])) + elif struct == COMBINED: + maze._combined_locations.append(np.array([x, y])) + elif struct == 0: + empty_locations.append(np.array([x, y])) + + # Add target site for visualization + ET.SubElement( + worldbody, + "site", + name="target", + pos=f"0 0 {maze_height / 2 * maze_size_scaling}", + size=f"{0.2 * maze_size_scaling}", + rgba="1 0 0 0.7", + type="sphere", + ) + + # Add the combined cell locations (goal/reset) to goal and reset + if ( + not maze._unique_goal_locations + and not maze._unique_reset_locations + and not maze._combined_locations + ): + # If there are no given "r", "g" or "c" cells in the maze data structure, + # any empty cell can be a reset or goal location at initialization. + maze._combined_locations = empty_locations + elif not maze._unique_reset_locations and not maze._combined_locations: + # If there are no given "r" or "c" cells in the maze data structure, + # any empty cell can be a reset location at initialization. + maze._unique_reset_locations = empty_locations + elif not maze._unique_goal_locations and not maze._combined_locations: + # If there are no given "g" or "c" cells in the maze data structure, + # any empty cell can be a gaol location at initialization. 
+ maze._unique_goal_locations = empty_locations + + maze._unique_goal_locations += maze._combined_locations + maze._unique_reset_locations += maze._combined_locations + + # Save new xml with maze to a temporary file + with tempfile.TemporaryDirectory() as tmp_dir: + temp_xml_name = f"ant_maze{str(time.time())}.xml" + temp_xml_path = path.join(path.dirname(tmp_dir), temp_xml_name) + tree.write(temp_xml_path) + + return maze, temp_xml_path + + +class MazeEnv(GoalEnv): + def __init__( + self, + agent_xml_path: str, + reward_type: str = "dense", + continuing_task: bool = True, + reset_target: bool = True, + maze_map: List[List[Union[int, str]]] = U_MAZE, + maze_size_scaling: float = 1.0, + maze_height: float = 0.5, + position_noise_range: float = 0.25, + **kwargs, + ): + + self.reward_type = reward_type + self.continuing_task = continuing_task + self.reset_target = reset_target + self.maze, self.tmp_xml_file_path = Maze.make_maze( + agent_xml_path, maze_map, maze_size_scaling, maze_height + ) + + self.position_noise_range = position_noise_range + + def generate_target_goal(self) -> np.ndarray: + assert len(self.maze.unique_goal_locations) > 0 + goal_index = self.np_random.integers( + low=0, high=len(self.maze.unique_goal_locations) + ) + goal = self.maze.unique_goal_locations[goal_index].copy() + return goal + + def generate_reset_pos(self) -> np.ndarray: + assert len(self.maze.unique_reset_locations) > 0 + + # While reset position is close to goal position + reset_pos = self.goal.copy() + while ( + np.linalg.norm(reset_pos - self.goal) <= 0.5 * self.maze.maze_size_scaling + ): + reset_index = self.np_random.integers( + low=0, high=len(self.maze.unique_reset_locations) + ) + reset_pos = self.maze.unique_reset_locations[reset_index].copy() + + return reset_pos + + def reset( + self, + *, + seed: Optional[int] = None, + options: Optional[Dict[str, Optional[np.ndarray]]] = None, + ): + """Reset the maze simulation. + + Args: + options (dict[str, np.ndarray]): the options dictionary can contain two items, "goal_cell" and "reset_cell" that will set the initial goal and reset location (i,j) in the self.maze.map list of list maze structure. 
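+
+        A minimal call sketch from a subclass environment such as `AntMazeEnv` (the cell indices are illustrative and must index non-wall cells of the configured maze)::
+
+            import numpy as np
+
+            obs, info = env.reset(
+                options={"goal_cell": np.array([3, 1]), "reset_cell": np.array([1, 1])}
+            )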
+ + """ + super().reset(seed=seed) + + if options is None: + goal = self.generate_target_goal() + # Add noise to goal position + self.goal = self.add_xy_position_noise(goal) + reset_pos = self.generate_reset_pos() + else: + if "goal_cell" in options and options["goal_cell"] is not None: + # assert that goal cell is valid + assert self.maze.map_length > options["goal_cell"][0] + assert self.maze.map_width > options["goal_cell"][1] + assert ( + self.maze.maze_map[options["goal_cell"][0]][options["goal_cell"][1]] + != 1 + ), f"Goal can't be placed in a wall cell, {options['goal_cell']}" + + goal = self.maze.cell_rowcol_to_xy(options["goal_cell"]) + + else: + goal = self.generate_target_goal() + + # Add noise to goal position + self.goal = self.add_xy_position_noise(goal) + + if "reset_cell" in options and options["reset_cell"] is not None: + # assert that goal cell is valid + assert self.maze.map_length > options["reset_cell"][0] + assert self.maze.map_width > options["reset_cell"][1] + assert ( + self.maze.maze_map[options["reset_cell"][0]][ + options["reset_cell"][1] + ] + != 1 + ), f"Reset can't be placed in a wall cell, {options['reset_cell']}" + + reset_pos = self.maze.cell_rowcol_to_xy(options["reset_cell"]) + + else: + reset_pos = self.generate_reset_pos() + + # Update the position of the target site for visualization + self.update_target_site_pos() + # Add noise to reset position + self.reset_pos = self.add_xy_position_noise(reset_pos) + + # Update the position of the target site for visualization + self.update_target_site_pos() + + def add_xy_position_noise(self, xy_pos: np.ndarray) -> np.ndarray: + """Pass an x,y coordinate and it will return the same coordinate with a noise addition + sampled from a uniform distribution + """ + noise_x = ( + self.np_random.uniform( + low=-self.position_noise_range, high=self.position_noise_range + ) + * self.maze.maze_size_scaling + ) + noise_y = ( + self.np_random.uniform( + low=-self.position_noise_range, high=self.position_noise_range + ) + * self.maze.maze_size_scaling + ) + xy_pos[0] += noise_x + xy_pos[1] += noise_y + + return xy_pos + + def compute_reward( + self, achieved_goal: np.ndarray, desired_goal: np.ndarray, info + ) -> float: + distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) + if self.reward_type == "dense": + return np.exp(-distance) + elif self.reward_type == "sparse": + return (distance <= 0.45).astype(np.float64) + + def compute_terminated( + self, achieved_goal: np.ndarray, desired_goal: np.ndarray, info + ) -> bool: + if not self.continuing_task: + # If task is episodic terminate the episode when the goal is reached + return bool(np.linalg.norm(achieved_goal - desired_goal) <= 0.45) + else: + # Continuing tasks don't terminate, episode will be truncated when time limit is reached (`max_episode_steps`) + return False + + def update_goal(self, achieved_goal: np.ndarray) -> None: + """Update goal position if continuing task and within goal radius.""" + + if ( + self.continuing_task + and self.reset_target + and bool(np.linalg.norm(achieved_goal - self.goal) <= 0.45) + and len(self.maze.unique_goal_locations) > 1 + ): + # Generate a goal while within 0.45 of achieved_goal. 
The distance check above
+            # is not redundant; it avoids calling update_target_site_pos() unless necessary
+            while np.linalg.norm(achieved_goal - self.goal) <= 0.45:
+                # Generate another goal
+                goal = self.generate_target_goal()
+                # Add noise to goal position
+                self.goal = self.add_xy_position_noise(goal)
+
+            # Update the position of the target site for visualization
+            self.update_target_site_pos()
+
+    def compute_truncated(
+        self, achieved_goal: np.ndarray, desired_goal: np.ndarray, info
+    ) -> bool:
+        return False
+
+    def update_target_site_pos(self):
+        """Override this method to update the target site position in the MuJoCo simulation
+        after a new goal is selected. This is mainly for visualization purposes."""
+        raise NotImplementedError