diff --git a/mo_gymnasium/envs/mountain_car/__init__.py b/mo_gymnasium/envs/mountain_car/__init__.py
index f75fe751..523de281 100644
--- a/mo_gymnasium/envs/mountain_car/__init__.py
+++ b/mo_gymnasium/envs/mountain_car/__init__.py
@@ -6,3 +6,24 @@
     entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
     max_episode_steps=200,
 )
+
+register(
+    id="mo-mountaincar-3d-v0",
+    entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
+    max_episode_steps=200,
+    kwargs={"add_speed_objective": True, "merge_move_penalty": True},
+)
+
+register(
+    id="mo-mountaincar-timemove-v0",
+    entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
+    max_episode_steps=200,
+    kwargs={"merge_move_penalty": True},
+)
+
+register(
+    id="mo-mountaincar-timespeed-v0",
+    entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
+    max_episode_steps=200,
+    kwargs={"remove_move_penalty": True, "add_speed_objective": True},
+)
diff --git a/mo_gymnasium/envs/mountain_car/mountain_car.py b/mo_gymnasium/envs/mountain_car/mountain_car.py
index 6e88acca..0c2c5dbb 100644
--- a/mo_gymnasium/envs/mountain_car/mountain_car.py
+++ b/mo_gymnasium/envs/mountain_car/mountain_car.py
@@ -14,19 +14,50 @@ class MOMountainCar(MountainCarEnv, EzPickle):
 
     See [Gymnasium's env](https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/) for more information.
 
     ## Reward space:
-    The reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward.
+    By default, the reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward.
     - time penalty: -1.0 for each time step
     - reverse penalty: -1.0 for each time step the action is 0 (reverse)
     - forward penalty: -1.0 for each time step the action is 2 (forward)
+
+    Alternatively, the reward can be changed with the following options:
+    - add_speed_objective: Add an extra objective corresponding to the speed of the car.
+    - remove_move_penalty: Remove the reverse and forward objectives.
+    - merge_move_penalty: Merge reverse and forward penalties into a single penalty.
     """
 
-    def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+    def __init__(
+        self,
+        render_mode: Optional[str] = None,
+        add_speed_objective: bool = False,
+        remove_move_penalty: bool = False,
+        merge_move_penalty: bool = False,
+        goal_velocity=0,
+    ):
         super().__init__(render_mode, goal_velocity)
-        EzPickle.__init__(self, render_mode, goal_velocity)
+        EzPickle.__init__(self, render_mode, add_speed_objective, remove_move_penalty, merge_move_penalty, goal_velocity)
+        self.add_speed_objective = add_speed_objective
+        self.remove_move_penalty = remove_move_penalty
+        self.merge_move_penalty = merge_move_penalty
 
-        self.reward_space = spaces.Box(low=np.array([-1, -1, -1]), high=np.array([-1, 0, 0]), shape=(3,), dtype=np.float32)
         self.reward_dim = 3
+        if self.add_speed_objective:
+            self.reward_dim += 1
+
+        if self.remove_move_penalty:
+            self.reward_dim -= 2
+        elif self.merge_move_penalty:
+            self.reward_dim -= 1
+
+        low = np.array([-1] * self.reward_dim)
+        high = np.zeros(self.reward_dim)
+        high[0] = -1  # Time penalty is always -1
+        if self.add_speed_objective:
+            low[-1] = 0.0
+            high[-1] = 1.1
+
+        self.reward_space = spaces.Box(low=low, high=high, shape=(self.reward_dim,), dtype=np.float32)
+
     def step(self, action: int):
         assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid"
@@ -39,11 +70,20 @@ def step(self, action: int):
             velocity = 0
 
         terminated = bool(position >= self.goal_position and velocity >= self.goal_velocity)
-        # reward = -1.0
-        reward = np.zeros(3, dtype=np.float32)
+
+        reward = np.zeros(self.reward_dim, dtype=np.float32)
+
         reward[0] = 0.0 if terminated else -1.0  # time penalty
-        reward[1] = 0.0 if action != 0 else -1.0  # reverse penalty
-        reward[2] = 0.0 if action != 2 else -1.0  # forward penalty
+
+        if not self.remove_move_penalty:
+            if self.merge_move_penalty:
+                reward[1] = 0.0 if action == 1 else -1.0
+            else:
+                reward[1] = 0.0 if action != 0 else -1.0  # reverse penalty
+                reward[2] = 0.0 if action != 2 else -1.0  # forward penalty
+
+        if self.add_speed_objective:
+            reward[-1] = 15 * abs(velocity)
 
         self.state = (position, velocity)
         if self.render_mode == "human":
diff --git a/pyproject.toml b/pyproject.toml
index 160f2b53..92b17de6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
 ]
 dependencies = [
     "gymnasium>=0.28.1,<0.30",
-    "numpy >=1.21.0",
+    "numpy >=1.21.0,<2",
    "pygame >=2.1.0",
     "scipy >=1.7.3",
     "pymoo >=0.6.0",