diff --git a/examples/train_ppoLstm_atari.ipynb b/examples/train_ppoLstm_atari.ipynb new file mode 100644 index 00000000..57744373 --- /dev/null +++ b/examples/train_ppoLstm_atari.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "from rllte.agent import PPO_LSTM\n", + "from rllte.env import make_envpool_atari_env" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda:0\"\n", + "num_envs = 16\n", + "\n", + "env = make_envpool_atari_env(\n", + " env_id=\"SpaceInvaders-v5\",\n", + " device=device,\n", + " num_envs=num_envs,\n", + " asynchronous=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = PPO_LSTM(\n", + " env=env, \n", + " device=device,\n", + " tag=\"ppo_lstm_atari\",\n", + ")\n", + "\n", + "print(\"===== AGENT =====\")\n", + "print(agent.encoder)\n", + "print(agent.policy)\n", + "\n", + "agent.train(num_train_steps=10_000_000)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rllte", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index a9213554..17abf814 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ "pynvml==11.5.0", "matplotlib==3.6.0", "seaborn==0.12.2", - "huggingface_hub==0.14.1" + "huggingface_hub==0.14.1", ] [project.optional-dependencies] diff --git a/rllte/agent/__init__.py b/rllte/agent/__init__.py index cd75157f..6ff3d8d0 100644 --- a/rllte/agent/__init__.py +++ b/rllte/agent/__init__.py @@ -36,3 +36,4 @@ from .legacy.sacd import SACDiscrete as SACDiscrete from .legacy.td3 import TD3 as TD3 from .ppg import PPG as PPG +from .ppo_lstm import PPO_LSTM as PPO_LSTM diff --git a/rllte/agent/daac.py b/rllte/agent/daac.py index 31e384ee..45b4fe41 100644 --- a/rllte/agent/daac.py +++ b/rllte/agent/daac.py @@ -95,7 +95,7 @@ def __init__( adv_coef: float = 0.25, max_grad_norm: float = 0.5, discount: float = 0.999, - init_fn: str = "xavier_uniform" + init_fn: str = "xavier_uniform", ) -> None: super().__init__( env=env, @@ -164,7 +164,7 @@ def __init__( storage_size=self.num_steps, num_envs=self.num_envs, batch_size=batch_size, - discount=discount + discount=discount, ) # set all the modules [essential operation!!!] diff --git a/rllte/agent/drac.py b/rllte/agent/drac.py index bb4e4dc8..f675961e 100644 --- a/rllte/agent/drac.py +++ b/rllte/agent/drac.py @@ -157,7 +157,7 @@ def __init__( storage_size=self.num_steps, num_envs=self.num_envs, batch_size=batch_size, - discount=discount + discount=discount, ) # set all the modules [essential operation!!!] diff --git a/rllte/agent/drdaac.py b/rllte/agent/drdaac.py index 70824549..3a9a74f8 100644 --- a/rllte/agent/drdaac.py +++ b/rllte/agent/drdaac.py @@ -170,7 +170,7 @@ def __init__( storage_size=self.num_steps, num_envs=self.num_envs, batch_size=batch_size, - discount=discount + discount=discount, ) # set all the modules [essential operation!!!] 
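For reference, the training flow in the notebook added above boils down to a short plain-Python script. The sketch below only re-uses calls that already appear in the notebook cells (make_envpool_atari_env, PPO_LSTM, agent.train); the environment id, device string, and step count are simply the example values used there, not required settings.

import warnings

warnings.filterwarnings("ignore")

from rllte.agent import PPO_LSTM
from rllte.env import make_envpool_atari_env

# same example settings as the notebook: 16 synchronous EnvPool Atari workers on one GPU
device = "cuda:0"
num_envs = 16
env = make_envpool_atari_env(env_id="SpaceInvaders-v5", device=device, num_envs=num_envs, asynchronous=False)

# recurrent PPO agent; printing agent.encoder / agent.policy as in the notebook is optional inspection
agent = PPO_LSTM(env=env, device=device, tag="ppo_lstm_atari")
agent.train(num_train_steps=10_000_000)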
diff --git a/rllte/agent/legacy/a2c.py b/rllte/agent/legacy/a2c.py index 05a8b8c1..b556c5c3 100644 --- a/rllte/agent/legacy/a2c.py +++ b/rllte/agent/legacy/a2c.py @@ -139,7 +139,7 @@ def __init__( storage_size=self.num_steps, num_envs=self.num_envs, batch_size=batch_size, - discount=discount + discount=discount, ) # set all the modules [essential operation!!!] diff --git a/rllte/agent/legacy/ppo.py b/rllte/agent/legacy/ppo.py index 63a1afc5..0a08a393 100644 --- a/rllte/agent/legacy/ppo.py +++ b/rllte/agent/legacy/ppo.py @@ -88,7 +88,7 @@ def __init__( vf_coef: float = 0.5, ent_coef: float = 0.01, max_grad_norm: float = 0.5, - discount: float = 0.999, + discount: float = 0.99, init_fn: str = "orthogonal", ) -> None: super().__init__( @@ -151,7 +151,7 @@ def __init__( storage_size=self.num_steps, num_envs=self.num_envs, batch_size=batch_size, - discount=discount + discount=discount, ) # set all the modules [essential operation!!!] diff --git a/rllte/agent/legacy/sacd.py b/rllte/agent/legacy/sacd.py index 8533ebfe..98bf6c1d 100644 --- a/rllte/agent/legacy/sacd.py +++ b/rllte/agent/legacy/sacd.py @@ -242,7 +242,7 @@ def update_critic( with th.no_grad(): dist = self.policy.get_dist(next_obs) # deal with situation of 0.0 probabilities - action_probs, log_probs = self.deal_with_zero_probs(dist.probs) # type: ignore[attr-defined] + action_probs, log_probs = self.deal_with_zero_probs(dist.probs) # type: ignore[attr-defined] target_Q1, target_Q2 = self.policy.critic_target(next_obs) target_V = (th.min(target_Q1, target_Q2) - self.alpha.detach() * log_probs) * action_probs # TODO: add time limit mask @@ -278,7 +278,7 @@ def update_actor_and_alpha(self, obs: th.Tensor) -> None: """ # sample actions dist = self.policy.get_dist(obs) - action_probs, log_probs = self.deal_with_zero_probs(dist.probs) # type: ignore[attr-defined] + action_probs, log_probs = self.deal_with_zero_probs(dist.probs) # type: ignore[attr-defined] actor_Q1, actor_Q2 = self.policy.critic(obs) actor_Q = th.min(actor_Q1, actor_Q2) diff --git a/rllte/agent/ppg.py b/rllte/agent/ppg.py index 2a22bc95..00bffe19 100644 --- a/rllte/agent/ppg.py +++ b/rllte/agent/ppg.py @@ -97,7 +97,7 @@ def __init__( num_aux_mini_batch: int = 4, num_aux_grad_accum: int = 1, discount: float = 0.999, - init_fn: str = "xavier_uniform" + init_fn: str = "xavier_uniform", ) -> None: super().__init__( env=env, @@ -162,7 +162,7 @@ def __init__( storage_size=self.num_steps, num_envs=self.num_envs, batch_size=batch_size, - discount=discount + discount=discount, ) # set all the modules [essential operation!!!] diff --git a/rllte/agent/ppo_lstm.py b/rllte/agent/ppo_lstm.py new file mode 100644 index 00000000..05fa4b59 --- /dev/null +++ b/rllte/agent/ppo_lstm.py @@ -0,0 +1,210 @@ +# ============================================================================= +# MIT License + +# Copyright (c) 2023 Reinforcement Learning Evolution Foundation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================= + + +from typing import Optional + +import numpy as np +import torch as th +from torch import nn + +from rllte.common.prototype import OnPolicyAgent +from rllte.common.type_alias import VecEnv +from rllte.xploit.encoder import IdentityEncoder, MnihCnnEncoder +from rllte.xploit.policy import OnPolicySharedActorCriticLSTM +from rllte.xploit.storage import EpisodicRolloutStorage +from rllte.xplore.distribution import Bernoulli, Categorical, DiagonalGaussian, MultiCategorical + + +class PPO_LSTM(OnPolicyAgent): + """Proximal Policy Optimization (PPO) agent with an LSTM-based policy. + Based on: https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_atari_lstm.py + + Args: + env (VecEnv): Vectorized environments for training. + eval_env (VecEnv): Vectorized environments for evaluation. + tag (str): An experiment tag. + seed (int): Random seed for reproduction. + device (str): Device (cpu, cuda, ...) on which the code should be run. + pretraining (bool): Turn on the pre-training mode. + num_steps (int): The sample length per rollout. + feature_dim (int): Number of features extracted by the encoder. + batch_size (int): Number of samples per batch to load. + lr (float): The learning rate. + eps (float): Term added to the denominator to improve numerical stability. + hidden_dim (int): The size of the hidden layers. + clip_range (float): Clipping parameter. + clip_range_vf (Optional[float]): Clipping parameter for the value function. + n_epochs (int): Number of times to update the policy per rollout. + vf_coef (float): Weighting coefficient of value loss. + ent_coef (float): Weighting coefficient of entropy bonus. + max_grad_norm (float): Maximum norm of gradients. + discount (float): Discount factor. + init_fn (str): Parameters initialization method. + num_batches (int): Number of mini-batches per update epoch. + + Returns: + PPO_LSTM agent instance.
+ """ + + def __init__( + self, + env: VecEnv, + eval_env: Optional[VecEnv] = None, + tag: str = "default", + seed: int = 1, + device: str = "cpu", + pretraining: bool = False, + num_steps: int = 128, + feature_dim: int = 512, + batch_size: int = 256, + lr: float = 2.5e-4, + eps: float = 1e-5, + hidden_dim: int = 512, + clip_range: float = 0.1, + clip_range_vf: Optional[float] = 0.1, + n_epochs: int = 4, + vf_coef: float = 0.5, + ent_coef: float = 0.01, + max_grad_norm: float = 0.5, + discount: float = 0.99, + init_fn: str = "orthogonal", + num_batches: int = 4, + ) -> None: + super().__init__( + env=env, + eval_env=eval_env, + tag=tag, + seed=seed, + device=device, + pretraining=pretraining, + num_steps=num_steps, + use_lstm=True, + ) + + # hyper parameters + self.lr = lr + self.eps = eps + self.n_epochs = n_epochs + self.clip_range = clip_range + self.clip_range_vf = clip_range_vf + self.vf_coef = vf_coef + self.ent_coef = ent_coef + self.max_grad_norm = max_grad_norm + + # default encoder + if len(self.obs_shape) == 3: + encoder = MnihCnnEncoder(observation_space=env.observation_space, feature_dim=feature_dim) + elif len(self.obs_shape) == 1: + feature_dim = self.obs_shape[0] # type: ignore + encoder = IdentityEncoder( + observation_space=env.observation_space, feature_dim=feature_dim # type: ignore[assignment] + ) + + # default distribution + if self.action_type == "Discrete": + dist = Categorical() + elif self.action_type == "Box": + dist = DiagonalGaussian() # type: ignore[assignment] + elif self.action_type == "MultiBinary": + dist = Bernoulli() # type: ignore[assignment] + elif self.action_type == "MultiDiscrete": + dist = MultiCategorical() # type: ignore[assignment] + else: + raise NotImplementedError(f"Unsupported action type {self.action_type}!") + + # create policy + policy = OnPolicySharedActorCriticLSTM( + observation_space=env.observation_space, + action_space=env.action_space, + feature_dim=feature_dim, + hidden_dim=hidden_dim, + opt_class=th.optim.Adam, + opt_kwargs=dict(lr=lr, eps=eps), + init_fn=init_fn, + ) + + # default storage + storage = EpisodicRolloutStorage( + observation_space=env.observation_space, + action_space=env.action_space, + device=device, + storage_size=self.num_steps, + num_envs=self.num_envs, + discount=discount, + num_batches=num_batches, + ) + + # set all the modules [essential operation!!!] 
+ self.set(encoder=encoder, policy=policy, storage=storage, distribution=dist) + + def update(self) -> None: + """Update function that returns training metrics such as policy loss, value loss, etc..""" + total_policy_loss = [0.0] + total_value_loss = [0.0] + total_entropy_loss = [0.0] + + for _ in range(self.n_epochs): + for batch in self.storage.sample(): + done = th.logical_or(batch.terminateds, batch.truncateds) + + # evaluate sampled actions + new_values, new_log_probs, entropy = self.policy.evaluate_actions( + obs=batch.observations, + actions=batch.actions, + lstm_state=(self.initial_lstm_state[0][:, batch.env_inds], self.initial_lstm_state[1][:, batch.env_inds]), + done=done, + ) + + # policy loss part + ratio = th.exp(new_log_probs - batch.old_log_probs) + surr1 = ratio * batch.adv_targ + surr2 = th.clamp(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range) * batch.adv_targ + policy_loss = -th.min(surr1, surr2).mean() + + # value loss part + if self.clip_range_vf is None: + value_loss = 0.5 * (new_values.flatten() - batch.returns).pow(2).mean() + else: + values_clipped = batch.values + (new_values.flatten() - batch.values).clamp( + -self.clip_range_vf, self.clip_range_vf + ) + values_losses = (new_values.flatten() - batch.returns).pow(2) + values_losses_clipped = (values_clipped - batch.returns).pow(2) + value_loss = 0.5 * th.max(values_losses, values_losses_clipped).mean() + + # update + self.policy.optimizers["opt"].zero_grad(set_to_none=True) + loss = value_loss * self.vf_coef + policy_loss - entropy * self.ent_coef + loss.backward() + nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) + self.policy.optimizers["opt"].step() + + total_policy_loss.append(policy_loss.item()) + total_value_loss.append(value_loss.item()) + total_entropy_loss.append(entropy.item()) + + # record metrics + self.logger.record("train/policy_loss", np.mean(total_policy_loss)) + self.logger.record("train/value_loss", np.mean(total_value_loss)) + self.logger.record("train/entropy_loss", np.mean(total_entropy_loss)) diff --git a/rllte/common/prototype/base_agent.py b/rllte/common/prototype/base_agent.py index 5aba434b..1c7b0c12 100644 --- a/rllte/common/prototype/base_agent.py +++ b/rllte/common/prototype/base_agent.py @@ -28,7 +28,7 @@ from abc import ABC, abstractmethod from datetime import datetime from pathlib import Path -from typing import Dict, Optional, Any +from typing import Any, Dict, Optional import numpy as np import pynvml diff --git a/rllte/common/prototype/base_distribution.py b/rllte/common/prototype/base_distribution.py index 9cae29b1..6564a3a6 100644 --- a/rllte/common/prototype/base_distribution.py +++ b/rllte/common/prototype/base_distribution.py @@ -24,6 +24,7 @@ from typing import Any + import torch as th from torch.distributions import Distribution @@ -40,4 +41,4 @@ def __call__(self, *args, **kwargs) -> Any: """Call the distribution.""" def sample(self, *args, **kwargs) -> th.Tensor: # type: ignore - """Generate samples.""" \ No newline at end of file + """Generate samples.""" diff --git a/rllte/common/prototype/off_policy_agent.py b/rllte/common/prototype/off_policy_agent.py index 06c97055..a74f7140 100644 --- a/rllte/common/prototype/off_policy_agent.py +++ b/rllte/common/prototype/off_policy_agent.py @@ -73,7 +73,7 @@ def update(self) -> None: """Update the agent. 
Implemented by individual algorithms.""" raise NotImplementedError - def train( # noqa: C901 + def train( # noqa: C901 self, num_train_steps: int, init_model_path: Optional[str] = None, @@ -82,7 +82,7 @@ def train( # noqa: C901 save_interval: int = 5000, num_eval_episodes: int = 10, th_compile: bool = False, - anneal_lr: bool = False + anneal_lr: bool = False, ) -> None: """Training function. @@ -123,7 +123,7 @@ def train( # noqa: C901 actions = th.stack([th.as_tensor(self.action_space.sample()) for _ in range(self.num_envs)]) else: actions = self.policy(obs, training=True) - + # update the learning rate if anneal_lr: for key in self.policy.optimizers.keys(): @@ -151,10 +151,10 @@ def train( # noqa: C901 for idx, (term, trunc) in enumerate(zip(terms, truncs)): if term.item() or trunc.item(): # TODO: deal with dict observations - real_next_obs[idx] = th.as_tensor(infos["final_observation"][idx], device=self.device) # type: ignore[index] + real_next_obs[idx] = th.as_tensor(infos["final_observation"][idx], device=self.device) # type: ignore[index] # add new transitions - self.storage.add(obs, actions, rews, terms, truncs, infos, real_next_obs) + self.storage.add(obs, actions.unsqueeze(-1), rews, terms, truncs, infos, real_next_obs) self.global_step += self.num_envs # deal with the intrinsic reward module diff --git a/rllte/common/prototype/on_policy_agent.py b/rllte/common/prototype/on_policy_agent.py index 2a77f740..71e721f5 100644 --- a/rllte/common/prototype/on_policy_agent.py +++ b/rllte/common/prototype/on_policy_agent.py @@ -59,9 +59,11 @@ def __init__( device: str = "cpu", pretraining: bool = False, num_steps: int = 128, + use_lstm: bool = False, ) -> None: super().__init__(env=env, eval_env=eval_env, tag=tag, seed=seed, device=device, pretraining=pretraining) self.num_steps = num_steps + self.use_lstm = use_lstm # attr annotations self.policy: OnPolicyType self.storage: RolloutStorageType @@ -79,7 +81,7 @@ def train( save_interval: int = 100, num_eval_episodes: int = 10, th_compile: bool = True, - anneal_lr: bool = False + anneal_lr: bool = False, ) -> None: """Training function. 
@@ -106,13 +108,25 @@ def train( # get number of updates num_updates = int(num_train_steps // self.num_envs // self.num_steps) + # only if using lstm, initialize lstm state + if self.use_lstm: + lstm_state = ( + th.zeros(self.policy.lstm.num_layers, self.num_envs, self.policy.lstm.hidden_size).to(self.device), + th.zeros(self.policy.lstm.num_layers, self.num_envs, self.policy.lstm.hidden_size).to(self.device), + ) + done = th.zeros(self.num_envs, dtype=th.bool, device=self.device) + for update in range(num_updates): + # important for updating the policy lstm later + if self.use_lstm: + self.initial_lstm_state = (lstm_state[0].clone(), lstm_state[1].clone()) + # try to eval if (update % eval_interval) == 0 and (self.eval_env is not None): eval_metrics = self.eval(num_eval_episodes) # log to console self.logger.eval(msg=eval_metrics) - + # update the learning rate if anneal_lr: for key in self.policy.optimizers.keys(): @@ -121,10 +135,19 @@ def train( for _ in range(self.num_steps): # sample actions with th.no_grad(), utils.eval_mode(self): - actions, extra_policy_outputs = self.policy(obs, training=True) + if self.use_lstm: + actions, extra_policy_outputs = self.policy(obs, lstm_state, done, training=True) + lstm_state = extra_policy_outputs["lstm_state"] + del extra_policy_outputs["lstm_state"] + else: + actions, extra_policy_outputs = self.policy(obs, training=True) + # observe rewards and next obs next_obs, rews, terms, truncs, infos = self.env.step(actions) + if self.use_lstm: + done = th.logical_or(terms, truncs) + # pre-training mode if self.pretraining: rews = th.zeros_like(rews, device=self.device) @@ -142,7 +165,10 @@ def train( # get the value estimation of the last step with th.no_grad(): - last_values = self.policy.get_value(next_obs).detach() + if self.use_lstm: + last_values = self.policy.get_value(next_obs, lstm_state, done).detach() + else: + last_values = self.policy.get_value(next_obs).detach() # perform return and advantage estimation self.storage.compute_returns_and_advantages(last_values) @@ -225,18 +251,41 @@ def eval(self, num_eval_episodes: int) -> Dict[str, Any]: episode_rewards: List[float] = [] episode_steps: List[int] = [] + if self.use_lstm: + lstm_state = ( + th.zeros(self.policy.lstm.num_layers, self.num_envs, self.policy.lstm.hidden_size).to(self.device), + th.zeros(self.policy.lstm.num_layers, self.num_envs, self.policy.lstm.hidden_size).to(self.device), + ) + done = th.zeros(self.num_envs, dtype=th.bool, device=self.device) + # evaluation loop while len(episode_rewards) < num_eval_episodes: with th.no_grad(), utils.eval_mode(self): - actions, _ = self.policy(obs, training=False) + if self.use_lstm: + actions, extra_policy_outputs = self.policy(obs, lstm_state, done, training=False) + lstm_state = extra_policy_outputs["lstm_state"] + del extra_policy_outputs["lstm_state"] + else: + actions, _ = self.policy(obs, training=False) + next_obs, rews, terms, truncs, infos = self.eval_env.step(actions) + if self.use_lstm: + done = th.logical_or(terms, truncs) + # get episode information if "episode" in infos: eps_r, eps_l = utils.get_episode_statistics(infos) episode_rewards.extend(eps_r) episode_steps.extend(eps_l) + if self.use_lstm: + lstm_state = ( + th.zeros(self.policy.lstm.num_layers, self.num_envs, self.policy.lstm.hidden_size).to(self.device), + th.zeros(self.policy.lstm.num_layers, self.num_envs, self.policy.lstm.hidden_size).to(self.device), + ) + done = th.zeros(self.num_envs, dtype=th.bool, device=self.device) + # set the current observation obs = 
next_obs diff --git a/rllte/common/type_alias.py b/rllte/common/type_alias.py index c0ad24f9..d4114fc6 100644 --- a/rllte/common/type_alias.py +++ b/rllte/common/type_alias.py @@ -139,6 +139,18 @@ class VanillaRolloutBatch(NamedTuple): adv_targ: th.Tensor +class EpisodicRolloutBatch(NamedTuple): + observations: th.Tensor + actions: th.Tensor + values: th.Tensor + returns: th.Tensor + terminateds: th.Tensor + truncateds: th.Tensor + old_log_probs: th.Tensor + adv_targ: th.Tensor + env_inds: th.Tensor + + class DictRolloutBatch(NamedTuple): observations: Dict[str, th.Tensor] actions: th.Tensor diff --git a/rllte/common/utils.py b/rllte/common/utils.py index 5c1bf90f..8c65c01e 100644 --- a/rllte/common/utils.py +++ b/rllte/common/utils.py @@ -130,7 +130,7 @@ def get_episode_statistics(infos: Dict) -> Tuple[List, List]: r: List = [] l: List = [] # to handle with the Atari environments - for info in infos['final_info']: + for info in infos["final_info"]: if info is not None and "episode" in info.keys(): r.extend(info["episode"]["r"].tolist()) l.extend(info["episode"]["l"].tolist()) @@ -191,10 +191,10 @@ def linear_lr_scheduler(optimizer, steps, total_num_steps, initial_lr) -> None: steps (int): Current step. total_num_steps (int): Total number of steps. initial_lr (float): Initial learning rate. - + Returns: None. """ lr = initial_lr - (initial_lr * (steps / float(total_num_steps))) for param_group in optimizer.param_groups: - param_group['lr'] = lr \ No newline at end of file + param_group["lr"] = lr diff --git a/rllte/env/__init__.py b/rllte/env/__init__.py index 1cf0e2e8..f9eb800b 100644 --- a/rllte/env/__init__.py +++ b/rllte/env/__init__.py @@ -24,11 +24,10 @@ from .testing import make_bitflipping_env as make_bitflipping_env -from .testing import make_multibinary_env as make_multibinary_env -from .testing import make_multidiscrete_env as make_multidiscrete_env from .testing import make_box_env as make_box_env from .testing import make_discrete_env as make_discrete_env - +from .testing import make_multibinary_env as make_multibinary_env +from .testing import make_multidiscrete_env as make_multidiscrete_env from .utils import make_rllte_env as make_rllte_env try: diff --git a/rllte/env/atari/__init__.py b/rllte/env/atari/__init__.py index 5f9b3717..ab558c00 100644 --- a/rllte/env/atari/__init__.py +++ b/rllte/env/atari/__init__.py @@ -28,17 +28,15 @@ import gymnasium as gym import numpy as np from gymnasium.vector import AsyncVectorEnv, SyncVectorEnv -from gymnasium.wrappers import (FrameStack, - GrayScaleObservation, - RecordEpisodeStatistics, - ResizeObservation, - TransformReward) - -from rllte.env.atari.wrappers import (EpisodicLifeEnv, - FireResetEnv, - MaxAndSkipEnv, - NoopResetEnv, - RecordEpisodeStatistics4EnvPool) +from gymnasium.wrappers import FrameStack, GrayScaleObservation, RecordEpisodeStatistics, ResizeObservation, TransformReward + +from rllte.env.atari.wrappers import ( + EpisodicLifeEnv, + FireResetEnv, + MaxAndSkipEnv, + NoopResetEnv, + RecordEpisodeStatistics4EnvPool, +) from rllte.env.utils import EnvPoolAsync2Gymnasium, EnvPoolSync2Gymnasium, Gymnasium2Torch @@ -65,7 +63,7 @@ def make_envpool_atari_env( batch_size=num_envs, seed=seed, episodic_life=True, - reward_clip=True + reward_clip=True, ) if asynchronous: diff --git a/rllte/env/atari/wrappers.py b/rllte/env/atari/wrappers.py index a3e85488..f6557f3a 100644 --- a/rllte/env/atari/wrappers.py +++ b/rllte/env/atari/wrappers.py @@ -23,7 +23,7 @@ # 
============================================================================= -from typing import Any, Dict, Tuple, Optional +from typing import Any, Dict, Optional, Tuple import gymnasium as gym import numpy as np @@ -194,22 +194,23 @@ def step(self, action: int) -> Tuple[Any, float, bool, bool, Dict]: class RecordEpisodeStatistics4EnvPool(gym.Wrapper): - """Keep track of cumulative rewards and episode lengths. + """Keep track of cumulative rewards and episode lengths. This wrapper is dedicated to EnvPool-based Atari games. Args: env (gym.Env): Environment to wrap. deque_size (int): The size of the buffers :attr:`return_queue` and :attr:`length_queue` - + Returns: RecordEpisodeStatistics4EnvPool instance. """ + def __init__(self, env: gym.Env, deque_size: int = 100) -> None: super().__init__(env) self.num_envs = getattr(env, "num_envs", 1) self.episode_returns: Optional[np.ndarray] = None self.episode_lengths: Optional[np.ndarray] = None - + def reset(self, **kwargs): observations, infos = super().reset(**kwargs) self.episode_returns = np.zeros(self.num_envs, dtype=np.float32) @@ -217,7 +218,7 @@ def reset(self, **kwargs): self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32) self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32) return observations, infos - + def step(self, actions): observations, rewards, terms, truncs, infos = super().step(actions) self.episode_returns += infos["reward"] @@ -231,8 +232,8 @@ def step(self, actions): infos["episode"]["l"] = self.returned_episode_lengths for idx, d in enumerate(terms): - if not d or infos["lives"][idx] != 0: - infos["episode"]["r"][idx] = 0 - infos["episode"]["l"][idx] = 0 + if not d or infos["lives"][idx] != 0: + infos["episode"]["r"][idx] = 0 + infos["episode"]["l"][idx] = 0 - return observations, rewards, terms, truncs, infos \ No newline at end of file + return observations, rewards, terms, truncs, infos diff --git a/rllte/env/dmc/wrappers.py b/rllte/env/dmc/wrappers.py index 5b784176..05948e42 100644 --- a/rllte/env/dmc/wrappers.py +++ b/rllte/env/dmc/wrappers.py @@ -227,7 +227,7 @@ def _extract_min_max(s): mins, maxs = [], [] for s in spec: - mn, mx = _extract_min_max(s) # type: ignore + mn, mx = _extract_min_max(s) # type: ignore mins.append(mn) maxs.append(mx) low = np.concatenate(mins, axis=0).astype(dtype) diff --git a/rllte/env/testing/__init__.py b/rllte/env/testing/__init__.py index 5b8dbc97..02095983 100644 --- a/rllte/env/testing/__init__.py +++ b/rllte/env/testing/__init__.py @@ -23,8 +23,8 @@ # ============================================================================= +from .bitflipping import make_bitflipping_env as make_bitflipping_env from .box import make_box_env as make_box_env from .discrete import make_discrete_env as make_discrete_env -from .bitflipping import make_bitflipping_env as make_bitflipping_env from .multibinary import make_multibinary_env as make_multibinary_env from .multidiscrete import make_multidiscrete_env as make_multidiscrete_env diff --git a/rllte/env/testing/box.py b/rllte/env/testing/box.py index dd33f40b..61ac5cec 100644 --- a/rllte/env/testing/box.py +++ b/rllte/env/testing/box.py @@ -118,15 +118,18 @@ def step(self, action: Any) -> Tuple[Any, SupportsFloat, bool, bool, Dict[str, A return obs, reward, terminated, truncated, info + class DictEnv(gym.Env): """Environment with dict-based observation space and `Box` action space for testing.""" def __init__(self) -> None: super().__init__() - self.observation_space = gym.spaces.Dict(spaces={ - "image": 
gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), - "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), - }) + self.observation_space = gym.spaces.Dict( + spaces={ + "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), + "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), + } + ) self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(7,), dtype=np.float32) def reset(self, seed: Optional[int] = None, options=Optional[Dict[str, Any]]) -> Tuple[Any, Dict[str, Any]]: @@ -161,6 +164,7 @@ def step(self, action: Any) -> Tuple[Any, SupportsFloat, bool, bool, Dict[str, A return obs, reward, terminated, truncated, info + def make_box_env( env_id: str = "StateObsEnv", num_envs: int = 1, device: str = "cpu", seed: int = 0, asynchronous: bool = True ) -> Gymnasium2Torch: diff --git a/rllte/env/testing/discrete.py b/rllte/env/testing/discrete.py index d890817a..202ff482 100644 --- a/rllte/env/testing/discrete.py +++ b/rllte/env/testing/discrete.py @@ -114,15 +114,18 @@ def step(self, action: Any) -> Tuple[Any, SupportsFloat, bool, bool, Dict[str, A return obs, reward, terminated, truncated, info + class DictEnv(gym.Env): """Environment with dict-based observation space and `Discrete` action space for testing.""" def __init__(self) -> None: super().__init__() - self.observation_space = gym.spaces.Dict(spaces={ - "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), - "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), - }) + self.observation_space = gym.spaces.Dict( + spaces={ + "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), + "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), + } + ) self.action_space = gym.spaces.Discrete(n=7) def reset(self, seed: Optional[int] = None, options=Optional[Dict[str, Any]]) -> Tuple[Any, Dict[str, Any]]: @@ -157,6 +160,7 @@ def step(self, action: Any) -> Tuple[Any, SupportsFloat, bool, bool, Dict[str, A return obs, reward, terminated, truncated, info + def make_discrete_env( env_id: str = "StateObsEnv", num_envs: int = 1, device: str = "cpu", seed: int = 0, asynchronous: bool = True ) -> Gymnasium2Torch: diff --git a/rllte/env/testing/multibinary.py b/rllte/env/testing/multibinary.py index d7e93ee3..173e98eb 100644 --- a/rllte/env/testing/multibinary.py +++ b/rllte/env/testing/multibinary.py @@ -114,15 +114,18 @@ def step(self, action: Any) -> Tuple[Any, SupportsFloat, bool, bool, Dict[str, A return obs, reward, terminated, truncated, info + class DictEnv(gym.Env): """Environment with dict-based observation space and `MultiBinary` action space for testing.""" def __init__(self) -> None: super().__init__() - self.observation_space = gym.spaces.Dict(spaces={ - "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), - "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), - }) + self.observation_space = gym.spaces.Dict( + spaces={ + "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), + "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), + } + ) self.action_space = gym.spaces.MultiBinary(n=3) def reset(self, seed: Optional[int] = None, options=Optional[Dict[str, Any]]) -> Tuple[Any, Dict[str, Any]]: @@ -157,6 +160,7 @@ def step(self, action: Any) -> Tuple[Any, SupportsFloat, bool, bool, Dict[str, A return obs, reward, terminated, truncated, info + def 
make_multibinary_env( env_id: str = "StateObsEnv", num_envs: int = 1, device: str = "cpu", seed: int = 0, asynchronous: bool = True ) -> Gymnasium2Torch: diff --git a/rllte/env/testing/multidiscrete.py b/rllte/env/testing/multidiscrete.py index b9c1b04a..f27283b6 100644 --- a/rllte/env/testing/multidiscrete.py +++ b/rllte/env/testing/multidiscrete.py @@ -120,10 +120,12 @@ class DictEnv(gym.Env): def __init__(self) -> None: super().__init__() - self.observation_space = gym.spaces.Dict(spaces={ - "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), - "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), - }) + self.observation_space = gym.spaces.Dict( + spaces={ + "image": gym.spaces.Box(low=-1.0, high=1.0, shape=(3, 84, 84), dtype=np.float32), + "state": gym.spaces.Box(low=-1.0, high=1.0, shape=(49,), dtype=np.float32), + } + ) self.action_space = gym.spaces.MultiDiscrete(nvec=(2, 3, 4)) def reset(self, seed: Optional[int] = None, options=Optional[Dict[str, Any]]) -> Tuple[Any, Dict[str, Any]]: diff --git a/rllte/hub/__init__.py b/rllte/hub/__init__.py index eb5d97df..0b8233f3 100644 --- a/rllte/hub/__init__.py +++ b/rllte/hub/__init__.py @@ -26,4 +26,4 @@ from .atari import Atari as Atari from .dmc import DMControl as DMControl from .minigrid import MiniGrid as MiniGrid -from .procgen import Procgen as Procgen \ No newline at end of file +from .procgen import Procgen as Procgen diff --git a/rllte/hub/atari.py b/rllte/hub/atari.py index c2484fcc..ef3f3c0b 100644 --- a/rllte/hub/atari.py +++ b/rllte/hub/atari.py @@ -23,17 +23,17 @@ # ============================================================================= -from huggingface_hub import hf_hub_download -from typing import Dict, Callable -from torch import nn +from typing import Dict import numpy as np import torch as th +from huggingface_hub import hf_hub_download +from torch import nn -from rllte.hub.bucket import Bucket -from rllte.agent import A2C, PPO, IMPALA -from rllte.env import make_atari_env, make_envpool_atari_env +from rllte.agent import A2C, IMPALA, PPO from rllte.common.prototype import BaseAgent +from rllte.env import make_atari_env, make_envpool_atari_env +from rllte.hub.bucket import Bucket class Atari(Bucket): @@ -44,39 +44,87 @@ class Atari(Bucket): Number of seeds: 10 Added algorithms: [PPO] """ + def __init__(self) -> None: super().__init__() - self.sup_env = ['Alien-v5', 'Amidar-v5', 'Assault-v5', 'Asterix-v5', 'Asteroids-v5', 'Atlantis-v5', 'YarsRevenge-v5', - 'BankHeist-v5', 'BattleZone-v5', 'BeamRider-v5', 'Berzerk-v5', 'Bowling-v5', 'Boxing-v5', 'Breakout-v5', - 'Centipede-v5', 'ChopperCommand-v5', 'CrazyClimber-v5', 'Defender-v5', 'DemonAttack-v5', 'DoubleDunk-v5', 'Zaxxon-v5', - 'Enduro-v5', 'FishingDerby-v5', 'Freeway-v5', 'Frostbite-v5', 'Gopher-v5', 'Gravitar-v5', 'Hero-v5', - 'IceHockey-v5', 'Jamesbond-v5', 'Kangaroo-v5', 'Krull-v5', 'KungFuMaster-v5', 'MontezumaRevenge-v5', 'Pitfall-v5', - 'PrivateEye-v5', 'Qbert-v5', 'Riverraid-v5', 'RoadRunner-v5', 'Robotank-v5', 'Seaquest-v5', 'Phoenix-v5', 'Pong-v5', - 'Skiing-v5', 'Solaris-v5', 'SpaceInvaders-v5', 'StarGunner-v5', 'Surround-v5', 'Tennis-v5', 'TimePilot-v5', - 'Tutankham-v5', 'UpNDown-v5', 'Venture-v5', 'VideoPinball-v5', 'WizardOfWor-v5', 'MsPacman-v5', 'NameThisGame-v5' - ] - self.sup_algo = ['ppo'] + self.sup_env = [ + "Alien-v5", + "Amidar-v5", + "Assault-v5", + "Asterix-v5", + "Asteroids-v5", + "Atlantis-v5", + "YarsRevenge-v5", + "BankHeist-v5", + "BattleZone-v5", + "BeamRider-v5", + "Berzerk-v5", 
+ "Bowling-v5", + "Boxing-v5", + "Breakout-v5", + "Centipede-v5", + "ChopperCommand-v5", + "CrazyClimber-v5", + "Defender-v5", + "DemonAttack-v5", + "DoubleDunk-v5", + "Zaxxon-v5", + "Enduro-v5", + "FishingDerby-v5", + "Freeway-v5", + "Frostbite-v5", + "Gopher-v5", + "Gravitar-v5", + "Hero-v5", + "IceHockey-v5", + "Jamesbond-v5", + "Kangaroo-v5", + "Krull-v5", + "KungFuMaster-v5", + "MontezumaRevenge-v5", + "Pitfall-v5", + "PrivateEye-v5", + "Qbert-v5", + "Riverraid-v5", + "RoadRunner-v5", + "Robotank-v5", + "Seaquest-v5", + "Phoenix-v5", + "Pong-v5", + "Skiing-v5", + "Solaris-v5", + "SpaceInvaders-v5", + "StarGunner-v5", + "Surround-v5", + "Tennis-v5", + "TimePilot-v5", + "Tutankham-v5", + "UpNDown-v5", + "Venture-v5", + "VideoPinball-v5", + "WizardOfWor-v5", + "MsPacman-v5", + "NameThisGame-v5", + ] + self.sup_algo = ["ppo"] def load_scores(self, env_id: str, agent: str) -> np.ndarray: """Returns final performance. - + Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Test scores data array with shape (N_SEEDS, N_POINTS). """ self.is_available(env_id=env_id, agent=agent.lower()) - scores_file = f'{agent.lower()}_atari_{env_id}_scores.npy' + scores_file = f"{agent.lower()}_atari_{env_id}_scores.npy" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=scores_file, - subfolder="atari/scores" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=scores_file, subfolder="atari/scores" ) return np.load(file) @@ -87,7 +135,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Learning curves data with structure: curves @@ -96,26 +144,18 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: """ self.is_available(env_id=env_id, agent=agent.lower()) - curves_file = f'{agent.lower()}_atari_{env_id}_curves.npz' + curves_file = f"{agent.lower()}_atari_{env_id}_curves.npz" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=curves_file, - subfolder="atari/curves" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=curves_file, subfolder="atari/curves" ) curves_dict = np.load(file, allow_pickle=True) curves_dict = dict(curves_dict) return curves_dict - - def load_models(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> nn.Module: + + def load_models(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> nn.Module: """Load the model from the hub. Args: @@ -128,20 +168,15 @@ def load_models(self, The loaded model. """ self.is_available(env_id=env_id, agent=agent.lower()) - + model_file = f"{agent.lower()}_atari_{env_id}_seed_{seed}.pth" subfolder = f"atari/{agent}" file = hf_hub_download(repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=model_file, subfolder=subfolder) model = th.load(file, map_location=device) return model.eval() - - def load_apis(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> BaseAgent: + + def load_apis(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> BaseAgent: """Load the a training API. 
Args: @@ -156,7 +191,7 @@ def load_apis(self, if agent.lower() == "ppo": # The following hyperparameters are from the repository: # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail - # Since the asynchronous mode achieved much lower training performance than the synchronous mode, + # Since the asynchronous mode achieved much lower training performance than the synchronous mode, # we recommend using the synchronous mode currently. envs = make_envpool_atari_env(env_id=env_id, num_envs=8, device=device, seed=seed, asynchronous=False) eval_envs = make_envpool_atari_env(env_id=env_id, num_envs=8, device=device, seed=seed, asynchronous=False) @@ -179,7 +214,7 @@ def load_apis(self, ent_coef=0.01, max_grad_norm=0.5, discount=0.99, - init_fn="orthogonal" + init_fn="orthogonal", ) elif agent.lower() == "a2c": # The following hyperparameters are from the repository: @@ -187,7 +222,7 @@ def load_apis(self, envs = make_envpool_atari_env(env_id=env_id, num_envs=16, device=device, seed=seed) eval_envs = make_envpool_atari_env(env_id=env_id, num_envs=16, device=device, seed=seed, asynchronous=False) - api = A2C( # type: ignore[assignment] + api = A2C( # type: ignore[assignment] env=envs, eval_env=eval_envs, tag=f"a2c_atari_{env_id}_seed_{seed}", @@ -210,7 +245,7 @@ def load_apis(self, # https://github.com/facebookresearch/torchbeast envs = make_atari_env(env_id=env_id, device=device, seed=seed, num_envs=45, asynchronous=False) eval_envs = make_atari_env(env_id=env_id, device=device, seed=seed, num_envs=1, asynchronous=False) - self.agent = IMPALA( # type: ignore[assignment] + self.agent = IMPALA( # type: ignore[assignment] env=envs, eval_env=eval_envs, tag=f"impala_atari_{env_id}_seed_{seed}", @@ -225,4 +260,4 @@ def load_apis(self, else: raise NotImplementedError(f"Agent {agent} is not supported currently, available agents are: [A2C, PPO, IMPALA].") - return api \ No newline at end of file + return api diff --git a/rllte/hub/bucket.py b/rllte/hub/bucket.py index faeb56da..7d00cf07 100644 --- a/rllte/hub/bucket.py +++ b/rllte/hub/bucket.py @@ -24,39 +24,40 @@ from abc import ABC, abstractmethod -from typing import Callable, Dict, List, Optional -from torch import nn +from typing import Dict, List, Optional + import numpy as np +from torch import nn from rllte.common.prototype import BaseAgent + class Bucket(ABC): """Bucket class for storing scores, learning curves, and models.""" + def __init__(self) -> None: super().__init__() self.sup_env: List = [] self.sup_algo: List = [] - def is_available(self, env_id: str, agent: Optional[str] = None) -> None: """Check if the dataset is available.""" - assert env_id in self.sup_env and agent in self.sup_algo, \ - f"Datasets for `{env_id}` and `{agent}` are not available currently!" - + assert ( + env_id in self.sup_env and agent in self.sup_algo + ), f"Datasets for `{env_id}` and `{agent}` are not available currently!" @abstractmethod def load_scores(self, env_id: str, agent: str) -> np.ndarray: """Returns final performance. - + Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Test scores data array with shape (N_SEEDS, N_POINTS). """ - @abstractmethod def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: @@ -65,21 +66,16 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: Args: env_id (str): Environment ID. agent_id (str): Agent name. 
- + Returns: Learning curves data with structure: curves ├── train: np.ndarray(shape=(N_SEEDS, N_POINTS)) └── eval: np.ndarray(shape=(N_SEEDS, N_POINTS)) """ - + @abstractmethod - def load_models(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> nn.Module: + def load_models(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> nn.Module: """Load the model from the hub. Args: @@ -93,12 +89,7 @@ def load_models(self, """ @abstractmethod - def load_apis(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> BaseAgent: + def load_apis(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> BaseAgent: """Load the a training API. Args: @@ -109,4 +100,4 @@ def load_apis(self, Returns: The loaded API. - """ \ No newline at end of file + """ diff --git a/rllte/hub/dmc.py b/rllte/hub/dmc.py index a795cd3b..94aaf12f 100644 --- a/rllte/hub/dmc.py +++ b/rllte/hub/dmc.py @@ -23,21 +23,23 @@ # ============================================================================= -from huggingface_hub import hf_hub_download from typing import Dict, Optional -from torch import nn import numpy as np import torch as th -from rllte.hub.bucket import Bucket +from huggingface_hub import hf_hub_download +from torch import nn + from rllte.agent import SAC, DrQv2 -from rllte.env import make_dmc_env from rllte.common.prototype import BaseAgent +from rllte.env import make_dmc_env +from rllte.hub.bucket import Bucket -# cheetah_run quadruped_walk quadruped_run walker_walk walker_run hopper_hop arcobot_swingup cup_catch +# cheetah_run quadruped_walk quadruped_run walker_walk walker_run hopper_hop arcobot_swingup cup_catch # cartpole_balance cartpole_balance_sparse cartpole_swingup cartpole_swingup_sparse finger_spin finger_turn_easy # finger_turn_hard fish_swim fish_upright hopper_stand pendulum_swingup quadruped_run reacher_easy reacher_hard swimmer_swimmer6 swimmer_swimmer15 + class DMControl(Bucket): """Scores and learning cures of various RL algorithms on the full DeepMind Control Suite benchmark. @@ -47,53 +49,70 @@ class DMControl(Bucket): Number of seeds: 10 Added algorithms: [SAC, DrQ-v2] """ + def __init__(self) -> None: super().__init__() - self.sup_env = ['acrobot_swingup', 'cartpole_balance', 'cartpole_balance_sparse', - 'cartpole_swingup', 'cartpole_swingup_sparse', 'cheetah_run', - 'cup_catch', 'finger_spin', 'finger_turn_easy', - 'finger_turn_hard', 'fish_swim', 'fish_upright', - 'hopper_hop', 'hopper_stand', 'pendulum_swingup', - 'quadruped_run', 'quadruped_walk', 'reacher_easy', - 'reacher_hard', 'swimmer_swimmer6', 'swimmer_swimmer15', - 'walker_run', 'walker_walk', 'walker_stand', - 'humanoid_walk', 'humanoid_run', 'humanoid_stand' - ] - self.sup_algo = ['sac'] - + self.sup_env = [ + "acrobot_swingup", + "cartpole_balance", + "cartpole_balance_sparse", + "cartpole_swingup", + "cartpole_swingup_sparse", + "cheetah_run", + "cup_catch", + "finger_spin", + "finger_turn_easy", + "finger_turn_hard", + "fish_swim", + "fish_upright", + "hopper_hop", + "hopper_stand", + "pendulum_swingup", + "quadruped_run", + "quadruped_walk", + "reacher_easy", + "reacher_hard", + "swimmer_swimmer6", + "swimmer_swimmer15", + "walker_run", + "walker_walk", + "walker_stand", + "humanoid_walk", + "humanoid_run", + "humanoid_stand", + ] + self.sup_algo = ["sac"] + def get_obs_type(self, agent: str) -> str: """Returns the observation type of the agent. - + Args: agent (str): Agent name. - + Returns: Observation type. 
""" - obs_type = 'state' if agent in ['sac'] else 'pixel' + obs_type = "state" if agent in ["sac"] else "pixel" return obs_type def load_scores(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: """Returns final performance. - + Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Test scores data array with shape (N_SEEDS, N_POINTS). """ self.is_available(env_id=env_id, agent=agent.lower()) obs_type = self.get_obs_type(agent=agent.lower()) - scores_file = f'{agent.lower()}_dmc_{obs_type}_{env_id}_scores.npy' + scores_file = f"{agent.lower()}_dmc_{obs_type}_{env_id}_scores.npy" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=scores_file, - subfolder="dmc/scores" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=scores_file, subfolder="dmc/scores" ) return np.load(file) @@ -105,7 +124,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: env_id (str): Environment ID. agent_id (str): Agent name. obs_type (str): A type from ['state', 'pixel']. - + Returns: Learning curves data with structure: curves @@ -115,13 +134,10 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: self.is_available(env_id=env_id, agent=agent.lower()) obs_type = self.get_obs_type(agent=agent.lower()) - curves_file = f'{agent.lower()}_dmc_{obs_type}_{env_id}_curves.npz' + curves_file = f"{agent.lower()}_dmc_{obs_type}_{env_id}_curves.npz" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=curves_file, - subfolder="dmc/curves" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=curves_file, subfolder="dmc/curves" ) curves_dict = np.load(file, allow_pickle=True) @@ -129,12 +145,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: return curves_dict - def load_models(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> nn.Module: + def load_models(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> nn.Module: """Load the model from the hub. Args: @@ -156,13 +167,7 @@ def load_models(self, return model.eval() - - def load_apis(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> BaseAgent: + def load_apis(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> BaseAgent: """Load the a training API. Args: @@ -212,7 +217,7 @@ def load_apis(self, visualize_reward=False, frame_stack=3, action_repeat=2, - asynchronous=False + asynchronous=False, ) eval_envs = make_dmc_env( env_id=env_id, @@ -223,10 +228,10 @@ def load_apis(self, visualize_reward=False, frame_stack=3, action_repeat=2, - asynchronous=False + asynchronous=False, ) # create agent - api = DrQv2( # type: ignore[assignment] + api = DrQv2( # type: ignore[assignment] env=envs, eval_env=eval_envs, tag=f"drqv2_dmc_pixel_{env_id}_seed_{seed}", @@ -246,4 +251,4 @@ def load_apis(self, f"Agent {agent} is not supported currently, available agents are: [SAC, DDPG, TD3, DrQv2]." 
) - return api \ No newline at end of file + return api diff --git a/rllte/hub/minigrid.py b/rllte/hub/minigrid.py index ab31ba53..4cd7e12a 100644 --- a/rllte/hub/minigrid.py +++ b/rllte/hub/minigrid.py @@ -23,16 +23,17 @@ # ============================================================================= -from huggingface_hub import hf_hub_download from typing import Dict, Optional -from torch import nn import numpy as np import torch as th -from rllte.hub.bucket import Bucket +from huggingface_hub import hf_hub_download +from torch import nn + from rllte.agent import A2C, PPO -from rllte.env import make_minigrid_env from rllte.common.prototype import BaseAgent +from rllte.env import make_minigrid_env +from rllte.hub.bucket import Bucket class MiniGrid(Bucket): @@ -43,31 +44,29 @@ class MiniGrid(Bucket): Number of seeds: 10 Added algorithms: [A2C] """ + def __init__(self) -> None: super().__init__() - self.sup_env = ['Empty-6x6-v0'] - self.sup_algo = ['ppo'] + self.sup_env = ["Empty-6x6-v0"] + self.sup_algo = ["ppo"] def load_scores(self, env_id: str, agent: str) -> np.ndarray: """Returns final performance. - + Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Test scores data array with shape (N_SEEDS, N_POINTS). """ self.is_available(env_id=env_id, agent=agent.lower()) - scores_file = f'{agent.lower()}_minigrid_{env_id}_scores.npy' + scores_file = f"{agent.lower()}_minigrid_{env_id}_scores.npy" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=scores_file, - subfolder="minigrid/scores" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=scores_file, subfolder="minigrid/scores" ) return np.load(file) @@ -78,7 +77,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Learning curves data with structure: curves @@ -87,13 +86,10 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: """ self.is_available(env_id=env_id, agent=agent.lower()) - curves_file = f'{agent.lower()}_minigrid_{env_id}_curves.npz' + curves_file = f"{agent.lower()}_minigrid_{env_id}_curves.npz" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=curves_file, - subfolder="minigrid/curves" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=curves_file, subfolder="minigrid/curves" ) curves_dict = np.load(file, allow_pickle=True) @@ -101,13 +97,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: return curves_dict - - def load_models(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> nn.Module: + def load_models(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> nn.Module: """Load the model from the hub. Args: @@ -120,21 +110,15 @@ def load_models(self, The loaded model. """ self.is_available(env_id=env_id, agent=agent.lower()) - + model_file = f"{agent.lower()}_minigrid_{env_id}_seed_{seed}.pth" subfolder = f"minigrid/{agent}" file = hf_hub_download(repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=model_file, subfolder=subfolder) model = th.load(file, map_location=device) return model.eval() - - - def load_apis(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> BaseAgent: + + def load_apis(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> BaseAgent: """Load the a training API. 
Args: @@ -177,7 +161,7 @@ def load_apis(self, # https://github.com/lcswillems/rl-starter-files envs = make_minigrid_env(env_id=env_id, num_envs=1, device=device, seed=seed) eval_envs = make_minigrid_env(env_id=env_id, num_envs=1, device=device, seed=seed) - api = A2C( # type: ignore[assignment] + api = A2C( # type: ignore[assignment] env=envs, eval_env=eval_envs, tag=f"a2c_{env_id}_seed_{seed}", @@ -198,4 +182,4 @@ def load_apis(self, else: raise NotImplementedError(f"Agent {agent} is not supported currently, available agents are: [A2C, PPO].") - return api \ No newline at end of file + return api diff --git a/rllte/hub/procgen.py b/rllte/hub/procgen.py index 47e88fbb..8dd16ea6 100644 --- a/rllte/hub/procgen.py +++ b/rllte/hub/procgen.py @@ -23,17 +23,18 @@ # ============================================================================= -from huggingface_hub import hf_hub_download from typing import Dict, Optional -from torch import nn import numpy as np import torch as th +from huggingface_hub import hf_hub_download +from torch import nn + +from rllte.agent import DAAC, PPG, PPO +from rllte.common.prototype import BaseAgent +from rllte.env import make_envpool_procgen_env from rllte.hub.bucket import Bucket -from rllte.agent import PPO, PPG, DAAC from rllte.xploit.encoder import EspeholtResidualEncoder -from rllte.env import make_envpool_procgen_env -from rllte.common.prototype import BaseAgent class Procgen(Bucket): @@ -44,36 +45,46 @@ class Procgen(Bucket): Number of seeds: 10 Added algorithms: [PPO] """ + def __init__(self) -> None: super().__init__() - self.sup_env = ['bigfish', 'bossfight', 'caveflyer', 'chaser', - 'climber', 'coinrun', 'dodgeball', 'fruitbot', - 'heist', 'jumper', 'leaper', 'maze', - 'miner', 'ninja', 'plunder', 'starpilot' - ] - self.sup_algo = ['ppo'] - + self.sup_env = [ + "bigfish", + "bossfight", + "caveflyer", + "chaser", + "climber", + "coinrun", + "dodgeball", + "fruitbot", + "heist", + "jumper", + "leaper", + "maze", + "miner", + "ninja", + "plunder", + "starpilot", + ] + self.sup_algo = ["ppo"] def load_scores(self, env_id: str, agent: str) -> np.ndarray: """Returns final performance. - + Args: env_id (str): Environment ID. agent_id (str): Agent name. - + Returns: Test scores data array with shape (N_SEEDS, N_POINTS). """ self.is_available(env_id=env_id, agent=agent.lower()) - scores_file = f'{agent.lower()}_procgen_{env_id}_scores.npy' + scores_file = f"{agent.lower()}_procgen_{env_id}_scores.npy" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=scores_file, - subfolder="procgen/scores" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=scores_file, subfolder="procgen/scores" ) return np.load(file) @@ -84,7 +95,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: Args: env_id (str): Environment ID. agent_id (str): Agent name. 
- + Returns: Learning curves data with structure: curves @@ -93,13 +104,10 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: """ self.is_available(env_id=env_id, agent=agent.lower()) - curves_file = f'{agent.lower()}_procgen_{env_id}_curves.npz' + curves_file = f"{agent.lower()}_procgen_{env_id}_curves.npz" file = hf_hub_download( - repo_id="RLE-Foundation/rllte-hub", - repo_type="model", - filename=curves_file, - subfolder="procgen/curves" + repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=curves_file, subfolder="procgen/curves" ) curves_dict = np.load(file, allow_pickle=True) @@ -107,12 +115,7 @@ def load_curves(self, env_id: str, agent: str) -> Dict[str, np.ndarray]: return curves_dict - def load_models(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> nn.Module: + def load_models(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> nn.Module: """Load the model from the hub. Args: @@ -125,7 +128,7 @@ def load_models(self, The loaded model. """ self.is_available(env_id=env_id, agent=agent.lower()) - + model_file = f"{agent.lower()}_procgen_{env_id}_seed_{seed}.pth" subfolder = f"procgen/{agent}" file = hf_hub_download(repo_id="RLE-Foundation/rllte-hub", repo_type="model", filename=model_file, subfolder=subfolder) @@ -133,13 +136,7 @@ def load_models(self, return model.eval() - - def load_apis(self, - env_id: str, - agent: str, - seed: int, - device: str = "cpu" - ) -> BaseAgent: + def load_apis(self, env_id: str, agent: str, seed: int, device: str = "cpu") -> BaseAgent: """Load the a training API. Args: @@ -152,27 +149,27 @@ def load_apis(self, The loaded API. """ envs = make_envpool_procgen_env( - env_id=env_id, - num_envs=64, - device=device, - seed=seed, - gamma=0.99, - num_levels=200, - start_level=0, - distribution_mode="easy", - asynchronous=False - ) + env_id=env_id, + num_envs=64, + device=device, + seed=seed, + gamma=0.99, + num_levels=200, + start_level=0, + distribution_mode="easy", + asynchronous=False, + ) eval_envs = make_envpool_procgen_env( - env_id=env_id, - num_envs=1, - device=device, - seed=seed, - gamma=0.99, - num_levels=0, - start_level=0, - distribution_mode="easy", - asynchronous=False, - ) + env_id=env_id, + num_envs=1, + device=device, + seed=seed, + gamma=0.99, + num_levels=0, + start_level=0, + distribution_mode="easy", + asynchronous=False, + ) feature_dim = 256 if agent.lower() == "ppo": @@ -200,28 +197,28 @@ def load_apis(self, elif agent.lower() == "daac": # Best hyperparameters for DAAC reported in # https://github.com/rraileanu/idaac/blob/main/hyperparams.py - if env_id in ['plunder', 'chaser']: + if env_id in ["plunder", "chaser"]: value_epochs = 1 else: value_epochs = 9 - - if env_id in ['miner', 'bigfish', 'dodgeball']: + + if env_id in ["miner", "bigfish", "dodgeball"]: value_freq = 32 - elif env_id == 'plunder': + elif env_id == "plunder": value_freq = 8 else: value_freq = 1 - - if env_id == 'plunder': + + if env_id == "plunder": adv_coef = 0.3 - elif env_id == 'chaser': + elif env_id == "chaser": adv_coef = 0.15 - elif env_id in ['climber', 'bigfish']: + elif env_id in ["climber", "bigfish"]: adv_coef = 0.05 else: adv_coef = 0.25 - api = DAAC( # type: ignore[assignment] + api = DAAC( # type: ignore[assignment] env=envs, eval_env=eval_envs, tag=f"daac_procgen_{env_id}_seed_{seed}", @@ -250,4 +247,4 @@ def load_apis(self, encoder = EspeholtResidualEncoder(observation_space=envs.observation_space, feature_dim=feature_dim) api.set(encoder=encoder) - return api \ No 
newline at end of file + return api diff --git a/rllte/xploit/encoder/mnih_cnn_encoder.py b/rllte/xploit/encoder/mnih_cnn_encoder.py index aa0deed7..b0b76446 100644 --- a/rllte/xploit/encoder/mnih_cnn_encoder.py +++ b/rllte/xploit/encoder/mnih_cnn_encoder.py @@ -56,7 +56,7 @@ def __init__(self, observation_space: gym.Space, feature_dim: int = 0) -> None: nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(), - nn.Conv2d(64, 32, 3, stride=1), + nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(), nn.Flatten(), ) diff --git a/rllte/xploit/policy/__init__.py b/rllte/xploit/policy/__init__.py index 4c36b7be..2c3d24e9 100644 --- a/rllte/xploit/policy/__init__.py +++ b/rllte/xploit/policy/__init__.py @@ -30,3 +30,4 @@ from .off_policy_stoch_actor_double_critic import OffPolicyStochActorDoubleCritic as OffPolicyStochActorDoubleCritic from .on_policy_decoupled_actor_critic import OnPolicyDecoupledActorCritic as OnPolicyDecoupledActorCritic from .on_policy_shared_actor_critic import OnPolicySharedActorCritic as OnPolicySharedActorCritic +from .on_policy_shared_actor_critic_lstm import OnPolicySharedActorCriticLSTM as OnPolicySharedActorCriticLSTM diff --git a/rllte/xploit/policy/off_policy_stoch_actor_double_critic.py b/rllte/xploit/policy/off_policy_stoch_actor_double_critic.py index 5b63f750..07a10510 100644 --- a/rllte/xploit/policy/off_policy_stoch_actor_double_critic.py +++ b/rllte/xploit/policy/off_policy_stoch_actor_double_critic.py @@ -83,7 +83,7 @@ def __init__( # build actor and critic actor_kwargs = {"action_dim": self.policy_action_dim, "hidden_dim": self.hidden_dim, "feature_dim": self.feature_dim} if self.action_type == "Box": - actor_kwargs["log_std_range"] = log_std_range # type: ignore[assignment] + actor_kwargs["log_std_range"] = log_std_range # type: ignore[assignment] self.actor = get_off_policy_actor(action_type=self.action_type, actor_kwargs=actor_kwargs) diff --git a/rllte/xploit/policy/on_policy_shared_actor_critic_lstm.py b/rllte/xploit/policy/on_policy_shared_actor_critic_lstm.py new file mode 100644 index 00000000..91ead5ee --- /dev/null +++ b/rllte/xploit/policy/on_policy_shared_actor_critic_lstm.py @@ -0,0 +1,277 @@ +# ============================================================================= +# MIT License + +# Copyright (c) 2023 Reinforcement Learning Evolution Foundation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# ============================================================================= +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict, Optional, Tuple, Type + +import gymnasium as gym +import torch as th +from torch import nn + +from rllte.common.prototype import BaseDistribution as Distribution +from rllte.common.prototype import BasePolicy +from rllte.common.utils import ExportModel + +from .utils import OnPolicyCritic, get_on_policy_actor + +# from torch.distributions import Distribution + + +class OnPolicySharedActorCriticLSTM(BasePolicy): + """Actor-Critic network for on-policy algorithms like `PPO` and `A2C`. Contains LSTM. + + Args: + observation_space (gym.Space): Observation space. + action_space (gym.Space): Action space. + feature_dim (int): Number of features accepted. + hidden_dim (int): Number of units per hidden layer. + opt_class (Type[th.optim.Optimizer]): Optimizer class. + opt_kwargs (Dict[str, Any]): Optimizer keyword arguments. + aux_critic (bool): Use auxiliary critic or not, for `PPG` agent. + init_fn (str): Parameters initialization method. + + Returns: + Actor-Critic-LSTM network instance. + """ + + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + feature_dim: int, + hidden_dim: int = 512, + opt_class: Type[th.optim.Optimizer] = th.optim.Adam, + opt_kwargs: Optional[Dict[str, Any]] = None, + aux_critic: bool = False, + init_fn: str = "orthogonal", + ) -> None: + if opt_kwargs is None: + opt_kwargs = {} + super().__init__( + observation_space=observation_space, + action_space=action_space, + feature_dim=feature_dim, + hidden_dim=hidden_dim, + opt_class=opt_class, + opt_kwargs=opt_kwargs, + init_fn=init_fn, + ) + + assert self.action_type in [ + "Discrete", + "Box", + "MultiBinary", + "MultiDiscrete", + ], f"Unsupported action type {self.action_type}!" 
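# ----------------------------------------------------------------------------
# Illustrative sketch (not from the diff): the hidden-state bookkeeping a
# caller of this recurrent policy performs. `num_envs` and `feature_dim` are
# example values; the shapes follow torch.nn.LSTM conventions
# (num_layers, batch, hidden_size), with a single layer and
# hidden_size = feature_dim // 4 as built below.
import torch as th

num_envs, feature_dim = 16, 512
hidden_size = feature_dim // 4
lstm_state = (
    th.zeros(1, num_envs, hidden_size),  # h_0
    th.zeros(1, num_envs, hidden_size),  # c_0
)
# `get_states` (defined further down) masks the carried state with (1 - done),
# so environments whose episode just ended restart from a zero state:
done = th.zeros(num_envs)  # 1.0 where an episode terminated at this step
masked_state = tuple((1.0 - done).view(1, -1, 1) * s for s in lstm_state)
# ----------------------------------------------------------------------------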
+ + # build lstm + self.lstm = nn.LSTM(feature_dim, feature_dim // 4) + for name, param in self.lstm.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0) + elif "weight" in name: + nn.init.orthogonal_(param, 1.0) + + # build actor and critic + actor_kwargs = dict( + obs_shape=self.obs_shape, + action_dim=self.policy_action_dim, + feature_dim=feature_dim // 4, + hidden_dim=self.hidden_dim, + ) + if self.nvec is not None: + actor_kwargs["nvec"] = self.nvec + + self.actor = get_on_policy_actor(action_type=self.action_type, actor_kwargs=actor_kwargs) + + self.critic = OnPolicyCritic( + obs_shape=self.obs_shape, + action_dim=self.policy_action_dim, + feature_dim=feature_dim // 4, + hidden_dim=self.hidden_dim, + ) + if aux_critic: + self.aux_critic = deepcopy(self.critic) + + @staticmethod + def describe() -> None: + """Describe the policy.""" + print("\n") + print("=" * 80) + print(f"{'Name'.ljust(10)} : OnPolicySharedActorCritic") + print(f"{'Structure'.ljust(10)} : self.encoder (shared by actor and critic), self.actor, self.critic") + print(f"{''.ljust(10)} : self.aux_critic (optional, for PPG)") + print(f"{'Forward'.ljust(10)} : obs -> self.encoder -> self.actor -> actions") + print(f"{''.ljust(10)} : obs -> self.encoder -> self.critic -> values") + print(f"{''.ljust(10)} : actions -> log_probs") + print(f"{'Optimizers'.ljust(10)} : self.optimizers['opt'] -> (self.encoder, self.actor, self.critic)") + print(f"{''.ljust(10)} : self.optimizers['opt'] -> self.aux_critic (optional, for PPG)") + print("=" * 80) + print("\n") + + def freeze(self, encoder: nn.Module, dist: Distribution) -> None: + """Freeze all the elements like `encoder` and `dist`. + + Args: + encoder (nn.Module): Encoder network. + dist (Distribution): Distribution class. + + Returns: + None. + """ + # set encoder + assert encoder is not None, "Encoder should not be None!" + self.encoder = encoder + # set distribution + assert dist is not None, "Distribution should not be None!" + self.dist = dist + # initialize parameters + self.apply(self.init_fn) + # build optimizers + self._optimizers["opt"] = self.opt_class(self.parameters(), **self.opt_kwargs) + + def get_states(self, obs: th.Tensor, lstm_state: th.Tensor, done: th.Tensor): + hidden = self.encoder(obs) + + # LSTM logic + batch_size = lstm_state[0].shape[1] + hidden = hidden.reshape((-1, batch_size, self.lstm.input_size)) + done = done.reshape((-1, batch_size)).long() + new_hidden = [] + for h, d in zip(hidden, done): + h, lstm_state = self.lstm( + h.unsqueeze(0), + ( + (1.0 - d).view(1, -1, 1) * lstm_state[0], + (1.0 - d).view(1, -1, 1) * lstm_state[1], + ), + ) + new_hidden += [h] + new_hidden = th.flatten(th.cat(new_hidden), 0, 1) + return new_hidden, lstm_state + + def forward( + self, obs: th.Tensor, lstm_state: th.Tensor, done: th.Tensor, training: bool = True + ) -> Tuple[th.Tensor, Dict[str, th.Tensor]]: + """Get actions and estimated values for observations. + + Args: + obs (th.Tensor): Observations. + training (bool): training mode, `True` or `False`. + + Returns: + Sampled actions, estimated values, and log of probabilities for observations when `training` is `True`, + else only deterministic actions. 
+ """ + h, lstm_state = self.get_states(obs, lstm_state, done) + + policy_outputs = self.actor.get_policy_outputs(h) + dist = self.dist(*policy_outputs) + + if training: + actions = dist.sample() + log_probs = dist.log_prob(actions) + return actions, {"values": self.critic(h), "log_probs": log_probs, "lstm_state": lstm_state} + else: + actions = dist.mean + return actions, {"lstm_state": lstm_state} + + def get_value(self, obs: th.Tensor, lstm_state: th.Tensor, done: th.Tensor) -> th.Tensor: + """Get estimated values for observations. + + Args: + obs (th.Tensor): Observations. + + Returns: + Estimated values. + """ + return self.critic(self.get_states(obs, lstm_state, done)[0]) + + def evaluate_actions( + self, obs: th.Tensor, actions: th.Tensor, lstm_state: th.Tensor, done: th.Tensor + ) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: + """Evaluate actions according to the current policy given the observations. + + Args: + obs (th.Tensor): Sampled observations. + actions (th.Tensor): Sampled actions. + + Returns: + Estimated values, log of the probability evaluated at `actions`, entropy of distribution. + """ + h, _ = self.get_states(obs, lstm_state, done) + policy_outputs = self.actor.get_policy_outputs(h) + dist = self.dist(*policy_outputs) + + log_probs = dist.log_prob(actions) + entropy = dist.entropy().mean() + + return self.critic(h), log_probs, entropy + + def get_policy_outputs(self, obs: th.Tensor, lstm_state: th.Tensor, done: th.Tensor) -> th.Tensor: + """Get policy outputs for training. + + Args: + obs (Tensor): Observations. + + Returns: + Policy outputs like unnormalized probabilities for `Discrete` tasks. + """ + h, _ = self.get_states(obs, lstm_state, done) + policy_outputs = self.actor.get_policy_outputs(h) + return th.cat(policy_outputs, dim=1) + + def get_dist_and_aux_value( + self, obs: th.Tensor, lstm_state: th.Tensor, done: th.Tensor + ) -> Tuple[Distribution, th.Tensor, th.Tensor]: + """Get probs and auxiliary estimated values for auxiliary phase update. + + Args: + obs: Sampled observations. + + Returns: + Sample distribution, estimated values, auxiliary estimated values. + """ + h, _ = self.get_states(obs, lstm_state, done) + policy_outputs = self.actor.get_policy_outputs(h) + dist = self.dist(*policy_outputs) + + return dist, self.critic(h.detach()), self.aux_critic(h) + + def save(self, path: Path, pretraining: bool, global_step: int) -> None: + """Save models. + + Args: + path (Path): Save path. + pretraining (bool): Pre-training mode. + global_step (int): Global training step. + + Returns: + None. 
+ """ + if pretraining: # pretraining + th.save(self.state_dict(), path / f"pretrained_{global_step}.pth") + else: + export_model = ExportModel(encoder=self.encoder, actor=self.actor) + th.save(export_model, path / f"agent_{global_step}.pth") diff --git a/rllte/xploit/policy/utils.py b/rllte/xploit/policy/utils.py index e093cf87..9a02a980 100644 --- a/rllte/xploit/policy/utils.py +++ b/rllte/xploit/policy/utils.py @@ -29,7 +29,7 @@ from torch import nn from torch.nn import functional as F -from rllte.common.type_alias import ObsShape, BaseDistribution +from rllte.common.type_alias import BaseDistribution, ObsShape class OnPolicyDiscreteActor(nn.Module): @@ -548,7 +548,7 @@ def get_off_policy_actor(action_type: str, actor_kwargs: Dict) -> Union[OffPolic if action_type in ["Discrete"]: actor_class = OffPolicyDiscreteActor elif action_type == "Box": - actor_class = OffPolicyBoxActor # type: ignore[assignment] + actor_class = OffPolicyBoxActor # type: ignore[assignment] else: raise NotImplementedError(f"Unsupported action type {action_type}!") return actor_class(**actor_kwargs) diff --git a/rllte/xploit/storage/__init__.py b/rllte/xploit/storage/__init__.py index fbc3683d..31fcf4f3 100644 --- a/rllte/xploit/storage/__init__.py +++ b/rllte/xploit/storage/__init__.py @@ -24,6 +24,7 @@ from .dict_replay_storage import DictReplayStorage as DictReplayStorage from .dict_rollout_storage import DictRolloutStorage as DictRolloutStorage +from .episodic_rollout_storage import EpisodicRolloutStorage as EpisodicRolloutStorage from .her_replay_storage import HerReplayStorage as HerReplayStorage from .nstep_replay_storage import NStepReplayStorage as NStepReplayStorage from .prioritized_replay_storage import PrioritizedReplayStorage as PrioritizedReplayStorage diff --git a/rllte/xploit/storage/dict_replay_storage.py b/rllte/xploit/storage/dict_replay_storage.py index fde1f3f0..9ce464ac 100644 --- a/rllte/xploit/storage/dict_replay_storage.py +++ b/rllte/xploit/storage/dict_replay_storage.py @@ -29,8 +29,9 @@ import numpy as np import torch as th -from rllte.xploit.storage.vanilla_replay_storage import VanillaReplayStorage from rllte.common.type_alias import DictReplayBatch +from rllte.xploit.storage.vanilla_replay_storage import VanillaReplayStorage + class DictReplayStorage(VanillaReplayStorage): """Dict replay storage for off-policy algorithms and dictionary observations. diff --git a/rllte/xploit/storage/episodic_rollout_storage.py b/rllte/xploit/storage/episodic_rollout_storage.py new file mode 100644 index 00000000..04261109 --- /dev/null +++ b/rllte/xploit/storage/episodic_rollout_storage.py @@ -0,0 +1,196 @@ +# ============================================================================= +# MIT License + +# Copyright (c) 2023 Reinforcement Learning Evolution Foundation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================= + + +from typing import Dict, Generator + +import gymnasium as gym +import numpy as np +import torch as th +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler + +from rllte.common.prototype import BaseStorage +from rllte.common.type_alias import EpisodicRolloutBatch + + +class EpisodicRolloutStorage(BaseStorage): + """Episodic rollout storage for on-policy algorithms that use an LSTM. + It is the same as VanillaRolloutStorage but samples enitre trajectories instead of batches of different steps. + + Args: + observation_space (gym.Space): The observation space of environment. + action_space (gym.Space): The action space of environment. + device (str): Device to convert the data. + storage_size (int): The capacity of the storage. Here it refers to the length of per rollout. + batch_size (int): Batch size of samples. + num_envs (int): The number of parallel environments. + discount (float): The discount factor. + gae_lambda (float): Weighting coefficient for generalized advantage estimation (GAE). + + Returns: + Vanilla rollout storage. + """ + + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + device: str = "cpu", + storage_size: int = 256, + num_envs: int = 8, + discount: float = 0.999, + gae_lambda: float = 0.95, + num_batches: int = 4, + ) -> None: + batch_size = (num_envs * storage_size) // num_batches + super().__init__(observation_space, action_space, device, storage_size, batch_size, num_envs) + self.discount = discount + self.gae_lambda = gae_lambda + self.num_batches = num_batches + self.reset() + + def reset(self) -> None: + """Reset the storage.""" + # data containers + self.observations = th.empty( + size=(self.storage_size + 1, self.num_envs, *self.obs_shape), dtype=th.float32, device=self.device + ) + self.actions = th.empty(size=(self.storage_size, self.num_envs, self.action_dim), dtype=th.float32, device=self.device) + self.rewards = th.empty(size=(self.storage_size, self.num_envs), dtype=th.float32, device=self.device) + self.terminateds = th.empty(size=(self.storage_size + 1, self.num_envs), dtype=th.float32, device=self.device) + self.truncateds = th.empty(size=(self.storage_size + 1, self.num_envs), dtype=th.float32, device=self.device) + # first next_terminated + self.terminateds[0].fill_(0.0) + self.truncateds[0].fill_(0.0) + # extra part + self.log_probs = th.empty(size=(self.storage_size, self.num_envs), dtype=th.float32, device=self.device) + self.values = th.empty(size=(self.storage_size, self.num_envs), dtype=th.float32, device=self.device) + self.returns = th.empty(size=(self.storage_size, self.num_envs), dtype=th.float32, device=self.device) + self.advantages = th.empty(size=(self.storage_size, self.num_envs), dtype=th.float32, device=self.device) + super().reset() + + def add( + self, + observations: th.Tensor, + actions: th.Tensor, + rewards: th.Tensor, + terminateds: th.Tensor, + truncateds: th.Tensor, + infos: Dict, + next_observations: th.Tensor, + 
log_probs: th.Tensor, + values: th.Tensor, + ) -> None: + """Add sampled transitions into storage. + + Args: + observations (th.Tensor): Observations. + actions (th.Tensor): Actions. + rewards (th.Tensor): Rewards. + terminateds (th.Tensor): Termination signals. + truncateds (th.Tensor): Truncation signals. + infos (Dict): Extra information. + next_observations (th.Tensor): Next observations. + log_probs (th.Tensor): Log of the probability evaluated at `actions`. + values (th.Tensor): Estimated values. + + Returns: + None. + """ + self.observations[self.step].copy_(observations) + self.actions[self.step].copy_(actions.view(self.num_envs, self.action_dim)) + self.rewards[self.step].copy_(rewards) + self.terminateds[self.step + 1].copy_(terminateds) + self.truncateds[self.step + 1].copy_(truncateds) + self.observations[self.step + 1].copy_(next_observations) + self.log_probs[self.step].copy_(log_probs) + self.values[self.step].copy_(values.flatten()) + + self.full = True if self.step == self.storage_size - 1 else False + self.step = (self.step + 1) % self.storage_size + + def update(self) -> None: + """Update the terminal state of each env.""" + self.terminateds[0].copy_(self.terminateds[-1]) + self.truncateds[0].copy_(self.truncateds[-1]) + + def compute_returns_and_advantages(self, last_values: th.Tensor) -> None: + """Perform generalized advantage estimation (GAE). + + Args: + last_values (th.Tensor): Estimated values of the last step. + + Returns: + None. + """ + gae = 0 + for step in reversed(range(self.storage_size)): + if step == self.storage_size - 1: + next_values = last_values[:, 0] + else: + next_values = self.values[step + 1] + next_non_terminal = 1.0 - self.terminateds[step + 1] + delta = self.rewards[step] + self.discount * next_values * next_non_terminal - self.values[step] + gae = delta + self.discount * self.gae_lambda * next_non_terminal * gae + # time limit + gae = gae * (1.0 - self.truncateds[step + 1]) + self.advantages[step] = gae + + self.returns = self.advantages + self.values + self.advantages = (self.advantages - self.advantages.mean()) / (self.advantages.std() + 1e-5) + + def sample(self) -> Generator: + """ + Choose a minibatch of environment indices and sample the entire rollout for those minibatches. + By not sampling uniform transitions, we can now train an LSTM model on entire trajectories + """ + assert self.full, "Cannot sample when the storage is not full!" 
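# ----------------------------------------------------------------------------
# Illustrative sketch (not from the diff): why sampling *environment* indices
# preserves whole trajectories. After the buffers are flattened to
# (storage_size * num_envs, ...), step t of environment e lives at flat index
# t * num_envs + e, so selecting a column of `flat_idcs` yields one
# environment's rollout in time order. Toy numbers:
import numpy as np

storage_size, num_envs = 4, 3
flat_idcs = np.arange(num_envs * storage_size).reshape(storage_size, num_envs)
print(flat_idcs[:, [1]].ravel())  # [ 1  4  7 10] -> all steps of env 1, in order
# An LSTM can therefore be unrolled over such a minibatch without shuffling time.
# ----------------------------------------------------------------------------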
+ _batch_size = self.num_envs // self.num_batches + sampler = BatchSampler(SubsetRandomSampler(range(self.num_envs)), _batch_size, drop_last=True) + + b_obs = self.observations[:-1].reshape(-1, *self.obs_shape) + b_act = self.actions.reshape(-1, *self.action_shape) + b_val = self.values.reshape(-1) + b_ret = self.returns.reshape(-1) + b_ter = self.terminateds[:-1].reshape(-1) + b_tru = self.truncateds[:-1].reshape(-1) + b_log = self.log_probs.reshape(-1) + b_adv = self.advantages.reshape(-1) + + flat_idcs = np.arange(self.num_envs * self.storage_size).reshape(self.storage_size, self.num_envs) + + for indices in sampler: + mb_inds = flat_idcs[:, indices].ravel() + + yield EpisodicRolloutBatch( + observations=b_obs[mb_inds], + actions=b_act[mb_inds], + values=b_val[mb_inds], + returns=b_ret[mb_inds], + terminateds=b_ter[mb_inds], + truncateds=b_tru[mb_inds], + old_log_probs=b_log[mb_inds], + adv_targ=b_adv[mb_inds], + env_inds=indices, + ) diff --git a/rllte/xplore/reward/ride.py b/rllte/xplore/reward/ride.py index 837f6211..089989f6 100644 --- a/rllte/xplore/reward/ride.py +++ b/rllte/xplore/reward/ride.py @@ -330,6 +330,6 @@ def update(self, samples: Dict) -> None: self.encoder_opt.step() self.im_opt.step() self.fm_opt.step() - + def add(self, samples: Dict) -> None: """Add new samples to the intrinsic reward module.""" diff --git a/tests/test_env.py b/tests/test_env.py index a6a0b907..b0057ac2 100644 --- a/tests/test_env.py +++ b/tests/test_env.py @@ -13,14 +13,7 @@ @pytest.mark.parametrize( "env_cls", - [ - make_atari_env, - make_minigrid_env, - make_procgen_env, - make_dmc_env, - make_envpool_atari_env, - make_envpool_procgen_env - ], + [make_atari_env, make_minigrid_env, make_procgen_env, make_dmc_env, make_envpool_atari_env, make_envpool_procgen_env], ) @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_discrete_env(env_cls, device): @@ -28,7 +21,7 @@ def test_discrete_env(env_cls, device): if env_cls in [make_procgen_env]: env = env_cls(device=device, num_envs=num_envs) else: - # when set `asynchronous=True` for all the envs, + # when set `asynchronous=True` for all the envs, # the test will raise an EOF error env = env_cls(device=device, num_envs=num_envs, asynchronous=False) _ = env.reset() @@ -38,8 +31,7 @@ def test_discrete_env(env_cls, device): for _ in range(10): action = env.action_space.sample() - if env_cls in [make_atari_env, make_minigrid_env, make_procgen_env, - make_envpool_atari_env, make_envpool_procgen_env]: + if env_cls in [make_atari_env, make_minigrid_env, make_procgen_env, make_envpool_atari_env, make_envpool_procgen_env]: action = th.randint(0, env.action_space.n, (num_envs,)).to(device) else: action = th.rand(size=(num_envs, env.action_space.shape[0])).to(device) diff --git a/tests/test_reward.py b/tests/test_reward.py index 4951be73..2082fff2 100644 --- a/tests/test_reward.py +++ b/tests/test_reward.py @@ -1,11 +1,7 @@ import pytest import torch as th -from rllte.env.testing import (make_box_env, - make_discrete_env, - make_multibinary_env, - make_multidiscrete_env - ) +from rllte.env.testing import make_box_env, make_discrete_env, make_multibinary_env, make_multidiscrete_env from rllte.xplore.reward import GIRM, ICM, NGU, RE3, REVD, RIDE, RISE, RND, PseudoCounts @@ -36,13 +32,7 @@ def test_reward(reward, env_cls, device): action = th.randint(0, 2, (num_steps, num_envs, env.action_space.n)).float().to(device) for i in range(num_steps): - irs.add( - samples={ - 'obs': obs[i], - 'actions': action[i], - 'next_obs': obs[i] - } - ) + 
irs.add(samples={"obs": obs[i], "actions": action[i], "next_obs": obs[i]}) samples = { "obs": obs, diff --git a/tests/test_storage.py b/tests/test_storage.py index 9cdb6872..476589aa 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1,7 +1,7 @@ import pytest import torch as th -from rllte.env.testing import make_box_env, make_bitflipping_env +from rllte.env.testing import make_bitflipping_env, make_box_env from rllte.xploit.storage import ( DictReplayStorage, DictRolloutStorage, @@ -22,7 +22,7 @@ VanillaRolloutStorage, DictReplayStorage, DictRolloutStorage, - HerReplayStorage + HerReplayStorage, ], ) @pytest.mark.parametrize("device", ["cpu", "cuda"]) @@ -45,7 +45,7 @@ def test_storage(storage_cls, device): device=device, num_envs=num_envs, storage_size=num_steps, - batch_size=batch_size + batch_size=batch_size, ) elif storage_cls is HerReplayStorage: storage = storage_cls( @@ -54,7 +54,7 @@ def test_storage(storage_cls, device): device=device, num_envs=num_envs, batch_size=batch_size, - reward_fn=lambda x, y, z: th.rand(size=(int(batch_size * 0.8), 1)) + reward_fn=lambda x, y, z: th.rand(size=(int(batch_size * 0.8), 1)), ) else: storage = storage_cls( @@ -62,7 +62,7 @@ def test_storage(storage_cls, device): action_space=env.action_space, device=device, num_envs=num_envs, - batch_size=batch_size + batch_size=batch_size, ) obs, infos = env.reset()
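# ----------------------------------------------------------------------------
# Illustrative sketch (not from the diff): the new EpisodicRolloutStorage is
# exported from rllte.xploit.storage but does not appear in the parametrized
# list of test_storage.py shown above. A minimal construction under the
# signature introduced in episodic_rollout_storage.py; the spaces and sizes
# here are example values, not taken from the patch.
import gymnasium as gym

from rllte.xploit.storage import EpisodicRolloutStorage

obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(9, 84, 84))
act_space = gym.spaces.Discrete(7)
storage = EpisodicRolloutStorage(
    observation_space=obs_space,
    action_space=act_space,
    device="cpu",
    storage_size=16,
    num_envs=8,
    num_batches=4,  # batch_size is derived as (num_envs * storage_size) // num_batches
)
# ----------------------------------------------------------------------------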