diff --git a/sota-implementations/cql/discrete_cql_config.yaml b/sota-implementations/cql/discrete_cql_config.yaml index 6db31a9aa81..a9fb9bfed0c 100644 --- a/sota-implementations/cql/discrete_cql_config.yaml +++ b/sota-implementations/cql/discrete_cql_config.yaml @@ -14,7 +14,7 @@ collector: multi_step: 0 init_random_frames: 1000 env_per_collector: 1 - device: cpu + device: max_frames_per_traj: 200 annealing_frames: 10000 eps_start: 1.0 diff --git a/sota-implementations/cql/online_config.yaml b/sota-implementations/cql/online_config.yaml index 5a8be9616a0..5c9e649f17f 100644 --- a/sota-implementations/cql/online_config.yaml +++ b/sota-implementations/cql/online_config.yaml @@ -15,7 +15,7 @@ collector: multi_step: 0 init_random_frames: 5_000 env_per_collector: 1 - device: cpu + device: max_frames_per_traj: 1000 diff --git a/sota-implementations/cql/utils.py b/sota-implementations/cql/utils.py index fa7ed3119b5..306f1cdb7f1 100644 --- a/sota-implementations/cql/utils.py +++ b/sota-implementations/cql/utils.py @@ -124,6 +124,12 @@ def make_collector( cudagraph=False, ): """Make collector.""" + device = cfg.collector.device + if device in ("", None): + if torch.cuda.is_available(): + device = torch.device("cuda:0") + else: + device = torch.device("cpu") collector = SyncDataCollector( train_env, actor_model_explore, @@ -131,7 +137,7 @@ def make_collector( frames_per_batch=cfg.collector.frames_per_batch, max_frames_per_traj=cfg.collector.max_frames_per_traj, total_frames=cfg.collector.total_frames, - device=cfg.collector.device, + device=device, compile_policy={"mode": compile_mode} if compile else False, cudagraph_policy=cudagraph, ) diff --git a/sota-implementations/gail/config.yaml b/sota-implementations/gail/config.yaml index cf6c8053037..2e057b08220 100644 --- a/sota-implementations/gail/config.yaml +++ b/sota-implementations/gail/config.yaml @@ -41,6 +41,11 @@ gail: gp_lambda: 10.0 device: null +compile: + compile: False + compile_mode: + cudagraphs: False + 
replay_buffer: dataset: halfcheetah-expert-v2 batch_size: 256 diff --git a/sota-implementations/gail/gail.py b/sota-implementations/gail/gail.py index b4856fa7d0d..39eaf3a929d 100644 --- a/sota-implementations/gail/gail.py +++ b/sota-implementations/gail/gail.py @@ -11,6 +11,8 @@ """ from __future__ import annotations +import warnings + import hydra import numpy as np import torch @@ -18,13 +20,15 @@ from gail_utils import log_metrics, make_gail_discriminator, make_offline_replay_buffer from ppo_utils import eval_model, make_env, make_ppo_models +from tensordict import TensorDict +from tensordict.nn import CudaGraphModule from torchrl.collectors import SyncDataCollector from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement from torchrl.envs import set_gym_backend from torchrl.envs.utils import ExplorationType, set_exploration_type -from torchrl.objectives import ClipPPOLoss, GAILLoss +from torchrl.objectives import ClipPPOLoss, GAILLoss, group_optimizers from torchrl.objectives.value.advantages import GAE from torchrl.record import VideoRecorder from torchrl.record.loggers import generate_exp_name, get_logger @@ -71,18 +75,8 @@ def main(cfg: "DictConfig"): # noqa: F821 np.random.seed(cfg.env.seed) # Create models (check utils_mujoco.py) - actor, critic = make_ppo_models(cfg.env.env_name) - actor, critic = actor.to(device), critic.to(device) - - # Create collector - collector = SyncDataCollector( - create_env_fn=make_env(cfg.env.env_name, device), - policy=actor, - frames_per_batch=cfg.ppo.collector.frames_per_batch, - total_frames=cfg.ppo.collector.total_frames, - device=device, - storing_device=device, - max_frames_per_traj=-1, + actor, critic = make_ppo_models( + cfg.env.env_name, compile=cfg.compile.compile, device=device ) # Create data buffer @@ -111,8 +105,36 @@ def main(cfg: "DictConfig"): # noqa: F821 ) # Create optimizers - actor_optim = 
torch.optim.Adam(actor.parameters(), lr=cfg.ppo.optim.lr, eps=1e-5) - critic_optim = torch.optim.Adam(critic.parameters(), lr=cfg.ppo.optim.lr, eps=1e-5) + actor_optim = torch.optim.Adam( + actor.parameters(), lr=torch.tensor(cfg.ppo.optim.lr, device=device), eps=1e-5 + ) + critic_optim = torch.optim.Adam( + critic.parameters(), lr=torch.tensor(cfg.ppo.optim.lr, device=device), eps=1e-5 + ) + optim = group_optimizers(actor_optim, critic_optim) + del actor_optim, critic_optim + + compile_mode = None + if cfg.compile.compile: + compile_mode = cfg.compile.compile_mode + if compile_mode in ("", None): + if cfg.compile.cudagraphs: + compile_mode = "default" + else: + compile_mode = "reduce-overhead" + + # Create collector + collector = SyncDataCollector( + create_env_fn=make_env(cfg.env.env_name, device), + policy=actor, + frames_per_batch=cfg.ppo.collector.frames_per_batch, + total_frames=cfg.ppo.collector.total_frames, + device=device, + storing_device=device, + max_frames_per_traj=-1, + compile_policy={"mode": compile_mode} if compile_mode is not None else False, + cudagraph_policy=cfg.compile.cudagraphs, + ) # Create replay buffer replay_buffer = make_offline_replay_buffer(cfg.replay_buffer) @@ -140,32 +162,9 @@ def main(cfg: "DictConfig"): # noqa: F821 VideoRecorder(logger, tag="rendering/test", in_keys=["pixels"]) ) test_env.eval() + num_network_updates = torch.zeros((), dtype=torch.int64, device=device) - # Training loop - collected_frames = 0 - num_network_updates = 0 - pbar = tqdm.tqdm(total=cfg.ppo.collector.total_frames) - - # extract cfg variables - cfg_loss_ppo_epochs = cfg.ppo.loss.ppo_epochs - cfg_optim_anneal_lr = cfg.ppo.optim.anneal_lr - cfg_optim_lr = cfg.ppo.optim.lr - cfg_loss_anneal_clip_eps = cfg.ppo.loss.anneal_clip_epsilon - cfg_loss_clip_epsilon = cfg.ppo.loss.clip_epsilon - cfg_logger_test_interval = cfg.logger.test_interval - cfg_logger_num_test_episodes = cfg.logger.num_test_episodes - - for i, data in enumerate(collector): - - log_info = {} 
- frames_in_batch = data.numel() - collected_frames += frames_in_batch - pbar.update(data.numel()) - - # Update discriminator - # Get expert data - expert_data = replay_buffer.sample() - expert_data = expert_data.to(device) + def update(data, expert_data, num_network_updates=num_network_updates): # Add collector data to expert data expert_data.set( discriminator_loss.tensor_keys.collector_action, @@ -178,9 +177,9 @@ def main(cfg: "DictConfig"): # noqa: F821 d_loss = discriminator_loss(expert_data) # Backward pass - discriminator_optim.zero_grad() d_loss.get("loss").backward() discriminator_optim.step() + discriminator_optim.zero_grad(set_to_none=True) # Compute discriminator reward with torch.no_grad(): @@ -190,40 +189,25 @@ def main(cfg: "DictConfig"): # noqa: F821 # Set discriminator rewards to tensordict data.set(("next", "reward"), d_rewards) - # Get training rewards and episode lengths - episode_rewards = data["next", "episode_reward"][data["next", "done"]] - if len(episode_rewards) > 0: - episode_length = data["next", "step_count"][data["next", "done"]] - log_info.update( - { - "train/reward": episode_rewards.mean().item(), - "train/episode_length": episode_length.sum().item() - / len(episode_length), - } - ) # Update PPO for _ in range(cfg_loss_ppo_epochs): - # Compute GAE with torch.no_grad(): data = adv_module(data) data_reshape = data.reshape(-1) # Update the data buffer + data_buffer.empty() data_buffer.extend(data_reshape) - for _, batch in enumerate(data_buffer): - - # Get a data batch - batch = batch.to(device) + for batch in data_buffer: + optim.zero_grad(set_to_none=True) # Linearly decrease the learning rate and clip epsilon - alpha = 1.0 + alpha = torch.ones((), device=device) if cfg_optim_anneal_lr: alpha = 1 - (num_network_updates / total_network_updates) - for group in actor_optim.param_groups: - group["lr"] = cfg_optim_lr * alpha - for group in critic_optim.param_groups: + for group in optim.param_groups: group["lr"] = cfg_optim_lr * alpha if 
cfg_loss_anneal_clip_eps: loss_module.clip_epsilon.copy_(cfg_loss_clip_epsilon * alpha) @@ -235,20 +219,68 @@ def main(cfg: "DictConfig"): # noqa: F821 actor_loss = loss["loss_objective"] + loss["loss_entropy"] # Backward pass - actor_loss.backward() - critic_loss.backward() + (actor_loss + critic_loss).backward() # Update the networks - actor_optim.step() - critic_optim.step() - actor_optim.zero_grad() - critic_optim.zero_grad() + optim.step() + return TensorDict(dloss=d_loss, alpha=alpha).detach() + + if cfg.compile.compile: + update = torch.compile(update, mode=compile_mode) + if cfg.compile.cudagraphs: + warnings.warn( + "CudaGraphModule is experimental and may lead to silently wrong results. Use with caution.", + category=UserWarning, + ) + update = CudaGraphModule(update, warmup=50) + + # Training loop + collected_frames = 0 + pbar = tqdm.tqdm(total=cfg.ppo.collector.total_frames) + + # extract cfg variables + cfg_loss_ppo_epochs = cfg.ppo.loss.ppo_epochs + cfg_optim_anneal_lr = cfg.ppo.optim.anneal_lr + cfg_optim_lr = cfg.ppo.optim.lr + cfg_loss_anneal_clip_eps = cfg.ppo.loss.anneal_clip_epsilon + cfg_loss_clip_epsilon = cfg.ppo.loss.clip_epsilon + cfg_logger_test_interval = cfg.logger.test_interval + cfg_logger_num_test_episodes = cfg.logger.num_test_episodes + + for i, data in enumerate(collector): + + log_info = {} + frames_in_batch = data.numel() + collected_frames += frames_in_batch + pbar.update(data.numel()) + + # Update discriminator + # Get expert data + expert_data = replay_buffer.sample() + expert_data = expert_data.to(device) + + metadata = update(data, expert_data) + d_loss = metadata["dloss"] + alpha = metadata["alpha"] + + # Get training rewards and episode lengths + episode_rewards = data["next", "episode_reward"][data["next", "done"]] + if len(episode_rewards) > 0: + episode_length = data["next", "step_count"][data["next", "done"]] + + log_info.update( + { + "train/reward": episode_rewards.mean().item(), + "train/episode_length": 
episode_length.sum().item() + / len(episode_length), + } + ) log_info.update( { - "train/actor_loss": actor_loss.item(), - "train/critic_loss": critic_loss.item(), - "train/discriminator_loss": d_loss["loss"].item(), + # "train/actor_loss": actor_loss.item(), + # "train/critic_loss": critic_loss.item(), + "train/discriminator_loss": d_loss["loss"], "train/lr": alpha * cfg_optim_lr, "train/clip_epsilon": ( alpha * cfg_loss_clip_epsilon diff --git a/sota-implementations/gail/ppo_utils.py b/sota-implementations/gail/ppo_utils.py index e7eb4534c45..6ba12acdf9c 100644 --- a/sota-implementations/gail/ppo_utils.py +++ b/sota-implementations/gail/ppo_utils.py @@ -43,7 +43,7 @@ def make_env(env_name="HalfCheetah-v4", device="cpu", from_pixels: bool = False) # -------------------------------------------------------------------- -def make_ppo_models_state(proof_environment): +def make_ppo_models_state(proof_environment, compile, device): # Define input shape input_shape = proof_environment.observation_spec["observation"].shape @@ -52,9 +52,10 @@ def make_ppo_models_state(proof_environment): num_outputs = proof_environment.action_spec_unbatched.shape[-1] distribution_class = TanhNormal distribution_kwargs = { - "low": proof_environment.action_spec_unbatched.space.low, - "high": proof_environment.action_spec_unbatched.space.high, + "low": proof_environment.action_spec_unbatched.space.low.to(device), + "high": proof_environment.action_spec_unbatched.space.high.to(device), "tanh_loc": False, + "safe_tanh": not compile, } # Define policy architecture @@ -63,6 +64,7 @@ def make_ppo_models_state(proof_environment): activation_class=torch.nn.Tanh, out_features=num_outputs, # predict only loc num_cells=[64, 64], + device=device, ) # Initialize policy weights @@ -87,7 +89,7 @@ def make_ppo_models_state(proof_environment): out_keys=["loc", "scale"], ), in_keys=["loc", "scale"], - spec=proof_environment.single_full_action_spec, + 
spec=proof_environment.full_action_spec_unbatched.to(device), distribution_class=distribution_class, distribution_kwargs=distribution_kwargs, return_log_prob=True, @@ -117,9 +119,11 @@ def make_ppo_models_state(proof_environment): return policy_module, value_module -def make_ppo_models(env_name): - proof_environment = make_env(env_name, device="cpu") - actor, critic = make_ppo_models_state(proof_environment) +def make_ppo_models(env_name, compile, device): + proof_environment = make_env(env_name, device=device) + actor, critic = make_ppo_models_state( + proof_environment, compile=compile, device=device + ) return actor, critic diff --git a/sota-implementations/iql/discrete_iql.yaml b/sota-implementations/iql/discrete_iql.yaml index 9245d4c4832..d28c02cf499 100644 --- a/sota-implementations/iql/discrete_iql.yaml +++ b/sota-implementations/iql/discrete_iql.yaml @@ -15,7 +15,7 @@ collector: total_frames: 20000 init_random_frames: 1000 env_per_collector: 1 - device: cpu + device: max_frames_per_traj: 200 # logger diff --git a/sota-implementations/iql/online_config.yaml b/sota-implementations/iql/online_config.yaml index 1f7bb361e6c..64ad7466192 100644 --- a/sota-implementations/iql/online_config.yaml +++ b/sota-implementations/iql/online_config.yaml @@ -15,7 +15,7 @@ collector: multi_step: 0 init_random_frames: 5000 env_per_collector: 1 - device: cpu + device: max_frames_per_traj: 200 # logger diff --git a/sota-implementations/iql/utils.py b/sota-implementations/iql/utils.py index b817d4345c1..261cb912de0 100644 --- a/sota-implementations/iql/utils.py +++ b/sota-implementations/iql/utils.py @@ -120,6 +120,12 @@ def make_environment(cfg, train_num_envs=1, eval_num_envs=1, logger=None): def make_collector(cfg, train_env, actor_model_explore): """Make collector.""" + device = cfg.collector.device + if device in ("", None): + if torch.cuda.is_available(): + device = torch.device("cuda:0") + else: + device = torch.device("cpu") collector = SyncDataCollector( train_env, 
actor_model_explore, @@ -127,7 +133,7 @@ def make_collector(cfg, train_env, actor_model_explore): init_random_frames=cfg.collector.init_random_frames, max_frames_per_traj=cfg.collector.max_frames_per_traj, total_frames=cfg.collector.total_frames, - device=cfg.collector.device, + device=device, ) collector.set_seed(cfg.env.seed) return collector diff --git a/sota-implementations/sac/config.yaml b/sota-implementations/sac/config.yaml index 29586f2e9a7..5cf531a3be2 100644 --- a/sota-implementations/sac/config.yaml +++ b/sota-implementations/sac/config.yaml @@ -12,7 +12,7 @@ collector: init_random_frames: 25000 frames_per_batch: 1000 init_env_steps: 1000 - device: cpu + device: env_per_collector: 1 reset_at_each_iter: False diff --git a/sota-implementations/sac/utils.py b/sota-implementations/sac/utils.py index 9760793c9cd..6d37f5ec3d8 100644 --- a/sota-implementations/sac/utils.py +++ b/sota-implementations/sac/utils.py @@ -107,13 +107,19 @@ def make_environment(cfg, logger=None): def make_collector(cfg, train_env, actor_model_explore): """Make collector.""" + device = cfg.collector.device + if device in ("", None): + if torch.cuda.is_available(): + device = torch.device("cuda:0") + else: + device = torch.device("cpu") collector = SyncDataCollector( train_env, actor_model_explore, init_random_frames=cfg.collector.init_random_frames, frames_per_batch=cfg.collector.frames_per_batch, total_frames=cfg.collector.total_frames, - device=cfg.collector.device, + device=device, ) collector.set_seed(cfg.env.seed) return collector diff --git a/sota-implementations/td3/config.yaml b/sota-implementations/td3/config.yaml index 7f7854b68b3..5bdf22ea6fa 100644 --- a/sota-implementations/td3/config.yaml +++ b/sota-implementations/td3/config.yaml @@ -13,7 +13,7 @@ collector: init_env_steps: 1000 frames_per_batch: 1000 reset_at_each_iter: False - device: cpu + device: env_per_collector: 1 num_workers: 1 diff --git a/sota-implementations/td3/utils.py b/sota-implementations/td3/utils.py 
index a9bc8140291..df81a522b3c 100644 --- a/sota-implementations/td3/utils.py +++ b/sota-implementations/td3/utils.py @@ -118,6 +118,12 @@ def make_environment(cfg, logger=None): def make_collector(cfg, train_env, actor_model_explore): """Make collector.""" + device = cfg.collector.device + if device in ("", None): + if torch.cuda.is_available(): + device = torch.device("cuda:0") + else: + device = torch.device("cpu") collector = SyncDataCollector( train_env, actor_model_explore, @@ -125,7 +131,7 @@ def make_collector(cfg, train_env, actor_model_explore): frames_per_batch=cfg.collector.frames_per_batch, total_frames=cfg.collector.total_frames, reset_at_each_iter=cfg.collector.reset_at_each_iter, - device=cfg.collector.device, + device=device, ) collector.set_seed(cfg.env.seed) return collector