From f396dfc7b834f8a16a40b94ebb01f2a689062685 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Thu, 9 Jul 2020 18:13:41 -0400 Subject: [PATCH 01/35] initial commit with sac agent --- easyrl/agents/ppo_agent.py | 14 ++--- easyrl/agents/sac_agent.py | 77 +++++++++++++++++++++++++-- easyrl/configs/sac_config.py | 7 ++- easyrl/models/diag_gaussian_policy.py | 15 ++++-- easyrl/models/mlp.py | 3 ++ easyrl/replays/circular_buffer.py | 24 ++++----- easyrl/runner/episodic_runner.py | 4 -- easyrl/runner/rnn_runner.py | 6 +-- easyrl/runner/step_runner.py | 77 +++++++++++++++++++++++++++ easyrl/utils/torch_util.py | 53 ++++++++---------- 10 files changed, 213 insertions(+), 67 deletions(-) create mode 100644 easyrl/runner/step_runner.py diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index b0d0bb0..2c13f83 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -19,14 +19,16 @@ from easyrl.utils.torch_util import load_torch_model from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np +from easyrl.utils.torch_util import move_to class PPOAgent(BaseAgent): def __init__(self, actor, critic, same_body=False): self.actor = actor self.critic = critic - self.actor.to(ppo_cfg.device) - self.critic.to(ppo_cfg.device) + move_to([self.actor, self.critic], + device=ppo_cfg.device) + self.same_body = same_body if ppo_cfg.vf_loss_type == 'mse': self.val_loss_criterion = nn.MSELoss().to(ppo_cfg.device) @@ -38,7 +40,8 @@ def __init__(self, actor, critic, same_body=False): # keep unique elements only. The following code works for python >=3.7 # for earlier version of python, u need to use OrderedDict self.all_params = dict.fromkeys(all_params).keys() - if ppo_cfg.max_steps > ppo_cfg.max_decay_steps: + if (ppo_cfg.linear_decay_lr or ppo_cfg.linear_decay_clip_range) and \ + ppo_cfg.max_steps > ppo_cfg.max_decay_steps: raise ValueError('max_steps should be no greater than max_decay_steps.') total_epochs = int(np.ceil(ppo_cfg.max_decay_steps / (ppo_cfg.num_envs * ppo_cfg.episode_steps))) @@ -76,12 +79,11 @@ def __init__(self, actor, critic, same_body=False): total_epochs=total_epochs) self.lr_scheduler = LambdaLR(optimizer=self.optimizer, lr_lambda=[p_lr_lambda, v_lr_lambda]) - self.in_training = False @torch.no_grad() def get_action(self, ob, sample=True, *args, **kwargs): self.eval_mode() - t_ob = torch.from_numpy(ob).float().to(ppo_cfg.device) + t_ob = torch_float(ob, device=ppo_cfg.device) act_dist, val = self.get_act_val(t_ob) action = action_from_dist(act_dist, sample=sample) @@ -192,12 +194,10 @@ def cal_val_loss(self, val, old_val, ret): return vf_loss def train_mode(self): - self.in_training = True self.actor.train() self.critic.train() def eval_mode(self): - self.in_training = False self.actor.eval() self.critic.eval() diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index 7649388..cbe8cc1 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -1,6 +1,77 @@ from easyrl.agents.base_agent import BaseAgent - +from easyrl.configs.sac_config import sac_cfg +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from torch.optim.lr_scheduler import LambdaLR +from copy import deepcopy +from easyrl.utils.torch_util import freeze_model +from easyrl.utils.torch_util import move_to +import itertools +from easyrl.utils.torch_util import torch_float +from easyrl.utils.torch_util import action_from_dist +from easyrl.utils.torch_util import action_log_prob +from 
easyrl.utils.torch_util import torch_to_np class SACAgent(BaseAgent): - def __init__(self, actor, critic, same_body=False): - pass + def __init__(self, actor, q1, q2): + self.actor = actor + self.q1 = q1 + self.q2 = q2 + self.q1_tgt = deepcopy(self.q1) + self.q2_tgt = deepcopy(self.q2) + freeze_model(self.q1_tgt) + freeze_model(self.q2_tgt) + self.q1_tgt.eval() + self.q2_tgt.eval() + + move_to([self.actor, self.q1, self.q2, self.q1_tgt, self.q2_tgt], + device=sac_cfg.device) + + optim_args = dict( + lr=sac_cfg.actor_lr, + weight_decay=sac_cfg.weight_decay, + amsgrad=sac_cfg.use_amsgrad + ) + + self.p_optimizer = optim.Adam(self.actor.parameters(), + **optim_args) + q_params = itertools.chain(self.q1.parameters(), self.q2.parameters()) + optim_args['lr'] = sac_cfg.critic_lr + self.q_optimizer = optim.Adam(q_params, **optim_args) + + @torch.no_grad() + def get_action(self, ob, sample=True, *args, **kwargs): + self.eval_mode() + ob = torch_float(ob, device=sac_cfg.device) + act_dist = self.actor(ob)[0] + action = action_from_dist(act_dist, + sample=sample) + action_info = dict() + return torch_to_np(action), action_info + + @torch.no_grad() + def get_val(self, ob, action, tgt=False, q1=True, *args, **kwargs): + self.eval_mode() + ob = torch_float(ob, device=sac_cfg.device) + action = torch_float(action, device=sac_cfg.device) + idx = 1 if q1 else 2 + tgt_suffix = '_tgt' if tgt else '' + q_func = getattr(self, f'q{idx}{tgt_suffix}') + val = q_func((ob, action))[0] + val = val.squeeze(-1) + return val + + + + def train_mode(self): + self.actor.train() + self.q1.train() + self.q2.train() + + def eval_mode(self): + self.actor.eval() + self.q1.eval() + self.q2.eval() + diff --git a/easyrl/configs/sac_config.py b/easyrl/configs/sac_config.py index 0a5c69b..ef80349 100644 --- a/easyrl/configs/sac_config.py +++ b/easyrl/configs/sac_config.py @@ -5,4 +5,9 @@ @dataclass class SACConfig(BasicConfig): - actor_lr: float = 1e-4 + actor_lr: float = 1e-3 + critic_lr: float = 1e-3 + use_amsgrad: bool = True + + +sac_cfg = SACConfig() diff --git a/easyrl/models/diag_gaussian_policy.py b/easyrl/models/diag_gaussian_policy.py index a4fd12f..5dff04f 100644 --- a/easyrl/models/diag_gaussian_policy.py +++ b/easyrl/models/diag_gaussian_policy.py @@ -4,8 +4,10 @@ from torch.distributions import Normal from torch.distributions import TransformedDistribution -from easyrl.utils.torch_util import TanhTransform +from torch.distributions.transforms import TanhTransform +LOG_STD_MAX = 2 +LOG_STD_MIN = -20 class DiagGaussianPolicy(nn.Module): def __init__(self, @@ -14,11 +16,13 @@ def __init__(self, init_log_std=-0.51, std_cond_in=False, tanh_on_dist=False, - in_features=None): # add tanh on the action distribution + in_features=None, + clamp_log_std=False): # add tanh on the action distribution super().__init__() self.std_cond_in = std_cond_in self.tanh_on_dist = tanh_on_dist self.body = body_net + self.clamp_log_std = clamp_log_std if in_features is None: for i in reversed(range(len(self.body.fcs))): @@ -39,11 +43,14 @@ def forward(self, x=None, body_x=None, **kwargs): raise ValueError('One of [x, body_x] should be provided!') if body_x is None: body_x = self.body(x, **kwargs) - mean = self.head_mean(body_x) + body_out = body_x[0] if isinstance(body_x, tuple) else body_x + mean = self.head_mean(body_out) if self.std_cond_in: - log_std = self.head_logstd(body_x) + log_std = self.head_logstd(body_out) else: log_std = self.head_logstd.expand_as(mean) + if self.clamp_log_std: + log_std = torch.clamp(log_std, LOG_STD_MIN, 
LOG_STD_MAX) std = torch.exp(log_std) action_dist = Independent(Normal(loc=mean, scale=std), 1) if self.tanh_on_dist: diff --git a/easyrl/models/mlp.py b/easyrl/models/mlp.py index ccdb280..8ca34e6 100644 --- a/easyrl/models/mlp.py +++ b/easyrl/models/mlp.py @@ -1,3 +1,4 @@ +import torch import torch.nn as nn from torch.nn.utils.spectral_norm import spectral_norm @@ -40,6 +41,8 @@ def __init__(self, self.fcs.append(output_act()) def forward(self, x): + if isinstance(x, tuple) or isinstance(x, list): + x = torch.cat(x, dim=-1) for i, layer in enumerate(self.fcs): x = layer(x) return x diff --git a/easyrl/replays/circular_buffer.py b/easyrl/replays/circular_buffer.py index 3467907..75aec73 100644 --- a/easyrl/replays/circular_buffer.py +++ b/easyrl/replays/circular_buffer.py @@ -1,7 +1,7 @@ import numpy as np -class RingBuffer(object): +class RingBuffer: def __init__(self, maxlen, shape, dtype='float32'): self.maxlen = maxlen self.start = 0 @@ -39,20 +39,20 @@ def array_min2d(x): return x.reshape(-1, 1) -class RBMemory(object): +class CircularMemory: def __init__(self, limit, action_shape, observation_shape): self.limit = limit - self.observations0 = RingBuffer(limit, shape=observation_shape) + self.obs0 = RingBuffer(limit, shape=observation_shape) self.actions = RingBuffer(limit, shape=action_shape) self.rewards = RingBuffer(limit, shape=(1,)) self.terminals = RingBuffer(limit, shape=(1,)) - self.observations1 = RingBuffer(limit, shape=observation_shape) + self.obs1 = RingBuffer(limit, shape=observation_shape) def sample(self, batch_size): - batch_idxs = np.random.randint(0, self.nb_entries, size=batch_size) + batch_idxs = np.random.randint(0, len(self), size=batch_size) - obs0_batch = self.observations0.get_batch(batch_idxs) - obs1_batch = self.observations1.get_batch(batch_idxs) + obs0_batch = self.obs0.get_batch(batch_idxs) + obs1_batch = self.obs1.get_batch(batch_idxs) action_batch = self.actions.get_batch(batch_idxs) reward_batch = self.rewards.get_batch(batch_idxs) terminal1_batch = self.terminals.get_batch(batch_idxs) @@ -69,12 +69,12 @@ def sample(self, batch_size): def append(self, obs0, action, reward, obs1, terminal, training=True): if not training: return - self.observations0.append(obs0) + self.obs0.append(obs0) self.actions.append(action) self.rewards.append(reward) - self.observations1.append(obs1) + self.obs1.append(obs1) self.terminals.append(terminal) - @property - def nb_entries(self): - return len(self.observations0) + def __len__(self): + return len(self.obs0) + diff --git a/easyrl/runner/episodic_runner.py b/easyrl/runner/episodic_runner.py index bc7f1de..9b6b803 100644 --- a/easyrl/runner/episodic_runner.py +++ b/easyrl/runner/episodic_runner.py @@ -11,10 +11,6 @@ class EpisodicRunner(BasicRunner): - def __init__(self, agent, env, eval_env=None): - super().__init__(agent=agent, - env=env, eval_env=eval_env) - @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, diff --git a/easyrl/runner/rnn_runner.py b/easyrl/runner/rnn_runner.py index 36ea58f..7d540bd 100644 --- a/easyrl/runner/rnn_runner.py +++ b/easyrl/runner/rnn_runner.py @@ -11,10 +11,6 @@ class RNNRunner(BasicRunner): - def __init__(self, agent, env, eval_env=None): - super().__init__(agent=agent, - env=env, eval_env=eval_env) - @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, @@ -49,7 +45,7 @@ def __call__(self, time_steps, sample=True, 
evaluation=False, imgs = deepcopy(env.get_images()) ## TODO add masks on hidden state so that hidden state from - ## previous episode does get passed to the next episode after done=True + ## previous episode does not get passed to the next episode after done=True action, action_info, hidden_state = self.agent.get_action(ob['ob'], sample=sample, hidden_state=hidden_state, diff --git a/easyrl/runner/step_runner.py b/easyrl/runner/step_runner.py new file mode 100644 index 0000000..8eb3eef --- /dev/null +++ b/easyrl/runner/step_runner.py @@ -0,0 +1,77 @@ +import time +from copy import deepcopy + +import numpy as np +import torch +from gym.wrappers.time_limit import TimeLimit +from easyrl.runner.base_runner import BasicRunner +from easyrl.utils.data import StepData +from easyrl.utils.data import Trajectory +from easyrl.utils.torch_util import torch_to_np + +class StepRunner(BasicRunner): + # Simulate the environment for T steps, + # and in the next call, the environment will continue + # from where it's left in the previous call. + # only single env (no parallel envs) is supported for now. + # we also assume the environment is wrapped by TimeLimit + # from https://github.com/openai/gym/blob/master/gym/wrappers/time_limit.py + def __init__(self, agent, env, eval_env=None): + super().__init__(agent=agent, + env=env, + eval_env=eval_env) + self.step_data = None + if not (isinstance(env, TimeLimit) and isinstance(eval_env, TimeLimit)): + raise TypeError('Please add TimeLimit wrapper on the environment.') + + @torch.no_grad() + def __call__(self, time_steps, sample=True, evaluation=False, + return_on_done=False, render=False, render_image=False, + sleep_time=0, reset_kwargs=None, action_kwargs=None): + traj = Trajectory() + if reset_kwargs is None: + reset_kwargs = {} + if action_kwargs is None: + action_kwargs = {} + if evaluation: + env = self.eval_env + else: + env = self.train_env + if self.step_data is None or evaluation: + ob = env.reset(**reset_kwargs) + else: + ob = self.step_data.ob + ob = deepcopy(ob) + for t in range(time_steps): + if render: + env.render() + if sleep_time > 0: + time.sleep(sleep_time) + if render_image: + # get render images at the same time step as ob + imgs = deepcopy(env.get_images()) + + action, action_info = self.agent.get_action(ob, + sample=sample, + **action_kwargs) + next_ob, reward, done, info = env.step(action) + next_ob = deepcopy(next_ob) + if render_image: + for img, inf in zip(imgs, info): + inf['render_image'] = deepcopy(img) + + sd = StepData(ob=ob, + action=deepcopy(action), + action_info=deepcopy(action_info), + next_ob=next_ob, + reward=deepcopy(reward), + done=deepcopy(done) and not info.get('TimeLimit.truncated', + False), + info=deepcopy(info)) + ob = next_ob + traj.add(sd) + if return_on_done and done: + break + if done: + ob = deepcopy(env.reset(**reset_kwargs)) + self.step_data = deepcopy(traj[-1]) diff --git a/easyrl/utils/torch_util.py b/easyrl/utils/torch_util.py index 2dd145a..b807f87 100644 --- a/easyrl/utils/torch_util.py +++ b/easyrl/utils/torch_util.py @@ -28,6 +28,19 @@ def hard_update(target, source): target.load_state_dict(source.state_dict()) +def freeze_model(model): + for param in model.parameters(): + param.requires_grad = False + + +def move_to(models, device): + if isinstance(models, list): + for model in models: + model.to(device) + else: + models.to(device) + + def load_torch_model(model_file): logger.info(f'Loading model from {model_file}') if isinstance(model_file, str): @@ -76,6 +89,15 @@ def torch_long(array, 
device='cpu'): return torch.LongTensor(array).to(device) +def torch_bool(array, device='cpu'): + if isinstance(array, torch.Tensor): + return array.bool().to(device) + elif isinstance(array, np.ndarray): + return torch.from_numpy(array).bool().to(device) + elif isinstance(array, list): + return torch.BoolTensor(array).to(device) + + def action_from_dist(action_dist, sample=True): if isinstance(action_dist, Categorical): if sample: @@ -198,37 +220,6 @@ def preprocess(x): jac = jac[0] return jac - -class TanhTransform(Transform): - r""" - Transform via the mapping :math:`y = \tanh(x)`. - """ - domain = constraints.real - codomain = constraints.interval(-1.0, 1.0) - bijective = True - sign = +1 - - @staticmethod - def atanh(x): - return 0.5 * (x.log1p() - (-x).log1p()) - - def __eq__(self, other): - return isinstance(other, TanhTransform) - - def _call(self, x): - return x.tanh() - - def _inverse(self, y): - eps = torch.finfo(y.dtype).eps - return self.atanh(y.clamp(min=-1. + eps, max=1. - eps)) - - def log_abs_det_jacobian(self, x, y): - # We use a formula that is more numerically stable, - # see details in the following link - # https://github.com/tensorflow/probability/commit/ef6bb176e0ebd1cf6e25c6b5cecdd2428c22963f#diff-e120f70e92e6741bca649f04fcd907b7 - return 2. * (math.log(2.) - x - softplus(-2. * x)) - - def cosine_similarity(x1, x2): """ From 290135ae065860da05e3df42a20f8596dcf3240b Mon Sep 17 00:00:00 2001 From: taochenshh Date: Fri, 10 Jul 2020 22:31:27 -0400 Subject: [PATCH 02/35] initial working version of sac --- easyrl/agents/ppo_agent.py | 64 +++------- easyrl/agents/sac_agent.py | 189 ++++++++++++++++++++++++++++-- easyrl/configs/basic_config.py | 12 +- easyrl/configs/ppo_config.py | 2 - easyrl/configs/sac_config.py | 7 ++ easyrl/engine/basic_engine.py | 6 + easyrl/engine/ppo_engine.py | 7 +- easyrl/engine/sac_engine.py | 167 ++++++++++++++++++++++++++ easyrl/replays/circular_buffer.py | 93 ++++----------- easyrl/runner/step_runner.py | 68 +++++++---- easyrl/utils/common.py | 13 +- easyrl/utils/data.py | 10 +- easyrl/utils/gym_util.py | 24 ++++ easyrl/utils/torch_util.py | 100 +++++++++++++--- examples/ppo.py | 6 +- examples/sac.py | 80 +++++++++++++ 16 files changed, 652 insertions(+), 196 deletions(-) create mode 100644 examples/sac.py diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 2c13f83..1853586 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -1,5 +1,4 @@ from functools import partial -from pathlib import Path import numpy as np import torch @@ -14,12 +13,13 @@ from easyrl.utils.torch_util import action_entropy from easyrl.utils.torch_util import action_from_dist from easyrl.utils.torch_util import action_log_prob -from easyrl.utils.torch_util import get_latest_ckpt +from easyrl.utils.torch_util import get_grad_norm +from easyrl.utils.torch_util import load_ckpt_data from easyrl.utils.torch_util import load_state_dict -from easyrl.utils.torch_util import load_torch_model +from easyrl.utils.torch_util import move_to +from easyrl.utils.torch_util import save_model from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np -from easyrl.utils.torch_util import move_to class PPOAgent(BaseAgent): @@ -129,10 +129,13 @@ def optimize(self, data, *args, **kwargs): loss, pg_loss, vf_loss, ratio = loss_res self.optimizer.zero_grad() loss.backward() - grad_norm = None + if ppo_cfg.max_grad_norm is not None: grad_norm = torch.nn.utils.clip_grad_norm_(self.all_params, ppo_cfg.max_grad_norm) + 
grad_norm = grad_norm.item() + else: + grad_norm = get_grad_norm(self.all_params) self.optimizer.step() with torch.no_grad(): approx_kl = 0.5 * torch.mean(torch.pow(old_log_prob - log_prob, 2)) @@ -145,8 +148,7 @@ def optimize(self, data, *args, **kwargs): approx_kl=approx_kl.item(), clip_frac=clip_frac ) - if grad_norm is not None: - optim_info['grad_norm'] = grad_norm + optim_info['grad_norm'] = grad_norm return optim_info def optim_preprocess(self, data): @@ -218,22 +220,6 @@ def decay_clip_range(self): ppo_cfg.clip_range -= self.clip_range_decay_rate def save_model(self, is_best=False, step=None): - if not ppo_cfg.save_best_only and step is not None: - ckpt_file = ppo_cfg.model_dir \ - .joinpath('ckpt_{:012d}.pt'.format(step)) - else: - ckpt_file = None - if is_best: - best_model_file = ppo_cfg.model_dir \ - .joinpath('model_best.pt') - else: - best_model_file = None - - if not ppo_cfg.save_best_only: - saved_model_files = sorted(ppo_cfg.model_dir.glob('*.pt')) - if len(saved_model_files) > ppo_cfg.max_saved_models: - saved_model_files[0].unlink() - data_to_save = { 'step': step, 'actor_state_dict': self.actor.state_dict(), @@ -245,39 +231,17 @@ def save_model(self, is_best=False, step=None): if ppo_cfg.linear_decay_clip_range: data_to_save['clip_range'] = ppo_cfg.clip_range data_to_save['clip_range_decay_rate'] = self.clip_range_decay_rate - logger.info(f'Exploration steps: {step}') - for fl in [ckpt_file, best_model_file]: - if fl is not None: - logger.info(f'Saving checkpoint: {fl}.') - torch.save(data_to_save, fl) + save_model(data_to_save, ppo_cfg, is_best=is_best, step=step) def load_model(self, step=None, pretrain_model=None): - if pretrain_model is not None: - # if the pretrain_model is the path of the folder - # that contains the checkpoint files, then it will - # load the most recent one. 
- if isinstance(pretrain_model, str): - pretrain_model = Path(pretrain_model) - if pretrain_model.suffix != '.pt': - pretrain_model = get_latest_ckpt(pretrain_model) - ckpt_data = load_torch_model(pretrain_model) - load_state_dict(self.actor, - ckpt_data['actor_state_dict']) - load_state_dict(self.critic, - ckpt_data['critic_state_dict']) - return - if step is None: - ckpt_file = Path(ppo_cfg.model_dir) \ - .joinpath('model_best.pt') - else: - ckpt_file = Path(ppo_cfg.model_dir) \ - .joinpath('ckpt_{:012d}.pt'.format(step)) - - ckpt_data = load_torch_model(ckpt_file) + ckpt_data = load_ckpt_data(ppo_cfg, step=step, + pretrain_model=pretrain_model) load_state_dict(self.actor, ckpt_data['actor_state_dict']) load_state_dict(self.critic, ckpt_data['critic_state_dict']) + if pretrain_model is not None: + return self.optimizer.load_state_dict(ckpt_data['optim_state_dict']) self.lr_scheduler.load_state_dict(ckpt_data['lr_scheduler_state_dict']) if ppo_cfg.linear_decay_clip_range: diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index cbe8cc1..dc43fd8 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -1,21 +1,29 @@ -from easyrl.agents.base_agent import BaseAgent -from easyrl.configs.sac_config import sac_cfg -import numpy as np +from copy import deepcopy + import torch import torch.nn as nn +import torch.nn.functional as F import torch.optim as optim -from torch.optim.lr_scheduler import LambdaLR -from copy import deepcopy + +from easyrl.agents.base_agent import BaseAgent +from easyrl.configs.sac_config import sac_cfg +from easyrl.utils.gym_util import num_space_dim +from easyrl.utils.torch_util import action_from_dist +from easyrl.utils.torch_util import action_log_prob from easyrl.utils.torch_util import freeze_model +from easyrl.utils.torch_util import get_grad_norm +from easyrl.utils.torch_util import load_ckpt_data +from easyrl.utils.torch_util import load_state_dict from easyrl.utils.torch_util import move_to -import itertools +from easyrl.utils.torch_util import save_model +from easyrl.utils.torch_util import soft_update from easyrl.utils.torch_util import torch_float -from easyrl.utils.torch_util import action_from_dist -from easyrl.utils.torch_util import action_log_prob from easyrl.utils.torch_util import torch_to_np +from easyrl.utils.torch_util import unfreeze_model + class SACAgent(BaseAgent): - def __init__(self, actor, q1, q2): + def __init__(self, actor, q1, q2, env): self.actor = actor self.q1 = q1 self.q2 = q2 @@ -35,11 +43,27 @@ def __init__(self, actor, q1, q2): amsgrad=sac_cfg.use_amsgrad ) - self.p_optimizer = optim.Adam(self.actor.parameters(), - **optim_args) - q_params = itertools.chain(self.q1.parameters(), self.q2.parameters()) + self.pi_optimizer = optim.Adam(self.actor.parameters(), + **optim_args) + q_params = list(self.q1.parameters()) + list(self.q2.parameters()) + # keep unique elements only. 
+ self.q_params = dict.fromkeys(q_params).keys() optim_args['lr'] = sac_cfg.critic_lr - self.q_optimizer = optim.Adam(q_params, **optim_args) + self.q_optimizer = optim.Adam(self.q_params, **optim_args) + if sac_cfg.alpha is None: + self.tgt_entropy = -float(num_space_dim(env.action_space)) + self.log_alpha = nn.Parameter(torch.zeros(1, device=sac_cfg.device)) + self.alpha_optimizer = optim.Adam( + [self.log_alpha], + lr=sac_cfg.actor_lr, + ) + + @property + def alpha(self): + if sac_cfg.alpha is None: + return self.log_alpha.exp().item() + else: + return sac_cfg.alpha @torch.no_grad() def get_action(self, ob, sample=True, *args, **kwargs): @@ -63,7 +87,108 @@ def get_val(self, ob, action, tgt=False, q1=True, *args, **kwargs): val = val.squeeze(-1) return val + def optimize(self, data, *args, **kwargs): + self.train_mode() + for key, val in data.items(): + try: + data[key] = torch_float(val, device=sac_cfg.device) + except: + from IPython import embed + embed() + obs = data['obs'].squeeze(1) + actions = data['actions'].squeeze(1) + next_obs = data['next_obs'].squeeze(1) + rewards = data['rewards'] + dones = data['dones'] + q_info = self.update_q(obs=obs, + actions=actions, + next_obs=next_obs, + rewards=rewards, + dones=dones) + pi_info = self.update_pi(obs=obs) + alpha_info = self.update_alpha(pi_info['pi_entropy']) + optim_info = {**q_info, **pi_info, **alpha_info} + optim_info['alpha'] = self.alpha + if hasattr(self, 'log_alpha'): + optim_info['log_alpha'] = self.log_alpha.item() + + soft_update(self.q1_tgt, self.q1, sac_cfg.polyak) + soft_update(self.q2_tgt, self.q2, sac_cfg.polyak) + return optim_info + + def update_q(self, obs, actions, next_obs, rewards, dones): + q1 = self.q1((obs, actions))[0] + q2 = self.q2((obs, actions))[0] + with torch.no_grad(): + next_act_dist = self.actor(next_obs)[0] + next_actions = action_from_dist(next_act_dist, + sample=True) + nlog_prob = action_log_prob(next_actions, next_act_dist).unsqueeze(-1) + nq1_tgt_val = self.q1_tgt((next_obs, next_actions))[0] + nq2_tgt_val = self.q2_tgt((next_obs, next_actions))[0] + nq_tgt_val = torch.min(nq1_tgt_val, nq2_tgt_val) + q_tgt_val = rewards + sac_cfg.rew_discount * (1 - dones) * (nq_tgt_val - self.alpha * nlog_prob) + loss_q1 = F.mse_loss(q1, q_tgt_val) + loss_q2 = F.mse_loss(q2, q_tgt_val) + loss_q = loss_q1 + loss_q2 + self.q_optimizer.zero_grad() + loss_q.backward() + if sac_cfg.max_grad_norm is not None: + grad_norm = torch.nn.utils.clip_grad_norm_(self.q_params, + sac_cfg.max_grad_norm) + grad_norm = grad_norm.item() + else: + grad_norm = get_grad_norm(self.q_params) + self.q_optimizer.step() + q_info = dict( + q1_loss=loss_q1.item(), + q2_loss=loss_q2.item(), + q1_val=torch_to_np(q1), + q2_val=torch_to_np(q2) + ) + q_info['q_grad_norm'] = grad_norm + return q_info + def update_pi(self, obs): + freeze_model([self.q1, self.q2]) + act_dist = self.actor(obs)[0] + new_actions = action_from_dist(act_dist, + sample=True) + new_log_prob = action_log_prob(new_actions, act_dist).unsqueeze(-1) + new_q1 = self.q1((obs, new_actions))[0] + new_q2 = self.q2((obs, new_actions))[0] + new_q = torch.min(new_q1, new_q2) + + loss_pi = (self.alpha * new_log_prob - new_q).mean() + self.q_optimizer.zero_grad() + self.pi_optimizer.zero_grad() + loss_pi.backward() + if sac_cfg.max_grad_norm is not None: + grad_norm = torch.nn.utils.clip_grad_norm_(self.actor.parameters(), + sac_cfg.max_grad_norm) + grad_norm = grad_norm.item() + else: + grad_norm = get_grad_norm(self.actor.parameters()) + self.pi_optimizer.step() + pi_info = dict( 
+ pi_loss=loss_pi.item(), + pi_entropy=-new_log_prob.mean().item() + ) + pi_info['pi_grad_norm'] = grad_norm + unfreeze_model([self.q1, self.q2]) + return pi_info + + def update_alpha(self, pi_entropy): + if sac_cfg.alpha is not None: + return dict() + alpha_loss = self.log_alpha.exp() * (pi_entropy - self.tgt_entropy) + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + alpha_info = dict( + alpha_loss=alpha_loss.item() + ) + return alpha_info def train_mode(self): self.actor.train() @@ -75,3 +200,41 @@ def eval_mode(self): self.q1.eval() self.q2.eval() + def save_model(self, is_best=False, step=None): + data_to_save = { + 'step': step, + 'actor_state_dict': self.actor.state_dict(), + 'q1_state_dict': self.q1.state_dict(), + 'q1_tgt_state_dict': self.q1_tgt.state_dict(), + 'q2_state_dict': self.q2.state_dict(), + 'q2_tgt_state_dict': self.q2_tgt.state_dict(), + 'pi_optim_state_dict': self.pi_optimizer.state_dict(), + 'q_optim_state_dict': self.q_optimizer.state_dict(), + } + if sac_cfg.alpha is None: + data_to_save['log_alpha'] = self.log_alpha + data_to_save['alpha_optim_state_dict'] = self.alpha_optimizer.state_dict() + save_model(data_to_save, sac_cfg, is_best=is_best, step=step) + + def load_model(self, step=None, pretrain_model=None): + ckpt_data = load_ckpt_data(sac_cfg, step=step, + pretrain_model=pretrain_model) + load_state_dict(self.actor, + ckpt_data['actor_state_dict']) + load_state_dict(self.q1, + ckpt_data['q1_state_dict']) + load_state_dict(self.q1_tgt, + ckpt_data['q1_tgt_state_dict']) + load_state_dict(self.q2, + ckpt_data['q2_state_dict']) + load_state_dict(self.q2_tgt, + ckpt_data['q2_tgt_state_dict']) + if sac_cfg.alpha is None: + self.log_alpha = ckpt_data['log_alpha'] + if pretrain_model is not None: + return + self.pi_optimizer.load_state_dict(ckpt_data['pi_optim_state_dict']) + self.q_optimizer.load_state_dict(ckpt_data['q_optim_state_dict']) + if sac_cfg.alpha is None: + self.alpha_optimizer.load_state_dict(ckpt_data['alpha_optim_state_dict']) + return ckpt_data['step'] diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index f91fd5e..c3ebf55 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -10,7 +10,7 @@ @dataclass class BasicConfig: - env_id: str = None + env_name: str = None seed: int = 1 device: str = 'cuda' save_dir: str = 'data' @@ -18,8 +18,10 @@ class BasicConfig: log_interval: int = 10 weight_decay: float = 0.00 max_grad_norm: float = None - batch_size: int = 32 + batch_size: int = 128 save_best_only: bool = False + episode_steps: int = 1000 + max_steps: int = 1e6 smooth_eval_tau: float = 0.70 max_saved_models: int = 2 test: bool = False @@ -51,9 +53,9 @@ def data_dir(self): else: return data_dir.joinpath(f'seed_{self.seed}') data_dir = Path.cwd().joinpath(self.save_dir) - if self.env_id is not None: - data_dir = data_dir.joinpath(self.env_id) - skip_params = ['env_id', + if self.env_name is not None: + data_dir = data_dir.joinpath(self.env_name) + skip_params = ['env_name', 'save_dir', 'resume', 'resume_step', diff --git a/easyrl/configs/ppo_config.py b/easyrl/configs/ppo_config.py index 1365f76..dce7423 100644 --- a/easyrl/configs/ppo_config.py +++ b/easyrl/configs/ppo_config.py @@ -23,8 +23,6 @@ class PPOConfig(BasicConfig): linear_decay_clip_range: bool = True gae_lambda: float = 0.95 rew_discount: float = 0.99 - max_steps: int = 1e6 - episode_steps: int = 1000 use_amsgrad: bool = True sgd: bool = False momentum: float = 0.00 diff --git 
a/easyrl/configs/sac_config.py b/easyrl/configs/sac_config.py index ef80349..fe9b820 100644 --- a/easyrl/configs/sac_config.py +++ b/easyrl/configs/sac_config.py @@ -7,7 +7,14 @@ class SACConfig(BasicConfig): actor_lr: float = 1e-3 critic_lr: float = 1e-3 + warmup_steps: int = 10000 use_amsgrad: bool = True + opt_interval: int = 50 # perform optimization every n environment steps + opt_num: int = 50 # how many optimization loops in every optimization stage + alpha: float = None + rew_discount: float = 0.99 + replay_size: int = 1000000 + polyak: float = 0.995 sac_cfg = SACConfig() diff --git a/easyrl/engine/basic_engine.py b/easyrl/engine/basic_engine.py index d05d983..6c1c45b 100644 --- a/easyrl/engine/basic_engine.py +++ b/easyrl/engine/basic_engine.py @@ -1,7 +1,13 @@ +import numpy as np + + class BasicEngine: def __init__(self, agent, runner, **kwargs): self.agent = agent self.runner = runner + self.cur_step = 0 + self._best_eval_ret = -np.inf + self._eval_is_best = False def train(self, **kwargs): raise NotImplementedError diff --git a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index 191bbea..ded202b 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -21,9 +21,6 @@ class PPOEngine(BasicEngine): def __init__(self, agent, runner): super().__init__(agent=agent, runner=runner) - self.cur_step = 0 - self._best_eval_ret = -np.inf - self._eval_is_best = False if ppo_cfg.test or ppo_cfg.resume: self.cur_step = self.agent.load_model(step=ppo_cfg.resume_step) else: @@ -39,7 +36,7 @@ def __init__(self, agent, runner): def train(self): for iter_t in count(): - traj, rollout_time = self.rollout_once(sample=ppo_cfg.sample_action, + traj, rollout_time = self.rollout_once(sample=True, time_steps=ppo_cfg.episode_steps) train_log_info = self.train_once(traj) if iter_t % ppo_cfg.eval_interval == 0: @@ -192,6 +189,4 @@ def get_train_log(self, optim_infos, traj): train_log_info = dict() for key, val in log_info.items(): train_log_info['train/' + key] = val - # histogram_log = {'histogram': {'rollout_action': traj.actions}} - # self.tf_logger.save_dict(histogram_log, step=self.cur_step) return train_log_info diff --git a/easyrl/engine/sac_engine.py b/easyrl/engine/sac_engine.py index e69de29..7c6c498 100644 --- a/easyrl/engine/sac_engine.py +++ b/easyrl/engine/sac_engine.py @@ -0,0 +1,167 @@ +import time +from collections import deque +from copy import deepcopy +from itertools import count + +import numpy as np +import torch +from tqdm import tqdm + +from easyrl.configs.sac_config import sac_cfg +from easyrl.engine.basic_engine import BasicEngine +from easyrl.utils.common import get_list_stats +from easyrl.utils.common import save_traj +from easyrl.utils.data import Trajectory +from easyrl.utils.rl_logger import TensorboardLogger + + +class SACEngine(BasicEngine): + def __init__(self, agent, runner, memory): + super().__init__(agent=agent, + runner=runner) + self.memory = memory + if sac_cfg.test or sac_cfg.resume: + self.cur_step = self.agent.load_model(step=sac_cfg.resume_step) + else: + if sac_cfg.pretrain_model is not None: + self.agent.load_model(pretrain_model=sac_cfg.pretrain_model) + sac_cfg.create_model_log_dir() + self.train_ep_return = deque(maxlen=100) + self.smooth_eval_return = None + self.smooth_tau = sac_cfg.smooth_eval_tau + self.optim_stime = None + if not sac_cfg.test: + self.tf_logger = TensorboardLogger(log_dir=sac_cfg.log_dir) + + def train(self): + if len(self.memory) < sac_cfg.warmup_steps: + self.runner.reset() + traj, _ = 
self.rollout_once(random_action=True, + time_steps=sac_cfg.warmup_steps - len(self.memory)) + self.add_traj_to_memory(traj) + self.runner.reset() + for iter_t in count(): + traj, rollout_time = self.rollout_once(sample=True, + time_steps=sac_cfg.opt_interval) + self.add_traj_to_memory(traj) + train_log_info = self.train_once() + if iter_t % sac_cfg.eval_interval == 0: + det_log_info, _ = self.eval(eval_num=sac_cfg.test_num, sample=False) + sto_log_info, _ = self.eval(eval_num=sac_cfg.test_num, sample=True) + det_log_info = {f'det/{k}': v for k, v in det_log_info.items()} + sto_log_info = {f'sto/{k}': v for k, v in sto_log_info.items()} + eval_log_info = {**det_log_info, **sto_log_info} + self.agent.save_model(is_best=self._eval_is_best, + step=self.cur_step) + else: + eval_log_info = None + if iter_t % sac_cfg.log_interval == 0: + train_log_info['train/rollout_time'] = rollout_time + if eval_log_info is not None: + train_log_info.update(eval_log_info) + scalar_log = {'scalar': train_log_info} + self.tf_logger.save_dict(scalar_log, step=self.cur_step) + if self.cur_step > sac_cfg.max_steps: + break + + @torch.no_grad() + def eval(self, render=False, save_eval_traj=False, sample=True, + eval_num=1, sleep_time=0, smooth=True, no_tqdm=None): + time_steps = [] + rets = [] + lst_step_infos = [] + if no_tqdm: + disable_tqdm = bool(no_tqdm) + else: + disable_tqdm = not sac_cfg.test + for idx in tqdm(range(eval_num), disable=disable_tqdm): + traj, _ = self.rollout_once(time_steps=sac_cfg.episode_steps, + return_on_done=True, + sample=sac_cfg.sample_action and sample, + render=render, + sleep_time=sleep_time, + render_image=save_eval_traj, + evaluation=True) + tsps = traj.steps_til_done.copy().tolist() + rewards = traj.raw_rewards + infos = traj.infos + for ej in range(traj.num_envs): + ret = np.sum(rewards[:tsps[ej], ej]) + rets.append(ret) + lst_step_infos.append(infos[tsps[ej] - 1][ej]) + time_steps.extend(tsps) + if save_eval_traj: + save_traj(traj, sac_cfg.eval_dir) + + raw_traj_info = {'return': rets, + 'episode_length': time_steps, + 'lst_step_info': lst_step_infos} + log_info = dict() + for key, val in raw_traj_info.items(): + if 'info' in key: + continue + val_stats = get_list_stats(val) + for sk, sv in val_stats.items(): + log_info['eval/' + key + '/' + sk] = sv + if smooth: + if self.smooth_eval_return is None: + self.smooth_eval_return = log_info['eval/return/mean'] + else: + self.smooth_eval_return = self.smooth_eval_return * self.smooth_tau + self.smooth_eval_return += (1 - self.smooth_tau) * log_info['eval/return/mean'] + log_info['eval/smooth_return/mean'] = self.smooth_eval_return + if self.smooth_eval_return > self._best_eval_ret: + self._eval_is_best = True + self._best_eval_ret = self.smooth_eval_return + else: + self._eval_is_best = False + return log_info, raw_traj_info + + def rollout_once(self, *args, **kwargs): + t0 = time.perf_counter() + self.agent.eval_mode() + traj = self.runner(**kwargs) + t1 = time.perf_counter() + elapsed_time = t1 - t0 + return traj, elapsed_time + + def train_once(self): + self.optim_stime = time.perf_counter() + optim_infos = [] + for oe in range(sac_cfg.opt_num): + sampled_data = self.memory.sample(batch_size=sac_cfg.batch_size) + sampled_data = Trajectory(traj_data=sampled_data) + batch_data = dict( + obs=sampled_data.obs, + next_obs=sampled_data.next_obs, + actions=sampled_data.actions, + dones=sampled_data.dones, + rewards=sampled_data.rewards + ) + optim_info = self.agent.optimize(batch_data) + optim_infos.append(optim_info) + return 
self.get_train_log(optim_infos) + + def get_train_log(self, optim_infos): + log_info = dict() + for key in optim_infos[0].keys(): + if 'val' in key: + continue + log_info[key] = np.mean([inf[key] for inf in optim_infos if key in inf]) + + for key in ['q1_val', 'q2_val']: + k_stats = get_list_stats([inf[key] for inf in optim_infos if key in inf]) + for sk, sv in k_stats.items(): + log_info[f'{key}/' + sk] = sv + + t1 = time.perf_counter() + log_info['optim_time'] = t1 - self.optim_stime + train_log_info = dict() + for key, val in log_info.items(): + train_log_info['train/' + key] = val + return train_log_info + + def add_traj_to_memory(self, traj): + for sd in traj.traj_data: + self.memory.append(deepcopy(sd)) + self.cur_step += traj.total_steps diff --git a/easyrl/replays/circular_buffer.py b/easyrl/replays/circular_buffer.py index 75aec73..89e12dc 100644 --- a/easyrl/replays/circular_buffer.py +++ b/easyrl/replays/circular_buffer.py @@ -1,80 +1,35 @@ -import numpy as np +import random +from copy import deepcopy -class RingBuffer: - def __init__(self, maxlen, shape, dtype='float32'): - self.maxlen = maxlen - self.start = 0 - self.length = 0 - self.data = np.zeros((maxlen,) + shape).astype(dtype) +class CyclicBuffer: + def __init__(self, capacity): + self.buffer = [] + self.capacity = capacity + self.cur_pos = 0 def __len__(self): - return self.length + return len(self.buffer) - def __getitem__(self, idx): - if idx < 0 or idx >= self.length: - raise KeyError() - return self.data[(self.start + idx) % self.maxlen] + def __getitem__(self, item): + return self.buffer[item] - def get_batch(self, idxs): - return self.data[(self.start + idxs) % self.maxlen] - - def append(self, v): - if self.length < self.maxlen: - # We have space, simply increase the length. - self.length += 1 - elif self.length == self.maxlen: - # No space, "remove" the first item. - self.start = (self.start + 1) % self.maxlen + def append(self, data): + if len(self.buffer) < self.capacity: + self.buffer.append(data) else: - # This should never happen. 
- raise RuntimeError() - self.data[(self.start + self.length - 1) % self.maxlen] = v - - -def array_min2d(x): - x = np.array(x) - if x.ndim >= 2: - return x - return x.reshape(-1, 1) - - -class CircularMemory: - def __init__(self, limit, action_shape, observation_shape): - self.limit = limit - self.obs0 = RingBuffer(limit, shape=observation_shape) - self.actions = RingBuffer(limit, shape=action_shape) - self.rewards = RingBuffer(limit, shape=(1,)) - self.terminals = RingBuffer(limit, shape=(1,)) - self.obs1 = RingBuffer(limit, shape=observation_shape) + self.buffer[self.cur_pos] = data + self.cur_pos = int((self.cur_pos + 1) % self.capacity) def sample(self, batch_size): - batch_idxs = np.random.randint(0, len(self), size=batch_size) - - obs0_batch = self.obs0.get_batch(batch_idxs) - obs1_batch = self.obs1.get_batch(batch_idxs) - action_batch = self.actions.get_batch(batch_idxs) - reward_batch = self.rewards.get_batch(batch_idxs) - terminal1_batch = self.terminals.get_batch(batch_idxs) - - result = { - 'obs0': array_min2d(obs0_batch), - 'obs1': array_min2d(obs1_batch), - 'rewards': array_min2d(reward_batch), - 'actions': array_min2d(action_batch), - 'terminals': array_min2d(terminal1_batch), - } - return result - - def append(self, obs0, action, reward, obs1, terminal, training=True): - if not training: - return - self.obs0.append(obs0) - self.actions.append(action) - self.rewards.append(reward) - self.obs1.append(obs1) - self.terminals.append(terminal) + if batch_size == len(self.buffer): + return deepcopy(self.buffer) + else: + data = random.sample(self.buffer, batch_size) + return deepcopy(data) - def __len__(self): - return len(self.obs0) + def get_all(self): + return deepcopy(self.buffer) + def clear(self): + self.buffer.clear() diff --git a/easyrl/runner/step_runner.py b/easyrl/runner/step_runner.py index 8eb3eef..9510593 100644 --- a/easyrl/runner/step_runner.py +++ b/easyrl/runner/step_runner.py @@ -1,13 +1,14 @@ import time from copy import deepcopy -import numpy as np import torch from gym.wrappers.time_limit import TimeLimit + from easyrl.runner.base_runner import BasicRunner +from easyrl.utils.common import list_to_numpy from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory -from easyrl.utils.torch_util import torch_to_np + class StepRunner(BasicRunner): # Simulate the environment for T steps, @@ -16,18 +17,21 @@ class StepRunner(BasicRunner): # only single env (no parallel envs) is supported for now. 
# we also assume the environment is wrapped by TimeLimit # from https://github.com/openai/gym/blob/master/gym/wrappers/time_limit.py - def __init__(self, agent, env, eval_env=None): + def __init__(self, agent, env, eval_env=None, max_steps=None): super().__init__(agent=agent, env=env, eval_env=eval_env) - self.step_data = None + self.cur_ob = None + self.max_steps = max_steps + self.cur_step = 0 if not (isinstance(env, TimeLimit) and isinstance(eval_env, TimeLimit)): raise TypeError('Please add TimeLimit wrapper on the environment.') @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, - sleep_time=0, reset_kwargs=None, action_kwargs=None): + sleep_time=0, reset_kwargs=None, + action_kwargs=None, random_action=False): traj = Trajectory() if reset_kwargs is None: reset_kwargs = {} @@ -37,10 +41,11 @@ def __call__(self, time_steps, sample=True, evaluation=False, env = self.eval_env else: env = self.train_env - if self.step_data is None or evaluation: + if self.cur_ob is None or evaluation: ob = env.reset(**reset_kwargs) + self.cur_step = 0 else: - ob = self.step_data.ob + ob = self.cur_ob ob = deepcopy(ob) for t in range(time_steps): if render: @@ -50,28 +55,47 @@ def __call__(self, time_steps, sample=True, evaluation=False, if render_image: # get render images at the same time step as ob imgs = deepcopy(env.get_images()) - - action, action_info = self.agent.get_action(ob, - sample=sample, - **action_kwargs) + if random_action: + action = env.action_space.sample() + action_info = dict() + else: + action, action_info = self.agent.get_action(ob, + sample=sample, + **action_kwargs) next_ob, reward, done, info = env.step(action) + self.cur_step += 1 next_ob = deepcopy(next_ob) if render_image: for img, inf in zip(imgs, info): inf['render_image'] = deepcopy(img) - - sd = StepData(ob=ob, - action=deepcopy(action), - action_info=deepcopy(action_info), - next_ob=next_ob, - reward=deepcopy(reward), - done=deepcopy(done) and not info.get('TimeLimit.truncated', - False), - info=deepcopy(info)) + true_done = done and not info.get('TimeLimit.truncated', + False) + sd = StepData(ob=list_to_numpy(deepcopy(ob), + expand_dims=0), + action=list_to_numpy(deepcopy(action), + expand_dims=0), + action_info=[deepcopy(action_info)], + next_ob=list_to_numpy(deepcopy(next_ob), + expand_dims=0), + reward=list_to_numpy(reward), + done=list_to_numpy(true_done), + info=[deepcopy(info)]) ob = next_ob traj.add(sd) if return_on_done and done: break - if done: + need_reset = done + if self.max_steps is not None: + need_reset = need_reset or self.cur_step > self.max_steps + if need_reset: ob = deepcopy(env.reset(**reset_kwargs)) - self.step_data = deepcopy(traj[-1]) + self.cur_step = 0 + self.cur_ob = deepcopy(ob) + return traj + + def reset(self, reset_kwargs=None): + if reset_kwargs is None: + reset_kwargs = {} + ob = self.train_env.reset(**reset_kwargs) + self.cur_step = 0 + self.cur_ob = deepcopy(ob) diff --git a/easyrl/utils/common.py b/easyrl/utils/common.py index cf8350f..97d6f53 100644 --- a/easyrl/utils/common.py +++ b/easyrl/utils/common.py @@ -1,4 +1,5 @@ import json +import numbers import pickle as pkl import random import shutil @@ -16,10 +17,18 @@ def set_random_seed(seed): np.random.seed(seed) torch.manual_seed(seed) random.seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) + torch.cuda.manual_seed_all(seed) +def list_to_numpy(data, expand_dims=None): + if isinstance(data, numbers.Number): + data = 
np.array([data]) + else: + data = np.array(data) + if expand_dims is not None: + data = np.expand_dims(data, axis=expand_dims) + return data + def save_traj(traj, save_dir): if isinstance(save_dir, str): save_dir = Path(save_dir) diff --git a/easyrl/utils/data.py b/easyrl/utils/data.py index e012cf4..c995c79 100644 --- a/easyrl/utils/data.py +++ b/easyrl/utils/data.py @@ -13,12 +13,12 @@ class StepData: state: Any = None action: Any = None # store action infomation such as log probability, entropy - action_info: Dict = None + action_info: Any = None next_ob: Any = None next_state: Any = None - reward: float = None - done: bool = None - info: Dict = None + reward: Any = None + done: Any = None + info: Any = None def __post_init__(self): """ @@ -115,7 +115,7 @@ def infos(self): @property def total_steps(self): - return self.traj_data[0].action.shape[0] * len(self.traj_data) + return len(self.traj_data[0].action) * len(self.traj_data) @property def num_envs(self): diff --git a/easyrl/utils/gym_util.py b/easyrl/utils/gym_util.py index ba03f46..de59910 100644 --- a/easyrl/utils/gym_util.py +++ b/easyrl/utils/gym_util.py @@ -1,4 +1,11 @@ import gym +import numpy as np +from gym.spaces import Box +from gym.spaces import Dict +from gym.spaces import Discrete +from gym.spaces import MultiBinary +from gym.spaces import MultiDiscrete +from gym.spaces import Tuple from easyrl.envs.dummy_vec_env import DummyVecEnv from easyrl.envs.shmem_vec_env import ShmemVecEnv @@ -6,6 +13,23 @@ from easyrl.utils.rl_logger import logger +def num_space_dim(space): + if isinstance(space, Box): + return int(np.prod(space.shape)) + elif isinstance(space, Discrete): + return int(space.n) + elif isinstance(space, Tuple): + return int(sum([num_space_dim(s) for s in space.spaces])) + elif isinstance(space, Dict): + return int(sum([num_space_dim(s) for s in space.spaces.values()])) + elif isinstance(space, MultiBinary): + return int(space.n) + elif isinstance(space, MultiDiscrete): + return int(np.prod(space.shape)) + else: + raise NotImplementedError + + def make_vec_env(env_id, num_envs, seed=1, no_timeout=True, env_kwargs=None): logger.info(f'Creating {num_envs} environments.') if env_kwargs is None: diff --git a/easyrl/utils/torch_util.py b/easyrl/utils/torch_util.py index b807f87..32af0d6 100644 --- a/easyrl/utils/torch_util.py +++ b/easyrl/utils/torch_util.py @@ -1,4 +1,3 @@ -import math import re from pathlib import Path @@ -8,10 +7,7 @@ import torch.nn.functional as F from torch.distributions import Categorical from torch.distributions import Independent -from torch.distributions import Transform from torch.distributions import TransformedDistribution -from torch.distributions import constraints -from torch.nn.functional import softplus from torch.utils.data import Dataset from easyrl.utils.rl_logger import logger @@ -20,7 +16,7 @@ def soft_update(target, source, tau): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_( - target_param.data * (1.0 - tau) + param.data * tau + target_param.data * tau + param.data * (1.0 - tau) ) @@ -29,8 +25,21 @@ def hard_update(target, source): def freeze_model(model): - for param in model.parameters(): - param.requires_grad = False + if isinstance(model, list) or isinstance(model, tuple): + for md in model: + freeze_model(md) + else: + for param in model.parameters(): + param.requires_grad = False + + +def unfreeze_model(model): + if isinstance(model, list) or isinstance(model, tuple): + for md in model: + unfreeze_model(md) + else: + for 
param in model.parameters(): + param.requires_grad = True def move_to(models, device): @@ -41,6 +50,38 @@ def move_to(models, device): models.to(device) +def get_grad_norm(model): + total_norm = 0 + iterator = model.parameters() if isinstance(model, nn.Module) else model + for p in iterator: + total_norm += p.grad.data.pow(2).sum().item() + total_norm = total_norm ** 0.5 + return total_norm + + +def save_model(data, cfg, is_best=False, step=None): + if not cfg.save_best_only and step is not None: + ckpt_file = cfg.model_dir \ + .joinpath('ckpt_{:012d}.pt'.format(step)) + else: + ckpt_file = None + if is_best: + best_model_file = cfg.model_dir.joinpath('model_best.pt') + else: + best_model_file = None + + if not cfg.save_best_only: + saved_model_files = sorted(cfg.model_dir.glob('*.pt')) + if len(saved_model_files) > cfg.max_saved_models: + saved_model_files[0].unlink() + + logger.info(f'Exploration steps: {step}') + for fl in [ckpt_file, best_model_file]: + if fl is not None: + logger.info(f'Saving checkpoint: {fl}.') + torch.save(data, fl) + + def load_torch_model(model_file): logger.info(f'Loading model from {model_file}') if isinstance(model_file, str): @@ -65,6 +106,26 @@ def load_state_dict(model, pretrained_dict): model.load_state_dict(model_dict) +def load_ckpt_data(cfg, step=None, pretrain_model=None): + if pretrain_model is not None: + # if the pretrain_model is the path of the folder + # that contains the checkpoint files, then it will + # load the most recent one. + if isinstance(pretrain_model, str): + pretrain_model = Path(pretrain_model) + if pretrain_model.suffix != '.pt': + pretrain_model = get_latest_ckpt(pretrain_model) + ckpt_data = load_torch_model(pretrain_model) + return ckpt_data + if step is None: + ckpt_file = Path(cfg.model_dir).joinpath('model_best.pt') + else: + ckpt_file = Path(cfg.model_dir).joinpath('ckpt_{:012d}.pt'.format(step)) + + ckpt_data = load_torch_model(ckpt_file) + return ckpt_data + + def torch_to_np(tensor): if not isinstance(tensor, torch.Tensor): raise TypeError('tensor has to be a torch tensor!') @@ -111,8 +172,13 @@ def action_from_dist(action_dist, sample=True): return action_dist.mean elif isinstance(action_dist, TransformedDistribution): if not sample: - raise TypeError('Deterministic sampling is not ' - 'defined for transformed distribution!') + if isinstance(action_dist.base_dist, Independent): + out = action_dist.base_dist.mean + out = action_dist.transforms[0](out) + return out + else: + raise TypeError('Deterministic sampling is not ' + 'defined for transformed distribution!') if action_dist.has_rsample: return action_dist.rsample() else: @@ -123,17 +189,12 @@ def action_from_dist(action_dist, sample=True): def action_log_prob(action, action_dist): - if isinstance(action_dist, Categorical): - log_prob = action_dist.log_prob(action) - return log_prob - elif isinstance(action_dist, - Independent) or isinstance(action_dist, - TransformedDistribution): + try: log_prob = action_dist.log_prob(action) - return log_prob - else: - raise TypeError('Getting log_prob of actions for the given ' - 'distribution is not implemented!') + except NotImplementedError: + raise NotImplementedError('Getting log_prob of actions for the ' + 'given distribution is not implemented!') + return log_prob def action_entropy(action_dist, log_prob=None): @@ -220,6 +281,7 @@ def preprocess(x): jac = jac[0] return jac + def cosine_similarity(x1, x2): """ diff --git a/examples/ppo.py b/examples/ppo.py index f37b68c..ff607ff 100644 --- a/examples/ppo.py +++ 
b/examples/ppo.py @@ -26,10 +26,10 @@ def main(): else: skip_params = [] ppo_cfg.restore_cfg(skip_params=skip_params) - if ppo_cfg.env_id is None: - ppo_cfg.env_id = 'Hopper-v2' + if ppo_cfg.env_name is None: + ppo_cfg.env_name = 'HalfCheetah-v2' set_random_seed(ppo_cfg.seed) - env = make_vec_env(ppo_cfg.env_id, + env = make_vec_env(ppo_cfg.env_name, ppo_cfg.num_envs, seed=ppo_cfg.seed) env.reset() diff --git a/examples/sac.py b/examples/sac.py new file mode 100644 index 0000000..94974e9 --- /dev/null +++ b/examples/sac.py @@ -0,0 +1,80 @@ +import gym +import torch.nn as nn + +from easyrl.agents.sac_agent import SACAgent +from easyrl.configs.command_line import cfg_from_cmd +from easyrl.configs.sac_config import sac_cfg +from easyrl.engine.sac_engine import SACEngine +from easyrl.replays.circular_buffer import CyclicBuffer +from easyrl.models.diag_gaussian_policy import DiagGaussianPolicy +from easyrl.models.mlp import MLP +from easyrl.models.value_net import ValueNet +from easyrl.runner.step_runner import StepRunner +from easyrl.utils.common import set_random_seed + + +def main(): + cfg_from_cmd(sac_cfg) + if sac_cfg.resume or sac_cfg.test: + if sac_cfg.test: + skip_params = [ + 'test_num', + 'num_envs', + 'sample_action', + ] + else: + skip_params = [] + sac_cfg.restore_cfg(skip_params=skip_params) + if sac_cfg.env_name is None: + sac_cfg.env_name = 'HalfCheetah-v2' + if not sac_cfg.test: + sac_cfg.test_num = 10 + set_random_seed(sac_cfg.seed) + env = gym.make(sac_cfg.env_name) + env.seed(sac_cfg.seed) + eval_env = gym.make(sac_cfg.env_name) + ob_size = env.observation_space.shape[0] + act_size = env.action_space.shape[0] + + actor_body = MLP(input_size=ob_size, + hidden_sizes=[256], + output_size=256, + hidden_act=nn.ReLU, + output_act=nn.ReLU) + q1_body = MLP(input_size=ob_size + act_size, + hidden_sizes=[256], + output_size=256, + hidden_act=nn.ReLU, + output_act=nn.ReLU) + q2_body = MLP(input_size=ob_size + act_size, + hidden_sizes=[256], + output_size=256, + hidden_act=nn.ReLU, + output_act=nn.ReLU) + actor = DiagGaussianPolicy(actor_body, action_dim=act_size, + tanh_on_dist=True, + std_cond_in=True, + clamp_log_std=True) + + q1 = ValueNet(q1_body) + q2 = ValueNet(q2_body) + agent = SACAgent(actor, q1=q1, q2=q2, env=env) + runner = StepRunner(agent=agent, env=env, eval_env=eval_env) + memory = CyclicBuffer(capacity=sac_cfg.replay_size) + engine = SACEngine(agent=agent, + runner=runner, + memory=memory) + if not sac_cfg.test: + engine.train() + else: + stat_info, raw_traj_info = engine.eval(render=sac_cfg.render, + save_eval_traj=sac_cfg.save_test_traj, + eval_num=sac_cfg.test_num, + sleep_time=0.04) + import pprint + pprint.pprint(stat_info) + env.close() + + +if __name__ == '__main__': + main() From c65011b8c761f2348453d12ef564f462ff4fa6b9 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Sat, 11 Jul 2020 09:41:35 -0400 Subject: [PATCH 03/35] save replay buffer --- easyrl/agents/sac_agent.py | 34 +++++++++++++++++++++++++--------- easyrl/configs/basic_config.py | 8 ++++---- easyrl/configs/sac_config.py | 1 + easyrl/engine/sac_engine.py | 19 ++++++++++--------- examples/run_test.sh | 20 ++++++++++++++++++++ examples/sac.py | 11 ++++++----- 6 files changed, 66 insertions(+), 27 deletions(-) create mode 100755 examples/run_test.sh diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index dc43fd8..db03409 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -20,13 +20,17 @@ from easyrl.utils.torch_util import torch_float from 
easyrl.utils.torch_util import torch_to_np from easyrl.utils.torch_util import unfreeze_model - +from easyrl.utils.common import save_to_pickle +from easyrl.utils.rl_logger import logger +from easyrl.utils.common import load_from_pickle +import pickle class SACAgent(BaseAgent): - def __init__(self, actor, q1, q2, env): + def __init__(self, actor, q1, q2, env, memory): self.actor = actor self.q1 = q1 self.q2 = q2 + self.memory = memory self.q1_tgt = deepcopy(self.q1) self.q2_tgt = deepcopy(self.q2) freeze_model(self.q1_tgt) @@ -36,7 +40,7 @@ def __init__(self, actor, q1, q2, env): move_to([self.actor, self.q1, self.q2, self.q1_tgt, self.q2_tgt], device=sac_cfg.device) - + self.mem_file = sac_cfg.model_dir.joinpath('mem.pkl') optim_args = dict( lr=sac_cfg.actor_lr, weight_decay=sac_cfg.weight_decay, @@ -51,7 +55,10 @@ def __init__(self, actor, q1, q2, env): optim_args['lr'] = sac_cfg.critic_lr self.q_optimizer = optim.Adam(self.q_params, **optim_args) if sac_cfg.alpha is None: - self.tgt_entropy = -float(num_space_dim(env.action_space)) + if sac_cfg.tgt_entropy is None: + self.tgt_entropy = -float(num_space_dim(env.action_space)) + else: + self.tgt_entropy = sac_cfg.tgt_entropy self.log_alpha = nn.Parameter(torch.zeros(1, device=sac_cfg.device)) self.alpha_optimizer = optim.Adam( [self.log_alpha], @@ -90,11 +97,7 @@ def get_val(self, ob, action, tgt=False, q1=True, *args, **kwargs): def optimize(self, data, *args, **kwargs): self.train_mode() for key, val in data.items(): - try: - data[key] = torch_float(val, device=sac_cfg.device) - except: - from IPython import embed - embed() + data[key] = torch_float(val, device=sac_cfg.device) obs = data['obs'].squeeze(1) actions = data['actions'].squeeze(1) next_obs = data['next_obs'].squeeze(1) @@ -215,6 +218,9 @@ def save_model(self, is_best=False, step=None): data_to_save['log_alpha'] = self.log_alpha data_to_save['alpha_optim_state_dict'] = self.alpha_optimizer.state_dict() save_model(data_to_save, sac_cfg, is_best=is_best, step=step) + logger.info(f'Saving the replay buffer to: {self.mem_file}.') + save_to_pickle(self.memory, self.mem_file) + logger.info('The replay buffer is saved.') def load_model(self, step=None, pretrain_model=None): ckpt_data = load_ckpt_data(sac_cfg, step=step, pretrain_model=pretrain_model) @@ -237,4 +243,14 @@ def load_model(self, step=None, pretrain_model=None): self.q_optimizer.load_state_dict(ckpt_data['q_optim_state_dict']) if sac_cfg.alpha is None: self.alpha_optimizer.load_state_dict(ckpt_data['alpha_optim_state_dict']) + + logger.info(f'Loading the replay buffer from: {self.mem_file}.') + if not self.mem_file.exists(): + logger.warning('The replay buffer file was not found!') + else: + try: + self.memory = load_from_pickle(self.mem_file) + except pickle.UnpicklingError: + logger.warning('The replay buffer file is corrupted, hence not loaded!') + return ckpt_data['step'] diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index c3ebf55..9468226 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -11,7 +11,7 @@ @dataclass class BasicConfig: env_name: str = None - seed: int = 1 + seed: int = 0 device: str = 'cuda' save_dir: str = 'data' eval_interval: int = 100 @@ -65,6 +65,7 @@ def data_dir(self): 'eval_interval', 'render', 'seed', + 'max_steps', 'pretrain_model'] if hasattr(self, 'diff_cfg'): if 'test' in self.diff_cfg: @@ -81,10 +82,9 @@ def data_dir(self): else: path_name += f'_{key}_{val}' data_dir = data_dir.joinpath(path_name) - data_dir = data_dir.joinpath(f'seed_{self.seed}') else:
- data_dir = data_dir.joinpath(f'seed_{self.seed}') - + data_dir = data_dir.joinpath('default') + data_dir = data_dir.joinpath(f'seed_{self.seed}') return data_dir @property diff --git a/easyrl/configs/sac_config.py b/easyrl/configs/sac_config.py index fe9b820..b26560c 100644 --- a/easyrl/configs/sac_config.py +++ b/easyrl/configs/sac_config.py @@ -15,6 +15,7 @@ class SACConfig(BasicConfig): rew_discount: float = 0.99 replay_size: int = 1000000 polyak: float = 0.995 + tgt_entropy: float = None sac_cfg = SACConfig() diff --git a/easyrl/engine/sac_engine.py b/easyrl/engine/sac_engine.py index 7c6c498..6dc943f 100644 --- a/easyrl/engine/sac_engine.py +++ b/easyrl/engine/sac_engine.py @@ -16,10 +16,9 @@ class SACEngine(BasicEngine): - def __init__(self, agent, runner, memory): + def __init__(self, agent, runner): super().__init__(agent=agent, runner=runner) - self.memory = memory if sac_cfg.test or sac_cfg.resume: self.cur_step = self.agent.load_model(step=sac_cfg.resume_step) else: @@ -34,10 +33,10 @@ def __init__(self, agent, runner, memory): self.tf_logger = TensorboardLogger(log_dir=sac_cfg.log_dir) def train(self): - if len(self.memory) < sac_cfg.warmup_steps: + if len(self.agent.memory) < sac_cfg.warmup_steps: self.runner.reset() traj, _ = self.rollout_once(random_action=True, - time_steps=sac_cfg.warmup_steps - len(self.memory)) + time_steps=sac_cfg.warmup_steps - len(self.agent.memory)) self.add_traj_to_memory(traj) self.runner.reset() for iter_t in count(): @@ -46,8 +45,10 @@ def train(self): self.add_traj_to_memory(traj) train_log_info = self.train_once() if iter_t % sac_cfg.eval_interval == 0: - det_log_info, _ = self.eval(eval_num=sac_cfg.test_num, sample=False) - sto_log_info, _ = self.eval(eval_num=sac_cfg.test_num, sample=True) + det_log_info, _ = self.eval(eval_num=sac_cfg.test_num, + sample=False, smooth=True) + sto_log_info, _ = self.eval(eval_num=sac_cfg.test_num, + sample=True, smooth=False) det_log_info = {f'det/{k}': v for k, v in det_log_info.items()} sto_log_info = {f'sto/{k}': v for k, v in sto_log_info.items()} eval_log_info = {**det_log_info, **sto_log_info} @@ -129,7 +130,7 @@ def train_once(self): self.optim_stime = time.perf_counter() optim_infos = [] for oe in range(sac_cfg.opt_num): - sampled_data = self.memory.sample(batch_size=sac_cfg.batch_size) + sampled_data = self.agent.memory.sample(batch_size=sac_cfg.batch_size) sampled_data = Trajectory(traj_data=sampled_data) batch_data = dict( obs=sampled_data.obs, @@ -148,7 +149,7 @@ def get_train_log(self, optim_infos): if 'val' in key: continue log_info[key] = np.mean([inf[key] for inf in optim_infos if key in inf]) - + for key in ['q1_val', 'q2_val']: k_stats = get_list_stats([inf[key] for inf in optim_infos if key in inf]) for sk, sv in k_stats.items(): @@ -163,5 +164,5 @@ def get_train_log(self, optim_infos): def add_traj_to_memory(self, traj): for sd in traj.traj_data: - self.memory.append(deepcopy(sd)) + self.agent.memory.append(deepcopy(sd)) self.cur_step += traj.total_steps diff --git a/examples/run_test.sh b/examples/run_test.sh new file mode 100755 index 0000000..faade31 --- /dev/null +++ b/examples/run_test.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +trap "exit" INT TERM ERR +trap "kill 0" EXIT + +# CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --alpha=0.1 --max_steps=3000000& +# CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --alpha=0.2 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=0 python 
sac.py --env_name=Hopper-v3 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Hopper-v3 --alpha=0.1 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Hopper-v3 --alpha=0.2 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --alpha=0.1 --max_steps=3000000 & +# CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --seed=1 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --seed=1 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Hopper-v3 --seed=1 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Hopper-v3 --seed=1 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=1 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=1 --alpha=0.2 --max_steps=3000000 & +wait diff --git a/examples/sac.py b/examples/sac.py index 94974e9..3117138 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -1,6 +1,6 @@ import gym import torch.nn as nn - +import torch from easyrl.agents.sac_agent import SACAgent from easyrl.configs.command_line import cfg_from_cmd from easyrl.configs.sac_config import sac_cfg @@ -14,6 +14,7 @@ def main(): + torch.set_num_threads(1) cfg_from_cmd(sac_cfg) if sac_cfg.resume or sac_cfg.test: if sac_cfg.test: @@ -58,12 +59,12 @@ def main(): q1 = ValueNet(q1_body) q2 = ValueNet(q2_body) - agent = SACAgent(actor, q1=q1, q2=q2, env=env) - runner = StepRunner(agent=agent, env=env, eval_env=eval_env) memory = CyclicBuffer(capacity=sac_cfg.replay_size) + agent = SACAgent(actor, q1=q1, q2=q2, env=env, memory=memory) + runner = StepRunner(agent=agent, env=env, eval_env=eval_env) + engine = SACEngine(agent=agent, - runner=runner, - memory=memory) + runner=runner) if not sac_cfg.test: engine.train() else: From e64b2ae55587eed288abcbc214f22eacf8477e1e Mon Sep 17 00:00:00 2001 From: taochenshh Date: Sat, 11 Jul 2020 09:47:01 -0400 Subject: [PATCH 04/35] save replay buffer --- easyrl/configs/sac_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/easyrl/configs/sac_config.py b/easyrl/configs/sac_config.py index b26560c..377c23f 100644 --- a/easyrl/configs/sac_config.py +++ b/easyrl/configs/sac_config.py @@ -17,5 +17,8 @@ class SACConfig(BasicConfig): polyak: float = 0.995 tgt_entropy: float = None + def __post_init__(self): + self.eval_interval = 300 + sac_cfg = SACConfig() From 74eaf098fbf8524610045308a38627e1effe750d Mon Sep 17 00:00:00 2001 From: taochenshh Date: Sat, 11 Jul 2020 10:19:22 -0400 Subject: [PATCH 05/35] change data dir --- easyrl/configs/basic_config.py | 8 ++++++-- examples/run_test.sh | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index 9468226..4c41ffc 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -14,6 +14,7 @@ class BasicConfig: seed: int = 0 device: str = 'cuda' save_dir: str = 'data' + save_dir_root: str = None eval_interval: int = 100 log_interval: int = 10 weight_decay: float = 0.00 @@ -40,6 +41,9 @@ def root_dir(self): @property def data_dir(self): + # save_dir_root will be appended in the front of save_dir + # if save_dir is not specified in absolute path from the command line + save_dir_root 
= Path.cwd() if self.save_dir_root is None else Path(self.save_dir_root) if hasattr(self, 'diff_cfg') and 'save_dir' in self.diff_cfg: # if 'save_dir' is given, then it will just # use it as the data dir @@ -47,12 +51,12 @@ def data_dir(self): if save_dir.is_absolute(): data_dir = save_dir else: - data_dir = Path.cwd().joinpath(self.save_dir) + data_dir = save_dir_root.joinpath(self.save_dir) if 'seed_' in data_dir.name: return data_dir else: return data_dir.joinpath(f'seed_{self.seed}') - data_dir = Path.cwd().joinpath(self.save_dir) + data_dir = save_dir_root.joinpath(self.save_dir) if self.env_name is not None: data_dir = data_dir.joinpath(self.env_name) skip_params = ['env_name', diff --git a/examples/run_test.sh b/examples/run_test.sh index faade31..934d64d 100755 --- a/examples/run_test.sh +++ b/examples/run_test.sh @@ -17,4 +17,19 @@ CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Hopper-v3 --seed=1 --max_steps=3 CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Hopper-v3 --seed=1 --alpha=0.2 --max_steps=3000000 & CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=1 --max_steps=3000000 & CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=1 --alpha=0.2 --max_steps=3000000 & + + + +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Hopper-v3 --seed=2 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --alpha=0.2 --max_steps=3000000 & + + +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. & + +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. 
& wait From 4d52e578b7ab882a81c760009c605588c6dd2e2c Mon Sep 17 00:00:00 2001 From: taochenshh Date: Sat, 11 Jul 2020 10:26:02 -0400 Subject: [PATCH 06/35] change data dir --- easyrl/configs/basic_config.py | 1 + examples/run_test.sh | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index 4c41ffc..b2ebd5c 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -69,6 +69,7 @@ def data_dir(self): 'eval_interval', 'render', 'seed', + 'save_dir_root', 'max_steps', 'pretrain_model'] if hasattr(self, 'diff_cfg'): diff --git a/examples/run_test.sh b/examples/run_test.sh index 934d64d..43c58c8 100755 --- a/examples/run_test.sh +++ b/examples/run_test.sh @@ -20,16 +20,15 @@ CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=1 --alpha=0.2 -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 & -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --alpha=0.2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 & -CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Hopper-v3 --seed=2 --alpha=0.2 --max_steps=3000000 & -CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 & -CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --alpha=0.2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Hopper-v3 --seed=2 --alpha=0.2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --alpha=0.2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. & - -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. & -CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. 
--save_dir_root=/data/pulkitag/results/taochen/sac & wait From 94c19cadb50274a6282676374c9bc714ba7e9635 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Sun, 12 Jul 2020 23:52:37 -0400 Subject: [PATCH 07/35] add tool to sweep hyperparameters --- easyrl/agents/sac_agent.py | 4 +- easyrl/configs/basic_config.py | 1 + easyrl/utils/common.py | 37 +++--- easyrl/utils/hp_sweeper.py | 156 +++++++++++++++++++++++++ easyrl/utils/non_block_streamreader.py | 37 ++++++ examples/run_test.sh | 14 ++- examples/sac.py | 1 - examples/sac_sweeper.yml | 10 ++ requirements.txt | 4 +- 9 files changed, 243 insertions(+), 21 deletions(-) create mode 100644 easyrl/utils/hp_sweeper.py create mode 100644 easyrl/utils/non_block_streamreader.py create mode 100644 examples/sac_sweeper.yml diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index db03409..c5e17ff 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -129,8 +129,8 @@ def update_q(self, obs, actions, next_obs, rewards, dones): nlog_prob = action_log_prob(next_actions, next_act_dist).unsqueeze(-1) nq1_tgt_val = self.q1_tgt((next_obs, next_actions))[0] nq2_tgt_val = self.q2_tgt((next_obs, next_actions))[0] - nq_tgt_val = torch.min(nq1_tgt_val, nq2_tgt_val) - q_tgt_val = rewards + sac_cfg.rew_discount * (1 - dones) * (nq_tgt_val - self.alpha * nlog_prob) + nq_tgt_val = torch.min(nq1_tgt_val, nq2_tgt_val) - self.alpha * nlog_prob + q_tgt_val = rewards + sac_cfg.rew_discount * (1 - dones) * nq_tgt_val loss_q1 = F.mse_loss(q1, q_tgt_val) loss_q2 = F.mse_loss(q2, q_tgt_val) loss_q = loss_q1 + loss_q2 diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index b2ebd5c..c6a7550 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -69,6 +69,7 @@ def data_dir(self): 'eval_interval', 'render', 'seed', + 'device', 'save_dir_root', 'max_steps', 'pretrain_model'] diff --git a/easyrl/utils/common.py b/easyrl/utils/common.py index 97d6f53..f23dde4 100644 --- a/easyrl/utils/common.py +++ b/easyrl/utils/common.py @@ -9,7 +9,7 @@ import git import numpy as np import torch - +import yaml from easyrl.utils.rl_logger import logger @@ -29,9 +29,9 @@ def list_to_numpy(data, expand_dims=None): data = np.expand_dims(data, axis=expand_dims) return data + def save_traj(traj, save_dir): - if isinstance(save_dir, str): - save_dir = Path(save_dir) + save_dir = pathlib_file(save_dir) if not save_dir.exists(): Path.mkdir(save_dir, parents=True) save_state = traj[0].state is not None @@ -89,8 +89,7 @@ def save_traj(traj, save_dir): def save_images(images, save_dir): - if isinstance(save_dir, str): - save_dir = Path(save_dir) + save_dir = pathlib_file(save_dir) if save_dir.exists(): shutil.rmtree(save_dir, ignore_errors=True) Path.mkdir(save_dir, parents=True) @@ -112,8 +111,7 @@ def convert_imgs_to_video(images, video_file, fps=20): def save_to_json(data, file_name): - if isinstance(file_name, str): - file_name = Path(file_name) + file_name = pathlib_file(file_name) if not file_name.parent.exists(): Path.mkdir(file_name.parent, parents=True) with file_name.open('w') as f: @@ -121,16 +119,21 @@ def save_to_json(data, file_name): def load_from_json(file_name): - if isinstance(file_name, str): - file_name = Path(file_name) + file_name = pathlib_file(file_name) with file_name.open('r') as f: data = json.load(f) return data +def load_from_yaml(file_name): + file_name = pathlib_file(file_name) + with file_name.open('r') as f: + data = yaml.load(f, Loader=yaml.FullLoader) + return data + + def 
save_to_pickle(data, file_name): - if isinstance(file_name, str): - file_name = Path(file_name) + file_name = pathlib_file(file_name) if not file_name.parent.exists(): Path.mkdir(file_name.parent, parents=True) with file_name.open('wb') as f: @@ -138,13 +141,21 @@ def save_to_pickle(data, file_name): def load_from_pickle(file_name): - if isinstance(file_name, str): - file_name = Path(file_name) + file_name = pathlib_file(file_name) with file_name.open('rb') as f: data = pkl.load(f) return data +def pathlib_file(file_name): + if isinstance(file_name, str): + file_name = Path(file_name) + elif not isinstance(file_name, Path): + raise TypeError(f'Please check the type of ' + f'the filename:{file_name}') + return file_name + + def tile_images(img_nhwc): """ Tile N images into one big PxQ image diff --git a/easyrl/utils/hp_sweeper.py b/easyrl/utils/hp_sweeper.py new file mode 100644 index 0000000..7253950 --- /dev/null +++ b/easyrl/utils/hp_sweeper.py @@ -0,0 +1,156 @@ +import argparse +import itertools +import math +import shlex +import subprocess +import sys +import time +from pathlib import Path + +import GPUtil + +from easyrl.utils.common import load_from_yaml +from easyrl.utils.non_block_streamreader import NonBlockingStreamReader as NBSR +from easyrl.utils.rl_logger import logger + + +def get_hparams_combo(hparams): + """ + This function takes in just the hyperparameters (dict) and return the + combination of all possible hp configuration. + + inputs: + hparams is a dict, where each key is the name of a commandline arg and + the value is the target value of the arg. + + However any arg can also be a list and so this function will calculate + the cross product for all combinations of all args. + + output: + The return value is a sequence of lists. Each list is one of the + permutations of argument values. 
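+
+    Example (illustrative only; note the result is a list of dicts, one per combination):
+        get_hparams_combo({'env_name': ['Hopper-v3', 'Walker2d-v3'], 'seed': 0})
+        # -> [{'env_name': 'Hopper-v3', 'seed': 0},
+        #     {'env_name': 'Walker2d-v3', 'seed': 0}]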
+ """ + hp_vals = [] + + for elem in hparams.values(): + if isinstance(elem, list) or isinstance(elem, tuple): + hp_vals.append(elem) + else: + hp_vals.append([elem]) + + new_hp_vals = list(itertools.product(*hp_vals)) + hp_keys = hparams.keys() + new_hparams_combo = [] + for idx, hp_val in enumerate(new_hp_vals): + new_hparams_combo.append({k: v for k, v in zip(hp_keys, hp_val)}) + return new_hparams_combo + + +def cmd_for_hparams(hparams): + """ + Construct the training script args from the hparams + """ + cmd = '' + for field, val in hparams.items(): + if type(val) is bool: + if val is True: + cmd += f'--{field} ' + elif val != 'None': + cmd += f'--{field} {val} ' + return cmd + + +def get_sweep_cmds(yaml_file): + configs = load_from_yaml(yaml_file) + base_cmd = configs['cmd'] + hparams = configs['hparams'] + + hparams_combo = get_hparams_combo(hparams) + cmds = [] + for idx, hps in enumerate(hparams_combo): + cmd = base_cmd + ' ' + cmd_for_hparams(hps) + cmds.append(cmd) + + all_gpus_stats = GPUtil.getGPUs() + exclude_gpus = configs['exclude_gpus'] + gpu_mem_per_job = configs['gpu_memory_per_job'] + gpu_mem_pct_per_job = float(gpu_mem_per_job) / all_gpus_stats[0].memoryTotal + if exclude_gpus == 'None': + exclude_gpus = [] + gpus_to_use = GPUtil.getAvailable(order='first', + limit=100, + maxLoad=0.8, + maxMemory=1 - gpu_mem_pct_per_job, + includeNan=False, + excludeID=exclude_gpus, + excludeUUID=[]) + num_exps = len(cmds) + gpus_free_mem = [all_gpus_stats[x].memoryFree for x in gpus_to_use] + allowable_gpu_jobs = [int(math.floor(x / gpu_mem_per_job)) for x in gpus_free_mem] + jobs_run_on_gpu = [0 for i in range(len(gpus_free_mem))] + can_run_on_gpu = [True for i in range(len(gpus_free_mem))] + gpu_id = 0 + final_cmds = [] + for idx in range(num_exps): + if not any(can_run_on_gpu): + logger.warning(f'Run out of GPUs!') + break + while not can_run_on_gpu[gpu_id]: + gpu_id = (gpu_id + 1) % len(gpus_free_mem) + final_cmds.append(cmds[idx] + f' --device=cuda:{gpu_id}') + jobs_run_on_gpu[gpu_id] += 1 + can_run_on_gpu[gpu_id] = jobs_run_on_gpu[gpu_id] < allowable_gpu_jobs[gpu_id] + gpu_id = (gpu_id + 1) % len(gpus_free_mem) + return final_cmds + + +def run_sweep_cmds(cmds): + output_dir = Path.cwd().joinpath('sp_outputs') + output_dir.mkdir(parents=True, exist_ok=True) + processes = [] + nbsrs = [] + for idx, cmd in enumerate(cmds): + logger.info(f'CMD_{idx}:{cmd}') + p = subprocess.Popen(shlex.split(cmd), + stderr=subprocess.STDOUT, + stdout=subprocess.PIPE) + processes.append(p) + nbsrs.append(NBSR(p.stdout)) + try: + while True: + all_done = [False for i in range(len(processes))] + for idx, p in enumerate(processes): + stime = time.time() + proc_print = False + while True: + lines = nbsrs[idx].readline(0.2) + if lines: + if not proc_print: + logger.info(f'====================================') + logger.info(f'Process {idx}:') + proc_print = True + print(lines.decode('utf-8')) + if time.time() - stime > 10: + break + else: + break + if p.poll() is not None: + all_done[idx] = True + break + if all(all_done): + break + time.sleep(2) + except KeyboardInterrupt: + print('Exiting...') + for p in processes: + p.terminate() + sys.exit() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--cfg_file', type=str, + required=True, help='config file (yaml)') + args = parser.parse_args() + cmds = get_sweep_cmds(args.cfg_file) + run_sweep_cmds(cmds) diff --git a/easyrl/utils/non_block_streamreader.py b/easyrl/utils/non_block_streamreader.py new file mode 100644 
index 0000000..c17c132 --- /dev/null +++ b/easyrl/utils/non_block_streamreader.py @@ -0,0 +1,37 @@ +from queue import Empty +from queue import Queue +from threading import Thread + + +class NonBlockingStreamReader: + + def __init__(self, stream): + ''' + stream: the stream to read from. + Usually a process' stdout or stderr. + ''' + + self._s = stream + self._q = Queue() + + def _populateQueue(stream, queue): + ''' + Collect lines from 'stream' and put them in 'quque'. + ''' + + while True: + line = stream.readline() + if line: + queue.put(line) + + self._t = Thread(target=_populateQueue, + args=(self._s, self._q)) + self._t.daemon = True + self._t.start() # start collecting lines from the stream + + def readline(self, timeout=None): + try: + return self._q.get(block=timeout is not None, + timeout=timeout) + except Empty: + return None diff --git a/examples/run_test.sh b/examples/run_test.sh index 43c58c8..1949fd9 100755 --- a/examples/run_test.sh +++ b/examples/run_test.sh @@ -22,13 +22,19 @@ CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=1 --alpha=0.2 CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --alpha=0.2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 & +CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 /data/pulkitag/results/taochen/sac & CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Hopper-v3 --seed=2 --alpha=0.2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --alpha=0.2 --max_steps=3000000 --save_dir_root=/data/pulkitag/results/taochen/sac & -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & -CUDA_VISIBLE_DEVICES=2 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & -CUDA_VISIBLE_DEVICES=1 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --seed=1 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Hopper-v3 --seed=1 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=4 python sac.py --env_name=Humanoid-v3 --seed=1 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=4 python sac.py --env_name=Walker2d-v3 --seed=0 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=5 python sac.py --env_name=Hopper-v3 --seed=0 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=5 python sac.py --env_name=Humanoid-v3 --seed=0 --max_steps=3000000 --tgt_entropy=0. 
--save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=0 python sac.py --env_name=Walker2d-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=4 python sac.py --env_name=Hopper-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & +CUDA_VISIBLE_DEVICES=5 python sac.py --env_name=Humanoid-v3 --seed=2 --max_steps=3000000 --tgt_entropy=0. --save_dir_root=/data/pulkitag/results/taochen/sac & wait diff --git a/examples/sac.py b/examples/sac.py index 3117138..7f08f31 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -56,7 +56,6 @@ def main(): tanh_on_dist=True, std_cond_in=True, clamp_log_std=True) - q1 = ValueNet(q1_body) q2 = ValueNet(q2_body) memory = CyclicBuffer(capacity=sac_cfg.replay_size) diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml new file mode 100644 index 0000000..7970644 --- /dev/null +++ b/examples/sac_sweeper.yml @@ -0,0 +1,10 @@ +cmd: python sac.py + +exclude_gpus: None +gpu_memory_per_job: 1100 # unit: MB +hparams: + save_dir_root: tmp + max_steps: 3000000 +# env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] +# seed: [0, 1, 2] + alpha: [0.2, None] diff --git a/requirements.txt b/requirements.txt index c367910..27594c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,6 @@ cloudpickle gitpython tensorboard tqdm -dataclasses;python_version < '3.7' \ No newline at end of file +dataclasses;python_version < '3.7' +pyyaml +gputil \ No newline at end of file From 5d503a7f0e9b5c43ffdbfd8ff002b7fbc4800c4d Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 13 Jul 2020 14:41:46 -0400 Subject: [PATCH 08/35] bk exp --- examples/sac_sweeper.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml index 7970644..4e7500b 100644 --- a/examples/sac_sweeper.yml +++ b/examples/sac_sweeper.yml @@ -3,8 +3,10 @@ cmd: python sac.py exclude_gpus: None gpu_memory_per_job: 1100 # unit: MB hparams: - save_dir_root: tmp + save_dir_root: /data/pulkitag/results/taochen/sac max_steps: 3000000 -# env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] + env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] # seed: [0, 1, 2] - alpha: [0.2, None] + actor_lr: [0.001, 0.0003] + critic_lr: [0.001, 0.0003] +# alpha: [0.2, None] From 33d2082f8fd25c2b84af23d1938e32c13ee47f35 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 13 Jul 2020 15:32:50 -0400 Subject: [PATCH 09/35] add hp_sweep as a cli command --- easyrl/utils/hp_sweeper.py | 6 +++++- examples/README.md | 6 ++++++ setup.py | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 examples/README.md diff --git a/easyrl/utils/hp_sweeper.py b/easyrl/utils/hp_sweeper.py index 7253950..fcc1439 100644 --- a/easyrl/utils/hp_sweeper.py +++ b/easyrl/utils/hp_sweeper.py @@ -147,10 +147,14 @@ def run_sweep_cmds(cmds): sys.exit() -if __name__ == '__main__': +def main(): parser = argparse.ArgumentParser() parser.add_argument('--cfg_file', type=str, required=True, help='config file (yaml)') args = parser.parse_args() cmds = get_sweep_cmds(args.cfg_file) run_sweep_cmds(cmds) + + +if __name__ == '__main__': + main() diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..097d98b --- /dev/null +++ b/examples/README.md @@ -0,0 +1,6 @@ +### Hyperparameter Sweep + +First, define a yaml file (e.g., `sac_sweeper.yml`) that specifies the search values for each hyperparameter. 
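+A minimal config might look like the following (the values below are only an
+illustration; list-valued entries are expanded into their cross product, and the
+sweeper appends `--device=cuda:<id>` so that each job lands on a GPU with enough
+free memory):
+```yaml
+cmd: python sac.py
+exclude_gpus: None
+gpu_memory_per_job: 1100  # MB reserved per job
+hparams:
+  max_steps: 3000000
+  env_name: [Walker2d-v3, Hopper-v3]
+  seed: [0, 1]
+```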
And run the following command: +```bash +hp_sweep --cfg_file sac_sweeper.yml +``` \ No newline at end of file diff --git a/setup.py b/setup.py index ffc994d..6576840 100644 --- a/setup.py +++ b/setup.py @@ -29,4 +29,7 @@ def read_requirements_file(filename): license='MIT', packages=easyrl_pkgs, install_requires=read_requirements_file('requirements.txt'), + entry_points={ + 'console_scripts': ['hp_sweep=easyrl.utils.hp_sweeper:main'] + } ) From 5240b1067a69ce65078845c08e35cb442bb20d15 Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Wed, 15 Jul 2020 15:19:54 -0400 Subject: [PATCH 10/35] Update sac_sweeper.yml --- examples/sac_sweeper.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml index 4e7500b..b349016 100644 --- a/examples/sac_sweeper.yml +++ b/examples/sac_sweeper.yml @@ -3,7 +3,7 @@ cmd: python sac.py exclude_gpus: None gpu_memory_per_job: 1100 # unit: MB hparams: - save_dir_root: /data/pulkitag/results/taochen/sac + save_dir_root: tmp max_steps: 3000000 env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] # seed: [0, 1, 2] From df05b5d33da3f12eb1161194272dac15bf032b17 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Wed, 15 Jul 2020 15:23:46 -0400 Subject: [PATCH 11/35] fix gpu allocation bug in hp_sweeper --- easyrl/utils/hp_sweeper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/easyrl/utils/hp_sweeper.py b/easyrl/utils/hp_sweeper.py index fcc1439..0ff22e0 100644 --- a/easyrl/utils/hp_sweeper.py +++ b/easyrl/utils/hp_sweeper.py @@ -87,8 +87,8 @@ def get_sweep_cmds(yaml_file): num_exps = len(cmds) gpus_free_mem = [all_gpus_stats[x].memoryFree for x in gpus_to_use] allowable_gpu_jobs = [int(math.floor(x / gpu_mem_per_job)) for x in gpus_free_mem] - jobs_run_on_gpu = [0 for i in range(len(gpus_free_mem))] - can_run_on_gpu = [True for i in range(len(gpus_free_mem))] + jobs_run_on_gpu = [0 for i in range(len(gpus_to_use))] + can_run_on_gpu = [True for i in range(len(gpus_to_use))] gpu_id = 0 final_cmds = [] for idx in range(num_exps): @@ -96,11 +96,11 @@ def get_sweep_cmds(yaml_file): logger.warning(f'Run out of GPUs!') break while not can_run_on_gpu[gpu_id]: - gpu_id = (gpu_id + 1) % len(gpus_free_mem) - final_cmds.append(cmds[idx] + f' --device=cuda:{gpu_id}') + gpu_id = (gpu_id + 1) % len(gpus_to_use) + final_cmds.append(cmds[idx] + f' --device=cuda:{gpus_to_use[gpu_id]}') jobs_run_on_gpu[gpu_id] += 1 can_run_on_gpu[gpu_id] = jobs_run_on_gpu[gpu_id] < allowable_gpu_jobs[gpu_id] - gpu_id = (gpu_id + 1) % len(gpus_free_mem) + gpu_id = (gpu_id + 1) % len(gpus_to_use) return final_cmds @@ -149,7 +149,7 @@ def run_sweep_cmds(cmds): def main(): parser = argparse.ArgumentParser() - parser.add_argument('--cfg_file', type=str, + parser.add_argument('--cfg_file', '-f', type=str, required=True, help='config file (yaml)') args = parser.parse_args() cmds = get_sweep_cmds(args.cfg_file) From 93cab66e23c509716cea5d4ee6f6b4b985715f15 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Thu, 23 Jul 2020 10:20:39 -0400 Subject: [PATCH 12/35] get grad norm by default --- easyrl/agents/ppo_agent.py | 9 ++------- easyrl/agents/sac_agent.py | 25 ++++++++----------------- easyrl/utils/torch_util.py | 12 ++++++++++++ 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 1853586..6d79624 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -13,7 +13,7 @@ from easyrl.utils.torch_util 
import action_entropy from easyrl.utils.torch_util import action_from_dist from easyrl.utils.torch_util import action_log_prob -from easyrl.utils.torch_util import get_grad_norm +from easyrl.utils.torch_util import clip_grad from easyrl.utils.torch_util import load_ckpt_data from easyrl.utils.torch_util import load_state_dict from easyrl.utils.torch_util import move_to @@ -130,12 +130,7 @@ def optimize(self, data, *args, **kwargs): self.optimizer.zero_grad() loss.backward() - if ppo_cfg.max_grad_norm is not None: - grad_norm = torch.nn.utils.clip_grad_norm_(self.all_params, - ppo_cfg.max_grad_norm) - grad_norm = grad_norm.item() - else: - grad_norm = get_grad_norm(self.all_params) + grad_norm = clip_grad(self.all_params, ppo_cfg.max_grad_norm) self.optimizer.step() with torch.no_grad(): approx_kl = 0.5 * torch.mean(torch.pow(old_log_prob - log_prob, 2)) diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index c5e17ff..ccdb67f 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -1,3 +1,4 @@ +import pickle from copy import deepcopy import torch @@ -7,11 +8,14 @@ from easyrl.agents.base_agent import BaseAgent from easyrl.configs.sac_config import sac_cfg +from easyrl.utils.common import load_from_pickle +from easyrl.utils.common import save_to_pickle from easyrl.utils.gym_util import num_space_dim +from easyrl.utils.rl_logger import logger from easyrl.utils.torch_util import action_from_dist from easyrl.utils.torch_util import action_log_prob +from easyrl.utils.torch_util import clip_grad from easyrl.utils.torch_util import freeze_model -from easyrl.utils.torch_util import get_grad_norm from easyrl.utils.torch_util import load_ckpt_data from easyrl.utils.torch_util import load_state_dict from easyrl.utils.torch_util import move_to @@ -20,10 +24,7 @@ from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np from easyrl.utils.torch_util import unfreeze_model -from easyrl.utils.common import save_to_pickle -from easyrl.utils.rl_logger import logger -from easyrl.utils.common import load_from_pickle -import pickle + class SACAgent(BaseAgent): def __init__(self, actor, q1, q2, env, memory): @@ -136,12 +137,7 @@ def update_q(self, obs, actions, next_obs, rewards, dones): loss_q = loss_q1 + loss_q2 self.q_optimizer.zero_grad() loss_q.backward() - if sac_cfg.max_grad_norm is not None: - grad_norm = torch.nn.utils.clip_grad_norm_(self.q_params, - sac_cfg.max_grad_norm) - grad_norm = grad_norm.item() - else: - grad_norm = get_grad_norm(self.q_params) + grad_norm = clip_grad(self.q_params, sac_cfg.max_grad_norm) self.q_optimizer.step() q_info = dict( q1_loss=loss_q1.item(), @@ -166,12 +162,7 @@ def update_pi(self, obs): self.q_optimizer.zero_grad() self.pi_optimizer.zero_grad() loss_pi.backward() - if sac_cfg.max_grad_norm is not None: - grad_norm = torch.nn.utils.clip_grad_norm_(self.actor.parameters(), - sac_cfg.max_grad_norm) - grad_norm = grad_norm.item() - else: - grad_norm = get_grad_norm(self.actor.parameters()) + grad_norm = clip_grad(self.actor.parameters(), sac_cfg.max_grad_norm) self.pi_optimizer.step() pi_info = dict( pi_loss=loss_pi.item(), diff --git a/easyrl/utils/torch_util.py b/easyrl/utils/torch_util.py index 32af0d6..5ba4a6c 100644 --- a/easyrl/utils/torch_util.py +++ b/easyrl/utils/torch_util.py @@ -24,6 +24,16 @@ def hard_update(target, source): target.load_state_dict(source.state_dict()) +def clip_grad(params, max_grad_norm): + if max_grad_norm is not None: + grad_norm = 
torch.nn.utils.clip_grad_norm_(params, + max_grad_norm) + grad_norm = grad_norm.item() + else: + grad_norm = get_grad_norm(params) + return grad_norm + + def freeze_model(model): if isinstance(model, list) or isinstance(model, tuple): for md in model: @@ -54,6 +64,8 @@ def get_grad_norm(model): total_norm = 0 iterator = model.parameters() if isinstance(model, nn.Module) else model for p in iterator: + if p.grad is None: + continue total_norm += p.grad.data.pow(2).sum().item() total_norm = total_norm ** 0.5 return total_norm From c1d4132cb1219de73307c7d516a451b8e6c8f25e Mon Sep 17 00:00:00 2001 From: taochenshh Date: Thu, 20 Aug 2020 11:58:55 -0400 Subject: [PATCH 13/35] update utils --- easyrl/agents/ppo_agent.py | 8 +++++-- easyrl/engine/ppo_engine.py | 6 ++--- easyrl/utils/common.py | 2 ++ easyrl/utils/gym_util.py | 22 ++++++++++++++++- easyrl/utils/hp_sweeper.py | 47 +++++++++++++++++++++++++++++-------- easyrl/utils/torch_util.py | 7 ++++-- examples/sac_sweeper.yml | 30 +++++++++++++++++++---- 7 files changed, 99 insertions(+), 23 deletions(-) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 6d79624..86fa7da 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -171,8 +171,12 @@ def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy): 1 - ppo_cfg.clip_range, 1 + ppo_cfg.clip_range) pg_loss = -torch.mean(torch.min(surr1, surr2)) - - loss = pg_loss - entropy * ppo_cfg.ent_coef + \ + # if entropy.item() < 0.2: + # ent_coef = 1 + # else: + # ent_coef = ppo_cfg.ent_coef + ent_coef = ppo_cfg.ent_coef + loss = pg_loss - entropy * ent_coef + \ vf_loss * ppo_cfg.vf_coef return loss, pg_loss, vf_loss, ratio diff --git a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index ded202b..ae07d37 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -36,15 +36,15 @@ def __init__(self, agent, runner): def train(self): for iter_t in count(): - traj, rollout_time = self.rollout_once(sample=True, - time_steps=ppo_cfg.episode_steps) - train_log_info = self.train_once(traj) if iter_t % ppo_cfg.eval_interval == 0: eval_log_info, _ = self.eval() self.agent.save_model(is_best=self._eval_is_best, step=self.cur_step) else: eval_log_info = None + traj, rollout_time = self.rollout_once(sample=True, + time_steps=ppo_cfg.episode_steps) + train_log_info = self.train_once(traj) if iter_t % ppo_cfg.log_interval == 0: train_log_info['train/rollout_time'] = rollout_time if eval_log_info is not None: diff --git a/easyrl/utils/common.py b/easyrl/utils/common.py index f23dde4..75a0d02 100644 --- a/easyrl/utils/common.py +++ b/easyrl/utils/common.py @@ -183,6 +183,8 @@ def linear_decay_percent(epoch, total_epochs): def get_list_stats(data): + if len(data) < 1: + return dict() min_data = np.amin(data) max_data = np.amax(data) mean_data = np.mean(data) diff --git a/easyrl/utils/gym_util.py b/easyrl/utils/gym_util.py index de59910..eff2a6a 100644 --- a/easyrl/utils/gym_util.py +++ b/easyrl/utils/gym_util.py @@ -1,3 +1,5 @@ +from copy import deepcopy + import gym import numpy as np from gym.spaces import Box @@ -11,7 +13,7 @@ from easyrl.envs.shmem_vec_env import ShmemVecEnv from easyrl.envs.timeout import TimeOutEnv from easyrl.utils.rl_logger import logger - +from gym.wrappers.time_limit import TimeLimit def num_space_dim(space): if isinstance(space, Box): @@ -55,3 +57,21 @@ def _thunk(): else: envs = DummyVecEnv(envs) return envs + + +def get_render_images(env): + try: + img = env.get_images() + except AttributeError: + try: + 
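+            # get_images() only exists on the vectorized envs;
+            # fall back to the standard gym render('rgb_array') call.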
img = env.render('rgb_array') + except AttributeError: + raise AttributeError('Cannot get rendered images.') + return deepcopy(img) + + +def is_time_limit_env(env): + if not (isinstance(env, TimeLimit)): + if not hasattr(env, 'env') or (hasattr(env, 'env') and not isinstance(env.env, TimeLimit)): + return False + return True diff --git a/easyrl/utils/hp_sweeper.py b/easyrl/utils/hp_sweeper.py index 0ff22e0..074ff6f 100644 --- a/easyrl/utils/hp_sweeper.py +++ b/easyrl/utils/hp_sweeper.py @@ -14,6 +14,20 @@ from easyrl.utils.rl_logger import logger +def expand_hparam_items(hparams, hp_keys, hp_vals, key_prefix=None): + for key, elem in hparams.items(): + if isinstance(elem, list) or isinstance(elem, tuple): + key = key if key_prefix is None else f'{key_prefix}/{key}' + hp_keys.append(key) + hp_vals.append(elem) + elif isinstance(elem, dict): + expand_hparam_items(elem, hp_keys, hp_vals, key_prefix=key) + else: + key = key if key_prefix is None else f'{key_prefix}/{key}' + hp_keys.append(key) + hp_vals.append([elem]) + + def get_hparams_combo(hparams): """ This function takes in just the hyperparameters (dict) and return the @@ -31,15 +45,10 @@ def get_hparams_combo(hparams): permutations of argument values. """ hp_vals = [] - - for elem in hparams.values(): - if isinstance(elem, list) or isinstance(elem, tuple): - hp_vals.append(elem) - else: - hp_vals.append([elem]) + hp_keys = [] + expand_hparam_items(hparams, hp_keys, hp_vals) new_hp_vals = list(itertools.product(*hp_vals)) - hp_keys = hparams.keys() new_hparams_combo = [] for idx, hp_val in enumerate(new_hp_vals): new_hparams_combo.append({k: v for k, v in zip(hp_keys, hp_val)}) @@ -52,14 +61,30 @@ def cmd_for_hparams(hparams): """ cmd = '' for field, val in hparams.items(): + # by default, if a boolean variable is not specified + # with a default config value, we assume the default + # value is False if type(val) is bool: - if val is True: - cmd += f'--{field} ' - elif val != 'None': + if '/' in field and 'true' in field: + cmd = boolean_cmd(cmd, field, val, default_false=False) + else: + cmd = boolean_cmd(cmd, field, val, default_false=True) + elif val != 'None' and val: cmd += f'--{field} {val} ' return cmd +def boolean_cmd(cmd, field, val, default_false=True): + if '/' in field: + field = field.split('/')[-1] + if val is default_false: + if default_false: + cmd += f'--{field} ' + else: + cmd += f'--no_{field} ' + return cmd + + def get_sweep_cmds(yaml_file): configs = load_from_yaml(yaml_file) base_cmd = configs['cmd'] @@ -141,6 +166,8 @@ def run_sweep_cmds(cmds): break time.sleep(2) except KeyboardInterrupt: + logger.warning('Keyboard interruption.') + finally: print('Exiting...') for p in processes: p.terminate() diff --git a/easyrl/utils/torch_util.py b/easyrl/utils/torch_util.py index 5ba4a6c..0450ed1 100644 --- a/easyrl/utils/torch_util.py +++ b/easyrl/utils/torch_util.py @@ -402,10 +402,13 @@ def ortho_init(module, nonlinearity=None, weight_scale=1.0, constant_bias=0.0): class EpisodeDataset(Dataset): - def __init__(self, **kwargs): + def __init__(self, swap_leading_axes=True, **kwargs): self.data = dict() for key, val in kwargs.items(): - self.data[key] = self._swap_leading_axes(val) + if swap_leading_axes: + self.data[key] = self._swap_leading_axes(val) + else: + self.data[key] = val self.length = next(iter(self.data.values())).shape[0] def __len__(self): diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml index b349016..b56e808 100644 --- a/examples/sac_sweeper.yml +++ b/examples/sac_sweeper.yml @@ -1,12 +1,32 
@@ cmd: python sac.py -exclude_gpus: None +exclude_gpus: None # [1] gpu_memory_per_job: 1100 # unit: MB hparams: - save_dir_root: tmp + save_dir_root: tgt_sample_data max_steps: 3000000 env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] -# seed: [0, 1, 2] - actor_lr: [0.001, 0.0003] - critic_lr: [0.001, 0.0003] +# no_pretrain_actor: True +# warmup_steps: 256 +# pretrain_model: None +# freeze_q: True +# polyak: [0.99, 0.92] +# alpha: [0.] +# opt_interval: [1000] +# opt_num: [500, 1000] +# batch_size: [256, 512] +# no_q2: [True] +# no_qent: [True, False] +# no_pent: [True, False] +# no_tgt: [True, False] +# hard_update: [1000, 10000, 5000] + + seed: [1, 0] +# actor_lr: [0.001, 0.0003] +# critic_lr: [0.001, 0.0003] # alpha: [0.2, None] + # default_true: + # tgt_sample: [True, False] + default_false: +# no_q2: [True] + From b50fe4ce632063e681b3875fae2b3eaf1baac12d Mon Sep 17 00:00:00 2001 From: taochenshh Date: Wed, 2 Sep 2020 00:31:08 -0400 Subject: [PATCH 14/35] add episode_steps util in Data class --- easyrl/utils/data.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/easyrl/utils/data.py b/easyrl/utils/data.py index c995c79..3490d1f 100644 --- a/easyrl/utils/data.py +++ b/easyrl/utils/data.py @@ -160,6 +160,32 @@ def episode_returns(self): all_epr.append(epr) return all_epr + @property + def episode_steps(self): + """ + return the number of steps in each episode + + Returns: + list: a list of length-num_envs, + each element in this list is a list of # of steps in an episode + + """ + all_epl = [] + dones = self.dones + for i in range(dones.shape[1]): + epl = [] + di = dones[:, i] + + if not np.any(di): + epl.append(di.shape[0]) + else: + di = np.insert(di, 0, 1) + done_idx = np.where(di)[0] + leng = np.diff(done_idx) + epl.extend(leng.tolist()) + all_epl.append(epl) + return all_epl + def pop(self): """ Remove and return the last element from the trajectory From 5eb1a1483d2b402d91951c8a4cde39e1bfd22624 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Wed, 23 Sep 2020 12:29:02 -0400 Subject: [PATCH 15/35] add loss to penalize the action mean drifting away from 1 --- easyrl/agents/ppo_agent.py | 12 +++++++----- examples/ppo.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 86fa7da..26f501c 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -17,6 +17,7 @@ from easyrl.utils.torch_util import load_ckpt_data from easyrl.utils.torch_util import load_state_dict from easyrl.utils.torch_util import move_to +from torch.distributions import Independent from easyrl.utils.torch_util import save_model from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np @@ -163,7 +164,7 @@ def optim_preprocess(self, data): raise ValueError('val, entropy, log_prob should be 1-dim!') return val, old_val, ret, log_prob, old_log_prob, adv, entropy - def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy): + def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy, act_dist): vf_loss = self.cal_val_loss(val=val, old_val=old_val, ret=ret) ratio = torch.exp(log_prob - old_log_prob) surr1 = adv * ratio @@ -171,13 +172,14 @@ def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy): 1 - ppo_cfg.clip_range, 1 + ppo_cfg.clip_range) pg_loss = -torch.mean(torch.min(surr1, surr2)) - # if entropy.item() < 0.2: - # ent_coef = 1 - # else: - # ent_coef = ppo_cfg.ent_coef + ent_coef = ppo_cfg.ent_coef loss 
= pg_loss - entropy * ent_coef + \ vf_loss * ppo_cfg.vf_coef + if isinstance(act_dist, Independent): + dist = torch.abs(act_dist.mean) - 1.5 + act_penalty = torch.mean(torch.max(dist, torch.zeros_like(dist))) + loss = loss + act_penalty return loss, pg_loss, vf_loss, ratio def cal_val_loss(self, val, old_val, ret): diff --git a/examples/ppo.py b/examples/ppo.py index ff607ff..1b81fdb 100644 --- a/examples/ppo.py +++ b/examples/ppo.py @@ -12,7 +12,7 @@ from easyrl.runner.episodic_runner import EpisodicRunner from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env - +from pybullet_envs.gym_locomotion_envs import AntBulletEnv def main(): cfg_from_cmd(ppo_cfg) @@ -27,7 +27,7 @@ def main(): skip_params = [] ppo_cfg.restore_cfg(skip_params=skip_params) if ppo_cfg.env_name is None: - ppo_cfg.env_name = 'HalfCheetah-v2' + ppo_cfg.env_name = 'Ant-v2' set_random_seed(ppo_cfg.seed) env = make_vec_env(ppo_cfg.env_name, ppo_cfg.num_envs, From f936e52bc967d83b5d2213ef4dc09f2db2782996 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Wed, 23 Sep 2020 12:31:20 -0400 Subject: [PATCH 16/35] add loss to penalize the action mean drifting away from 1 --- easyrl/agents/ppo_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 26f501c..a862b64 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -164,7 +164,7 @@ def optim_preprocess(self, data): raise ValueError('val, entropy, log_prob should be 1-dim!') return val, old_val, ret, log_prob, old_log_prob, adv, entropy - def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy, act_dist): + def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy, act_dist=None): vf_loss = self.cal_val_loss(val=val, old_val=old_val, ret=ret) ratio = torch.exp(log_prob - old_log_prob) surr1 = adv * ratio @@ -176,7 +176,7 @@ def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy, act_ ent_coef = ppo_cfg.ent_coef loss = pg_loss - entropy * ent_coef + \ vf_loss * ppo_cfg.vf_coef - if isinstance(act_dist, Independent): + if act_dist is not None and isinstance(act_dist, Independent): dist = torch.abs(act_dist.mean) - 1.5 act_penalty = torch.mean(torch.max(dist, torch.zeros_like(dist))) loss = loss + act_penalty From d74e966a60d3b3fa4aa3fa910effd041175f0338 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Wed, 23 Sep 2020 20:34:45 -0400 Subject: [PATCH 17/35] remove penalize act dist mean drifting --- easyrl/agents/ppo_agent.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index a862b64..6622002 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -164,7 +164,7 @@ def optim_preprocess(self, data): raise ValueError('val, entropy, log_prob should be 1-dim!') return val, old_val, ret, log_prob, old_log_prob, adv, entropy - def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy, act_dist=None): + def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy): vf_loss = self.cal_val_loss(val=val, old_val=old_val, ret=ret) ratio = torch.exp(log_prob - old_log_prob) surr1 = adv * ratio @@ -176,10 +176,6 @@ def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy, act_ ent_coef = ppo_cfg.ent_coef loss = pg_loss - entropy * ent_coef + \ vf_loss * ppo_cfg.vf_coef - if act_dist is not None and isinstance(act_dist, Independent): - dist = 
torch.abs(act_dist.mean) - 1.5 - act_penalty = torch.mean(torch.max(dist, torch.zeros_like(dist))) - loss = loss + act_penalty return loss, pg_loss, vf_loss, ratio def cal_val_loss(self, val, old_val, ret): From 30da5b39c717d2991b0a4afacd9dc5416351b93c Mon Sep 17 00:00:00 2001 From: taochenshh Date: Tue, 8 Dec 2020 00:26:47 -0500 Subject: [PATCH 18/35] add rnn ppo example --- easyrl/agents/ppo_agent.py | 1 - easyrl/agents/ppo_rl2_agent.py | 67 +++++++++++++++++ easyrl/agents/ppo_rnn_agent.py | 37 ++++++++-- easyrl/engine/ppo_rnn_engine.py | 3 + easyrl/models/diag_gaussian_policy.py | 1 + easyrl/models/rl2_base.py | 33 +++++++++ easyrl/models/rnn_base.py | 23 +++++- easyrl/models/rnn_diag_gaussian_policy.py | 62 ++++++++++++++++ easyrl/runner/rnn_runner.py | 9 ++- easyrl/utils/common.py | 1 + easyrl/utils/gym_util.py | 3 +- examples/ppo.py | 2 +- examples/rnn_ppo.py | 89 +++++++++++++++++++++++ examples/sac.py | 7 +- examples/sac_sweeper.yml | 40 +++++----- 15 files changed, 338 insertions(+), 40 deletions(-) create mode 100644 easyrl/agents/ppo_rl2_agent.py create mode 100644 easyrl/models/rl2_base.py create mode 100644 easyrl/models/rnn_diag_gaussian_policy.py create mode 100644 examples/rnn_ppo.py diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 6622002..d053d07 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -17,7 +17,6 @@ from easyrl.utils.torch_util import load_ckpt_data from easyrl.utils.torch_util import load_state_dict from easyrl.utils.torch_util import move_to -from torch.distributions import Independent from easyrl.utils.torch_util import save_model from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np diff --git a/easyrl/agents/ppo_rl2_agent.py b/easyrl/agents/ppo_rl2_agent.py new file mode 100644 index 0000000..c3fd150 --- /dev/null +++ b/easyrl/agents/ppo_rl2_agent.py @@ -0,0 +1,67 @@ +import torch + +from easyrl.agents.ppo_agent import PPOAgent +from easyrl.configs.ppo_config import ppo_cfg +from easyrl.utils.torch_util import action_entropy +from easyrl.utils.torch_util import action_from_dist +from easyrl.utils.torch_util import action_log_prob +from easyrl.utils.torch_util import torch_float +from easyrl.utils.torch_util import torch_to_np + + +class PPORNNAgent(PPOAgent): + def __init__(self, actor, critic, same_body=False): + super().__init__(actor=actor, + critic=critic, + same_body=same_body) + + @torch.no_grad() + def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): + self.eval_mode() + t_ob = torch.from_numpy(ob).float().to(ppo_cfg.device).unsqueeze(dim=1) + act_dist, val, out_hidden_state = self.get_act_val(t_ob, + hidden_state=hidden_state) + action = action_from_dist(act_dist, + sample=sample) + log_prob = action_log_prob(action, act_dist) + entropy = action_entropy(act_dist, log_prob) + action_info = dict( + log_prob=torch_to_np(log_prob.squeeze(1)), + entropy=torch_to_np(entropy.squeeze(1)), + val=torch_to_np(val.squeeze(1)), + ) + return torch_to_np(action.squeeze(1)), action_info, out_hidden_state + + def get_act_val(self, ob, hidden_state=None, *args, **kwargs): + ob = torch_float(ob, device=ppo_cfg.device) + act_dist, body_out, out_hidden_state = self.actor(ob, hidden_state=hidden_state) + if self.same_body: + val, body_out, out_hidden_state = self.critic(body_x=body_out, hidden_state=hidden_state) + else: + val, body_out, out_hidden_state = self.critic(x=ob, hidden_state=hidden_state) + val = val.squeeze(-1) + return act_dist, val, 
out_hidden_state + + @torch.no_grad() + def get_val(self, ob, hidden_state=None, *args, **kwargs): + self.eval_mode() + ob = torch_float(ob, device=ppo_cfg.device).unsqueeze(dim=1) + val, body_out, out_hidden_state = self.critic(x=ob, + hidden_state=hidden_state) + val = val.squeeze(-1) + return val, out_hidden_state + + def optim_preprocess(self, data): + for key, val in data.items(): + data[key] = torch_float(val, device=ppo_cfg.device) + ob = data['ob'] + action = data['action'] + ret = data['ret'] + adv = data['adv'] + old_log_prob = data['log_prob'] + old_val = data['val'] + + act_dist, val, out_hidden_state = self.get_act_val(ob) + log_prob = action_log_prob(action, act_dist) + entropy = action_entropy(act_dist, log_prob) + return val, old_val, ret, log_prob, old_log_prob, adv, entropy diff --git a/easyrl/agents/ppo_rnn_agent.py b/easyrl/agents/ppo_rnn_agent.py index c3fd150..2f1c5ed 100644 --- a/easyrl/agents/ppo_rnn_agent.py +++ b/easyrl/agents/ppo_rnn_agent.py @@ -1,3 +1,4 @@ +import numpy as np import torch from easyrl.agents.ppo_agent import PPOAgent @@ -16,8 +17,10 @@ def __init__(self, actor, critic, same_body=False): same_body=same_body) @torch.no_grad() - def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): + def get_action(self, ob, sample=True, hidden_state=None, prev_done=False, *args, **kwargs): self.eval_mode() + hidden_state = self.check_hidden_state(hidden_state, prev_done) + t_ob = torch.from_numpy(ob).float().to(ppo_cfg.device).unsqueeze(dim=1) act_dist, val, out_hidden_state = self.get_act_val(t_ob, hidden_state=hidden_state) @@ -32,19 +35,27 @@ def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): ) return torch_to_np(action.squeeze(1)), action_info, out_hidden_state - def get_act_val(self, ob, hidden_state=None, *args, **kwargs): + def get_act_val(self, ob, hidden_state=None, done=None, *args, **kwargs): ob = torch_float(ob, device=ppo_cfg.device) - act_dist, body_out, out_hidden_state = self.actor(ob, hidden_state=hidden_state) + act_dist, body_out, out_hidden_state = self.actor(ob, + hidden_state=hidden_state, + done=done) if self.same_body: - val, body_out, out_hidden_state = self.critic(body_x=body_out, hidden_state=hidden_state) + val, body_out, out_hidden_state = self.critic(body_x=body_out, + hidden_state=hidden_state, + done=done) else: - val, body_out, out_hidden_state = self.critic(x=ob, hidden_state=hidden_state) + val, body_out, out_hidden_state = self.critic(x=ob, + hidden_state=hidden_state, + done=done) val = val.squeeze(-1) return act_dist, val, out_hidden_state @torch.no_grad() - def get_val(self, ob, hidden_state=None, *args, **kwargs): + def get_val(self, ob, hidden_state=None, prev_done=False, *args, **kwargs): self.eval_mode() + hidden_state = self.check_hidden_state(hidden_state, prev_done) + ob = torch_float(ob, device=ppo_cfg.device).unsqueeze(dim=1) val, body_out, out_hidden_state = self.critic(x=ob, hidden_state=hidden_state) @@ -60,8 +71,20 @@ def optim_preprocess(self, data): adv = data['adv'] old_log_prob = data['log_prob'] old_val = data['val'] + done = data['done'] - act_dist, val, out_hidden_state = self.get_act_val(ob) + act_dist, val, out_hidden_state = self.get_act_val(ob, done=done) log_prob = action_log_prob(action, act_dist) entropy = action_entropy(act_dist, log_prob) return val, old_val, ret, log_prob, old_log_prob, adv, entropy + + def check_hidden_state(self, hidden_state, prev_done): + if prev_done is not None: + # if the last step is the end of an episode, + # then reset hidden 
state + done_idx = np.argwhere(prev_done).flatten() + if done_idx.size > 0: + ld, b, hz = hidden_state.shape + hidden_state[:, done_idx] = torch.zeros(ld, done_idx.size, hz, + device=hidden_state.device) + return hidden_state diff --git a/easyrl/engine/ppo_rnn_engine.py b/easyrl/engine/ppo_rnn_engine.py index 22ba81f..fcbd22d 100644 --- a/easyrl/engine/ppo_rnn_engine.py +++ b/easyrl/engine/ppo_rnn_engine.py @@ -1,3 +1,5 @@ +import time + import numpy as np from torch.utils.data import DataLoader @@ -28,6 +30,7 @@ def traj_preprocess(self, traj): adv=adv.swapaxes(0, 1), log_prob=log_prob.swapaxes(0, 1), val=vals.swapaxes(0, 1), + done=traj.dones.swapaxes(0, 1) ) rollout_dataset = DictDataset(**data) rollout_dataloader = DataLoader(rollout_dataset, diff --git a/easyrl/models/diag_gaussian_policy.py b/easyrl/models/diag_gaussian_policy.py index 5dff04f..b3ddd3c 100644 --- a/easyrl/models/diag_gaussian_policy.py +++ b/easyrl/models/diag_gaussian_policy.py @@ -9,6 +9,7 @@ LOG_STD_MAX = 2 LOG_STD_MIN = -20 + class DiagGaussianPolicy(nn.Module): def __init__(self, body_net, diff --git a/easyrl/models/rl2_base.py b/easyrl/models/rl2_base.py new file mode 100644 index 0000000..2f2001a --- /dev/null +++ b/easyrl/models/rl2_base.py @@ -0,0 +1,33 @@ +import torch.nn as nn + + +class RL2Base(nn.Module): + def __init__(self, + body_net, + rnn_features=128, + in_features=128, + rnn_layers=1, + ): + super().__init__() + self.body = body_net + self.gru = nn.GRU(input_size=in_features, + hidden_size=rnn_features, + num_layers=rnn_layers, + batch_first=True) + self.fcs = nn.Linear(rnn_features, rnn_features) + self.fcs = nn.Sequential( + nn.ELU(), + nn.Linear(in_features=rnn_features, out_features=rnn_features), + nn.ELU() + ) + + def forward(self, x=None, hidden_state=None): + b = x.shape[0] + t = x.shape[1] + x = x.view(b * t, *x.shape[2:]) + obs_feature = self.body(x) + obs_feature = obs_feature.view(b, t, *obs_feature.shape[1:]) + rnn_features, hidden_state = self.gru(obs_feature, + hidden_state) + out = self.fcs(rnn_features) + return out, hidden_state diff --git a/easyrl/models/rnn_base.py b/easyrl/models/rnn_base.py index b58bb7a..69a1dda 100644 --- a/easyrl/models/rnn_base.py +++ b/easyrl/models/rnn_base.py @@ -1,3 +1,4 @@ +import torch import torch.nn as nn @@ -21,13 +22,29 @@ def __init__(self, nn.ELU() ) - def forward(self, x=None, hidden_state=None): + def forward(self, x=None, hidden_state=None, done=None): b = x.shape[0] t = x.shape[1] x = x.view(b * t, *x.shape[2:]) obs_feature = self.body(x) obs_feature = obs_feature.view(b, t, *obs_feature.shape[1:]) - rnn_features, hidden_state = self.gru(obs_feature, - hidden_state) + + if self.training: + done_ts = (done == 1).any(dim=0).nonzero().squeeze().cpu().numpy() + 1 + done_ts = done_ts.tolist() + done_ts = [0] + done_ts + [t] + rnn_features = [] + for idx in range(len(done_ts) - 1): + sid = done_ts[idx] + eid = done_ts[idx + 1] + if hidden_state is not None: + hidden_state = hidden_state * (1 - done[:, sid]).view(1, -1, 1) + rfeatures, hidden_state = self.gru(obs_feature[:, sid:eid], + hidden_state) + rnn_features.append(rfeatures) + rnn_features = torch.cat(rnn_features, dim=1) + else: + rnn_features, hidden_state = self.gru(obs_feature, + hidden_state) out = self.fcs(rnn_features) return out, hidden_state diff --git a/easyrl/models/rnn_diag_gaussian_policy.py b/easyrl/models/rnn_diag_gaussian_policy.py new file mode 100644 index 0000000..1eadafe --- /dev/null +++ b/easyrl/models/rnn_diag_gaussian_policy.py @@ -0,0 +1,62 @@ +import torch +import 
torch.nn as nn +from torch.distributions import Independent +from torch.distributions import Normal +from torch.distributions import TransformedDistribution + +from torch.distributions.transforms import TanhTransform + +LOG_STD_MAX = 2 +LOG_STD_MIN = -20 + + +class RNNDiagGaussianPolicy(nn.Module): + def __init__(self, + body_net, + action_dim, + init_log_std=-0.51, + std_cond_in=False, + tanh_on_dist=False, + in_features=None, + clamp_log_std=False): # add tanh on the action distribution + super().__init__() + self.std_cond_in = std_cond_in + self.tanh_on_dist = tanh_on_dist + self.body = body_net + self.clamp_log_std = clamp_log_std + + if in_features is None: + for i in reversed(range(len(self.body.fcs))): + layer = self.body.fcs[i] + if hasattr(layer, 'out_features'): + in_features = layer.out_features + break + + self.head_mean = nn.Linear(in_features, action_dim) + if self.std_cond_in: + self.head_logstd = nn.Linear(in_features, action_dim) + else: + self.head_logstd = nn.Parameter(torch.full((action_dim,), + init_log_std)) + + def forward(self, x=None, body_x=None, hidden_state=None, **kwargs): + if x is None and body_x is None: + raise ValueError('One of [x, body_x] should be provided!') + if body_x is None: + body_x, hidden_state = self.body(x, + hidden_state=hidden_state, + **kwargs) + body_out = body_x[0] if isinstance(body_x, tuple) else body_x + mean = self.head_mean(body_out) + if self.std_cond_in: + log_std = self.head_logstd(body_out) + else: + log_std = self.head_logstd.expand_as(mean) + if self.clamp_log_std: + log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) + std = torch.exp(log_std) + action_dist = Independent(Normal(loc=mean, scale=std), 1) + if self.tanh_on_dist: + action_dist = TransformedDistribution(action_dist, + [TanhTransform(cache_size=1)]) + return action_dist, body_x, hidden_state diff --git a/easyrl/runner/rnn_runner.py b/easyrl/runner/rnn_runner.py index 7d540bd..526bafa 100644 --- a/easyrl/runner/rnn_runner.py +++ b/easyrl/runner/rnn_runner.py @@ -35,6 +35,7 @@ def __call__(self, time_steps, sample=True, evaluation=False, if return_on_done: all_dones = np.zeros(env.num_envs, dtype=bool) hidden_state = None + done = None for t in range(time_steps): if render: env.render() @@ -44,11 +45,10 @@ def __call__(self, time_steps, sample=True, evaluation=False, # get render images at the same time step as ob imgs = deepcopy(env.get_images()) - ## TODO add masks on hidden state so that hidden state from - ## previous episode does not get passed to the next episode after done=True - action, action_info, hidden_state = self.agent.get_action(ob['ob'], + action, action_info, hidden_state = self.agent.get_action(ob, sample=sample, hidden_state=hidden_state, + prev_done=done, **action_kwargs) next_ob, reward, done, info = env.step(action) next_ob = deepcopy(next_ob) @@ -74,6 +74,7 @@ def __call__(self, time_steps, sample=True, evaluation=False, break if not evaluation: last_val, _ = self.agent.get_val(traj[-1].next_ob, - hidden_state=hidden_state) + hidden_state=hidden_state, + prev_done=done) traj.add_extra('last_val', torch_to_np(last_val)) return traj diff --git a/easyrl/utils/common.py b/easyrl/utils/common.py index 75a0d02..afb2bfd 100644 --- a/easyrl/utils/common.py +++ b/easyrl/utils/common.py @@ -10,6 +10,7 @@ import numpy as np import torch import yaml + from easyrl.utils.rl_logger import logger diff --git a/easyrl/utils/gym_util.py b/easyrl/utils/gym_util.py index eff2a6a..d59c257 100644 --- a/easyrl/utils/gym_util.py +++ b/easyrl/utils/gym_util.py @@ 
-8,12 +8,13 @@ from gym.spaces import MultiBinary from gym.spaces import MultiDiscrete from gym.spaces import Tuple +from gym.wrappers.time_limit import TimeLimit from easyrl.envs.dummy_vec_env import DummyVecEnv from easyrl.envs.shmem_vec_env import ShmemVecEnv from easyrl.envs.timeout import TimeOutEnv from easyrl.utils.rl_logger import logger -from gym.wrappers.time_limit import TimeLimit + def num_space_dim(space): if isinstance(space, Box): diff --git a/examples/ppo.py b/examples/ppo.py index 1b81fdb..06ca565 100644 --- a/examples/ppo.py +++ b/examples/ppo.py @@ -12,7 +12,7 @@ from easyrl.runner.episodic_runner import EpisodicRunner from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env -from pybullet_envs.gym_locomotion_envs import AntBulletEnv + def main(): cfg_from_cmd(ppo_cfg) diff --git a/examples/rnn_ppo.py b/examples/rnn_ppo.py new file mode 100644 index 0000000..0010ca5 --- /dev/null +++ b/examples/rnn_ppo.py @@ -0,0 +1,89 @@ +import gym +import torch.nn as nn + +from easyrl.agents.ppo_rnn_agent import PPORNNAgent +from easyrl.configs.command_line import cfg_from_cmd +from easyrl.configs.ppo_config import ppo_cfg +from easyrl.engine.ppo_rnn_engine import PPORNNEngine +from easyrl.models.mlp import MLP +from easyrl.models.rnn_base import RNNBase +from easyrl.models.rnn_categorical_policy import RNNCategoricalPolicy +from easyrl.models.rnn_diag_gaussian_policy import RNNDiagGaussianPolicy +from easyrl.models.rnn_value_net import RNNValueNet +from easyrl.runner.rnn_runner import RNNRunner +from easyrl.utils.common import set_random_seed +from easyrl.utils.gym_util import make_vec_env + + +def main(): + cfg_from_cmd(ppo_cfg) + if ppo_cfg.resume or ppo_cfg.test: + if ppo_cfg.test: + skip_params = [ + 'test_num', + 'num_envs', + 'sample_action', + ] + else: + skip_params = [] + ppo_cfg.restore_cfg(skip_params=skip_params) + if ppo_cfg.env_name is None: + ppo_cfg.env_name = 'Ant-v2' + set_random_seed(ppo_cfg.seed) + env = make_vec_env(ppo_cfg.env_name, + ppo_cfg.num_envs, + seed=ppo_cfg.seed) + env.reset() + ob_size = env.observation_space.shape[0] + + actor_body = MLP(input_size=ob_size, + hidden_sizes=[256], + output_size=256, + hidden_act=nn.ReLU, + output_act=nn.ReLU) + actor_body = RNNBase(body_net=actor_body, + rnn_features=256, + in_features=256, + rnn_layers=1, + ) + critic_body = MLP(input_size=ob_size, + hidden_sizes=[256], + output_size=256, + hidden_act=nn.ReLU, + output_act=nn.ReLU) + critic_body = RNNBase(body_net=critic_body, + rnn_features=256, + in_features=256, + rnn_layers=1, + ) + if isinstance(env.action_space, gym.spaces.Discrete): + act_size = env.action_space.n + actor = RNNCategoricalPolicy(actor_body, action_dim=act_size) + elif isinstance(env.action_space, gym.spaces.Box): + act_size = env.action_space.shape[0] + actor = RNNDiagGaussianPolicy(actor_body, action_dim=act_size, + tanh_on_dist=ppo_cfg.tanh_on_dist, + std_cond_in=ppo_cfg.std_cond_in) + else: + raise TypeError(f'Unknown action space ' + f'type: {env.action_space}') + + critic = RNNValueNet(critic_body) + agent = PPORNNAgent(actor, critic) + runner = RNNRunner(agent=agent, env=env) + engine = PPORNNEngine(agent=agent, + runner=runner) + if not ppo_cfg.test: + engine.train() + else: + stat_info, raw_traj_info = engine.eval(render=ppo_cfg.render, + save_eval_traj=ppo_cfg.save_test_traj, + eval_num=ppo_cfg.test_num, + sleep_time=0.04) + import pprint + pprint.pprint(stat_info) + env.close() + + +if __name__ == '__main__': + main() diff --git a/examples/sac.py 
b/examples/sac.py index 7f08f31..cb3c39f 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -1,14 +1,15 @@ import gym -import torch.nn as nn import torch +import torch.nn as nn + from easyrl.agents.sac_agent import SACAgent from easyrl.configs.command_line import cfg_from_cmd from easyrl.configs.sac_config import sac_cfg from easyrl.engine.sac_engine import SACEngine -from easyrl.replays.circular_buffer import CyclicBuffer from easyrl.models.diag_gaussian_policy import DiagGaussianPolicy from easyrl.models.mlp import MLP from easyrl.models.value_net import ValueNet +from easyrl.replays.circular_buffer import CyclicBuffer from easyrl.runner.step_runner import StepRunner from easyrl.utils.common import set_random_seed @@ -61,7 +62,7 @@ def main(): memory = CyclicBuffer(capacity=sac_cfg.replay_size) agent = SACAgent(actor, q1=q1, q2=q2, env=env, memory=memory) runner = StepRunner(agent=agent, env=env, eval_env=eval_env) - + engine = SACEngine(agent=agent, runner=runner) if not sac_cfg.test: diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml index b56e808..e7bc612 100644 --- a/examples/sac_sweeper.yml +++ b/examples/sac_sweeper.yml @@ -3,30 +3,30 @@ cmd: python sac.py exclude_gpus: None # [1] gpu_memory_per_job: 1100 # unit: MB hparams: - save_dir_root: tgt_sample_data + save_dir_root: data max_steps: 3000000 env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] -# no_pretrain_actor: True -# warmup_steps: 256 -# pretrain_model: None -# freeze_q: True -# polyak: [0.99, 0.92] -# alpha: [0.] -# opt_interval: [1000] -# opt_num: [500, 1000] -# batch_size: [256, 512] -# no_q2: [True] -# no_qent: [True, False] -# no_pent: [True, False] -# no_tgt: [True, False] -# hard_update: [1000, 10000, 5000] + # no_pretrain_actor: True + # warmup_steps: 256 + # pretrain_model: None + # freeze_q: True + # polyak: [0.99, 0.92] + # alpha: [0.] 
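The sac_sweeper.yml hparams block mixes fixed scalar settings with list-valued hyperparameters (env_name, seed); list entries are presumably expanded into a Cartesian product of python sac.py runs. The sweeper script itself is not part of this patch, so the command-line flag format below is an assumption; this is only a sketch of how such an expansion typically works:

    import itertools

    def expand_sweep(cmd, hparams):
        # List values are sweep dimensions; scalars are fixed flags.
        sweep_keys = [k for k, v in hparams.items() if isinstance(v, list)]
        fixed = {k: v for k, v in hparams.items() if not isinstance(v, list)}
        jobs = []
        for combo in itertools.product(*(hparams[k] for k in sweep_keys)):
            flags = {**fixed, **dict(zip(sweep_keys, combo))}
            args = ' '.join(f'--{k} {v}' for k, v in flags.items())
            jobs.append(f'{cmd} {args}')
        return jobs

    # 3 env names x 2 seeds -> 6 jobs
    jobs = expand_sweep('python sac.py',
                        {'save_dir_root': 'data',
                         'max_steps': 3000000,
                         'env_name': ['Walker2d-v3', 'Hopper-v3', 'Humanoid-v3'],
                         'seed': [1, 0]})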
+ # opt_interval: [1000] + # opt_num: [500, 1000] + # batch_size: [256, 512] + # no_q2: [True] + # no_qent: [True, False] + # no_pent: [True, False] + # no_tgt: [True, False] + # hard_update: [1000, 10000, 5000] seed: [1, 0] -# actor_lr: [0.001, 0.0003] -# critic_lr: [0.001, 0.0003] -# alpha: [0.2, None] - # default_true: - # tgt_sample: [True, False] + # actor_lr: [0.001, 0.0003] + # critic_lr: [0.001, 0.0003] + # alpha: [0.2, None] + # default_true: + # tgt_sample: [True, False] default_false: # no_q2: [True] From 203cfff8f4c88f66230f361daaa92f5a45fe2c12 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Wed, 9 Dec 2020 18:27:37 -0500 Subject: [PATCH 19/35] fix small bug --- easyrl/configs/ppo_config.py | 4 ++-- easyrl/models/rnn_base.py | 6 ++++-- examples/rnn_ppo.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/easyrl/configs/ppo_config.py b/easyrl/configs/ppo_config.py index dce7423..c7e9719 100644 --- a/easyrl/configs/ppo_config.py +++ b/easyrl/configs/ppo_config.py @@ -9,7 +9,7 @@ class PPOConfig(BasicConfig): # will use policy_lr by default policy_lr: float = 3e-4 value_lr: float = 1e-3 - linear_decay_lr: bool = True + linear_decay_lr: bool = False max_decay_steps: int = 1e6 num_envs: int = 8 eval_num_envs: int = None @@ -20,7 +20,7 @@ class PPOConfig(BasicConfig): vf_coef: float = 0.5 ent_coef: float = 0.01 clip_range: float = 0.2 - linear_decay_clip_range: bool = True + linear_decay_clip_range: bool = False gae_lambda: float = 0.95 rew_discount: float = 0.99 use_amsgrad: bool = True diff --git a/easyrl/models/rnn_base.py b/easyrl/models/rnn_base.py index 69a1dda..055a955 100644 --- a/easyrl/models/rnn_base.py +++ b/easyrl/models/rnn_base.py @@ -30,9 +30,11 @@ def forward(self, x=None, hidden_state=None, done=None): obs_feature = obs_feature.view(b, t, *obs_feature.shape[1:]) if self.training: - done_ts = (done == 1).any(dim=0).nonzero().squeeze().cpu().numpy() + 1 + done_ts = (done == 1).any(dim=0).nonzero().squeeze(dim=-1).cpu().numpy() + 1 done_ts = done_ts.tolist() - done_ts = [0] + done_ts + [t] + done_ts = [0] + done_ts + if done_ts[-1] != t: + done_ts = done_ts + [t] rnn_features = [] for idx in range(len(done_ts) - 1): sid = done_ts[idx] diff --git a/examples/rnn_ppo.py b/examples/rnn_ppo.py index 0010ca5..8f254a9 100644 --- a/examples/rnn_ppo.py +++ b/examples/rnn_ppo.py @@ -23,12 +23,13 @@ def main(): 'test_num', 'num_envs', 'sample_action', + 'seed' ] else: skip_params = [] ppo_cfg.restore_cfg(skip_params=skip_params) if ppo_cfg.env_name is None: - ppo_cfg.env_name = 'Ant-v2' + ppo_cfg.env_name = 'Hopper-v2' set_random_seed(ppo_cfg.seed) env = make_vec_env(ppo_cfg.env_name, ppo_cfg.num_envs, From 540450549003bc747df2090a479a2746a543bb9f Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 21 Dec 2020 17:17:26 -0500 Subject: [PATCH 20/35] refactor code --- easyrl/agents/ppo_agent.py | 134 ++++++++++++++++--------------- easyrl/agents/ppo_rl2_agent.py | 10 +-- easyrl/agents/ppo_rnn_agent.py | 10 +-- easyrl/agents/sac_agent.py | 101 ++++++++++++----------- easyrl/configs/__init__.py | 31 +++++++ easyrl/configs/basic_config.py | 2 +- easyrl/configs/sac_config.py | 5 +- easyrl/engine/basic_engine.py | 26 +++++- easyrl/engine/ppo_engine.py | 66 +++++++-------- easyrl/engine/ppo_rnn_engine.py | 6 +- easyrl/engine/sac_engine.py | 81 ++++++++++--------- easyrl/envs/vec_env.py | 39 +++++++++ easyrl/runner/episodic_runner.py | 27 +++++-- easyrl/runner/step_runner.py | 97 +++++++++++----------- easyrl/utils/gym_util.py | 4 +- easyrl/utils/torch_util.py | 4 +- 
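The RNNBase.forward change in the hunks above splits each rollout at the timesteps where an episode ended and zeroes the GRU hidden state for the finished environments, so recurrent memory cannot leak across episode boundaries. A minimal sketch of the same idea, assuming a batch-first GRU, features of shape [batch, time, feat], and done of shape [batch, time] where done[b, t] == 1 means env b finished at step t (the exact indexing convention in easyrl may differ):

    import torch
    import torch.nn as nn

    def masked_gru_forward(gru, feats, done, hidden_state=None):
        # feats: [B, T, F], done: [B, T]; returns ([B, T, H], final hidden state).
        b, t, _ = feats.shape
        # Segment boundaries: one past every step at which any env finished.
        ends = (done == 1).any(dim=0).nonzero().squeeze(dim=-1).cpu().numpy() + 1
        boundaries = [0] + ends.tolist()
        if boundaries[-1] != t:
            boundaries.append(t)
        chunks = []
        for sid, eid in zip(boundaries[:-1], boundaries[1:]):
            if sid > 0 and hidden_state is not None:
                # Envs that finished at step sid - 1 start the next segment fresh.
                mask = (1.0 - done[:, sid - 1]).view(1, -1, 1)
                hidden_state = hidden_state * mask
            out, hidden_state = gru(feats[:, sid:eid], hidden_state)
            chunks.append(out)
        return torch.cat(chunks, dim=1), hidden_state

    gru = nn.GRU(input_size=8, hidden_size=16, num_layers=1, batch_first=True)
    feats = torch.randn(4, 10, 8)
    done = torch.zeros(4, 10)
    done[1, 3] = 1.0  # env 1 ends an episode at step 3
    out, h = masked_gru_forward(gru, feats, done)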
examples/ppo.py | 39 ++++----- examples/sac.py | 47 ++++++----- 18 files changed, 429 insertions(+), 300 deletions(-) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index d053d07..7dc225d 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from functools import partial import numpy as np @@ -7,7 +8,7 @@ from torch.optim.lr_scheduler import LambdaLR from easyrl.agents.base_agent import BaseAgent -from easyrl.configs.ppo_config import ppo_cfg +from easyrl.configs import cfg from easyrl.utils.common import linear_decay_percent from easyrl.utils.rl_logger import logger from easyrl.utils.torch_util import action_entropy @@ -22,52 +23,55 @@ from easyrl.utils.torch_util import torch_to_np +@dataclass class PPOAgent(BaseAgent): - def __init__(self, actor, critic, same_body=False): - self.actor = actor - self.critic = critic - move_to([self.actor, self.critic], - device=ppo_cfg.device) + actor: nn.Module + critic: nn.Module + same_body: float = False - self.same_body = same_body - if ppo_cfg.vf_loss_type == 'mse': - self.val_loss_criterion = nn.MSELoss().to(ppo_cfg.device) - elif ppo_cfg.vf_loss_type == 'smoothl1': - self.val_loss_criterion = nn.SmoothL1Loss().to(ppo_cfg.device) + def __post_init__(self): + move_to([self.actor, self.critic], + device=cfg.alg.device) + if cfg.alg.vf_loss_type == 'mse': + self.val_loss_criterion = nn.MSELoss().to(cfg.alg.device) + elif cfg.alg.vf_loss_type == 'smoothl1': + self.val_loss_criterion = nn.SmoothL1Loss().to(cfg.alg.device) else: - raise TypeError(f'Unknown value loss type: {ppo_cfg.vf_loss_type}!') + raise TypeError(f'Unknown value loss type: {cfg.alg.vf_loss_type}!') all_params = list(self.actor.parameters()) + list(self.critic.parameters()) # keep unique elements only. 
The following code works for python >=3.7 # for earlier version of python, u need to use OrderedDict self.all_params = dict.fromkeys(all_params).keys() - if (ppo_cfg.linear_decay_lr or ppo_cfg.linear_decay_clip_range) and \ - ppo_cfg.max_steps > ppo_cfg.max_decay_steps: - raise ValueError('max_steps should be no greater than max_decay_steps.') - total_epochs = int(np.ceil(ppo_cfg.max_decay_steps / (ppo_cfg.num_envs * - ppo_cfg.episode_steps))) - if ppo_cfg.linear_decay_clip_range: - self.clip_range_decay_rate = ppo_cfg.clip_range / float(total_epochs) + if (cfg.alg.linear_decay_lr or cfg.alg.linear_decay_clip_range) and \ + cfg.alg.max_steps > cfg.alg.max_decay_steps: + logger.warning('max_steps should not be greater than max_decay_steps.') + cfg.alg.max_decay_steps = int(cfg.alg.max_steps * 1.5) + logger.warning(f'Resetting max_decay_steps to {cfg.alg.max_decay_steps}!') + total_epochs = int(np.ceil(cfg.alg.max_decay_steps / (cfg.alg.num_envs * + cfg.alg.episode_steps))) + if cfg.alg.linear_decay_clip_range: + self.clip_range_decay_rate = cfg.alg.clip_range / float(total_epochs) p_lr_lambda = partial(linear_decay_percent, total_epochs=total_epochs) optim_args = dict( - lr=ppo_cfg.policy_lr, - weight_decay=ppo_cfg.weight_decay + lr=cfg.alg.policy_lr, + weight_decay=cfg.alg.weight_decay ) - if not ppo_cfg.sgd: - optim_args['amsgrad'] = ppo_cfg.use_amsgrad + if not cfg.alg.sgd: + optim_args['amsgrad'] = cfg.alg.use_amsgrad optim_func = optim.Adam else: - optim_args['nesterov'] = True if ppo_cfg.momentum > 0 else False - optim_args['momentum'] = ppo_cfg.momentum + optim_args['nesterov'] = True if cfg.alg.momentum > 0 else False + optim_args['momentum'] = cfg.alg.momentum optim_func = optim.SGD if self.same_body: optim_args['params'] = self.all_params else: optim_args['params'] = [{'params': self.actor.parameters(), - 'lr': ppo_cfg.policy_lr}, + 'lr': cfg.alg.policy_lr}, {'params': self.critic.parameters(), - 'lr': ppo_cfg.value_lr}] + 'lr': cfg.alg.value_lr}] self.optimizer = optim_func(**optim_args) @@ -83,7 +87,7 @@ def __init__(self, actor, critic, same_body=False): @torch.no_grad() def get_action(self, ob, sample=True, *args, **kwargs): self.eval_mode() - t_ob = torch_float(ob, device=ppo_cfg.device) + t_ob = torch_float(ob, device=cfg.alg.device) act_dist, val = self.get_act_val(t_ob) action = action_from_dist(act_dist, sample=sample) @@ -97,7 +101,7 @@ def get_action(self, ob, sample=True, *args, **kwargs): return torch_to_np(action), action_info def get_act_val(self, ob, *args, **kwargs): - ob = torch_float(ob, device=ppo_cfg.device) + ob = torch_float(ob, device=cfg.alg.device) act_dist, body_out = self.actor(ob) if self.same_body: val, body_out = self.critic(body_x=body_out) @@ -109,37 +113,31 @@ def get_act_val(self, ob, *args, **kwargs): @torch.no_grad() def get_val(self, ob, *args, **kwargs): self.eval_mode() - ob = torch_float(ob, device=ppo_cfg.device) + ob = torch_float(ob, device=cfg.alg.device) val, body_out = self.critic(x=ob) val = val.squeeze(-1) return val def optimize(self, data, *args, **kwargs): - self.train_mode() pre_res = self.optim_preprocess(data) - val, old_val, ret, log_prob, old_log_prob, adv, entropy = pre_res - entropy = torch.mean(entropy) - loss_res = self.cal_loss(val=val, - old_val=old_val, - ret=ret, - log_prob=log_prob, - old_log_prob=old_log_prob, - adv=adv, - entropy=entropy) + processed_data = pre_res + processed_data['entropy'] = torch.mean(processed_data['entropy']) + loss_res = self.cal_loss(**processed_data) loss, pg_loss, vf_loss, ratio = 
loss_res self.optimizer.zero_grad() loss.backward() - grad_norm = clip_grad(self.all_params, ppo_cfg.max_grad_norm) + grad_norm = clip_grad(self.all_params, cfg.alg.max_grad_norm) self.optimizer.step() with torch.no_grad(): - approx_kl = 0.5 * torch.mean(torch.pow(old_log_prob - log_prob, 2)) - clip_frac = np.mean(np.abs(torch_to_np(ratio) - 1.0) > ppo_cfg.clip_range) + approx_kl = 0.5 * torch.mean(torch.pow(processed_data['old_log_prob'] - + processed_data['log_prob'], 2)) + clip_frac = np.mean(np.abs(torch_to_np(ratio) - 1.0) > cfg.alg.clip_range) optim_info = dict( pg_loss=pg_loss.item(), vf_loss=vf_loss.item(), total_loss=loss.item(), - entropy=entropy.item(), + entropy=processed_data['entropy'].item(), approx_kl=approx_kl.item(), clip_frac=clip_frac ) @@ -147,8 +145,9 @@ def optimize(self, data, *args, **kwargs): return optim_info def optim_preprocess(self, data): + self.train_mode() for key, val in data.items(): - data[key] = torch_float(val, device=ppo_cfg.device) + data[key] = torch_float(val, device=cfg.alg.device) ob = data['ob'] action = data['action'] ret = data['ret'] @@ -161,27 +160,35 @@ def optim_preprocess(self, data): entropy = action_entropy(act_dist, log_prob) if not all([x.ndim == 1 for x in [val, entropy, log_prob]]): raise ValueError('val, entropy, log_prob should be 1-dim!') - return val, old_val, ret, log_prob, old_log_prob, adv, entropy + processed_data = dict( + val=val, + old_val=old_val, + ret=ret, + log_prob=log_prob, + old_log_prob=old_log_prob, + adv=adv, + entropy=entropy + ) + return processed_data def cal_loss(self, val, old_val, ret, log_prob, old_log_prob, adv, entropy): vf_loss = self.cal_val_loss(val=val, old_val=old_val, ret=ret) ratio = torch.exp(log_prob - old_log_prob) surr1 = adv * ratio surr2 = adv * torch.clamp(ratio, - 1 - ppo_cfg.clip_range, - 1 + ppo_cfg.clip_range) + 1 - cfg.alg.clip_range, + 1 + cfg.alg.clip_range) pg_loss = -torch.mean(torch.min(surr1, surr2)) - ent_coef = ppo_cfg.ent_coef - loss = pg_loss - entropy * ent_coef + \ - vf_loss * ppo_cfg.vf_coef + loss = pg_loss - entropy * cfg.alg.ent_coef + \ + vf_loss * cfg.alg.vf_coef return loss, pg_loss, vf_loss, ratio def cal_val_loss(self, val, old_val, ret): - if ppo_cfg.clip_vf_loss: + if cfg.alg.clip_vf_loss: clipped_val = old_val + torch.clamp(val - old_val, - -ppo_cfg.clip_range, - ppo_cfg.clip_range) + -cfg.alg.clip_range, + cfg.alg.clip_range) vf_loss1 = torch.pow(val - ret, 2) vf_loss2 = torch.pow(clipped_val - ret, 2) vf_loss = 0.5 * torch.mean(torch.max(vf_loss1, @@ -203,17 +210,14 @@ def decay_lr(self): self.lr_scheduler.step() def get_lr(self): - try: - cur_lr = self.lr_scheduler.get_last_lr() - except AttributeError: - cur_lr = self.lr_scheduler.get_lr() + cur_lr = self.lr_scheduler.get_lr() lrs = {'policy_lr': cur_lr[0]} if len(cur_lr) > 1: lrs['value_lr'] = cur_lr[1] return lrs def decay_clip_range(self): - ppo_cfg.clip_range -= self.clip_range_decay_rate + cfg.alg.clip_range -= self.clip_range_decay_rate def save_model(self, is_best=False, step=None): data_to_save = { @@ -224,13 +228,13 @@ def save_model(self, is_best=False, step=None): 'lr_scheduler_state_dict': self.lr_scheduler.state_dict() } - if ppo_cfg.linear_decay_clip_range: - data_to_save['clip_range'] = ppo_cfg.clip_range + if cfg.alg.linear_decay_clip_range: + data_to_save['clip_range'] = cfg.alg.clip_range data_to_save['clip_range_decay_rate'] = self.clip_range_decay_rate - save_model(data_to_save, ppo_cfg, is_best=is_best, step=step) + save_model(data_to_save, cfg.alg, is_best=is_best, step=step) def 
load_model(self, step=None, pretrain_model=None): - ckpt_data = load_ckpt_data(ppo_cfg, step=step, + ckpt_data = load_ckpt_data(cfg.alg, step=step, pretrain_model=pretrain_model) load_state_dict(self.actor, ckpt_data['actor_state_dict']) @@ -240,9 +244,9 @@ def load_model(self, step=None, pretrain_model=None): return self.optimizer.load_state_dict(ckpt_data['optim_state_dict']) self.lr_scheduler.load_state_dict(ckpt_data['lr_scheduler_state_dict']) - if ppo_cfg.linear_decay_clip_range: + if cfg.alg.linear_decay_clip_range: self.clip_range_decay_rate = ckpt_data['clip_range_decay_rate'] - ppo_cfg.clip_range = ckpt_data['clip_range'] + cfg.alg.clip_range = ckpt_data['clip_range'] return ckpt_data['step'] def print_param_grad_status(self): diff --git a/easyrl/agents/ppo_rl2_agent.py b/easyrl/agents/ppo_rl2_agent.py index c3fd150..66c514d 100644 --- a/easyrl/agents/ppo_rl2_agent.py +++ b/easyrl/agents/ppo_rl2_agent.py @@ -1,7 +1,7 @@ import torch from easyrl.agents.ppo_agent import PPOAgent -from easyrl.configs.ppo_config import ppo_cfg +from easyrl.configs import cfg from easyrl.utils.torch_util import action_entropy from easyrl.utils.torch_util import action_from_dist from easyrl.utils.torch_util import action_log_prob @@ -18,7 +18,7 @@ def __init__(self, actor, critic, same_body=False): @torch.no_grad() def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): self.eval_mode() - t_ob = torch.from_numpy(ob).float().to(ppo_cfg.device).unsqueeze(dim=1) + t_ob = torch.from_numpy(ob).float().to(cfg.alg.device).unsqueeze(dim=1) act_dist, val, out_hidden_state = self.get_act_val(t_ob, hidden_state=hidden_state) action = action_from_dist(act_dist, @@ -33,7 +33,7 @@ def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): return torch_to_np(action.squeeze(1)), action_info, out_hidden_state def get_act_val(self, ob, hidden_state=None, *args, **kwargs): - ob = torch_float(ob, device=ppo_cfg.device) + ob = torch_float(ob, device=cfg.alg.device) act_dist, body_out, out_hidden_state = self.actor(ob, hidden_state=hidden_state) if self.same_body: val, body_out, out_hidden_state = self.critic(body_x=body_out, hidden_state=hidden_state) @@ -45,7 +45,7 @@ def get_act_val(self, ob, hidden_state=None, *args, **kwargs): @torch.no_grad() def get_val(self, ob, hidden_state=None, *args, **kwargs): self.eval_mode() - ob = torch_float(ob, device=ppo_cfg.device).unsqueeze(dim=1) + ob = torch_float(ob, device=cfg.alg.device).unsqueeze(dim=1) val, body_out, out_hidden_state = self.critic(x=ob, hidden_state=hidden_state) val = val.squeeze(-1) @@ -53,7 +53,7 @@ def get_val(self, ob, hidden_state=None, *args, **kwargs): def optim_preprocess(self, data): for key, val in data.items(): - data[key] = torch_float(val, device=ppo_cfg.device) + data[key] = torch_float(val, device=cfg.alg.device) ob = data['ob'] action = data['action'] ret = data['ret'] diff --git a/easyrl/agents/ppo_rnn_agent.py b/easyrl/agents/ppo_rnn_agent.py index 2f1c5ed..4f70610 100644 --- a/easyrl/agents/ppo_rnn_agent.py +++ b/easyrl/agents/ppo_rnn_agent.py @@ -2,7 +2,7 @@ import torch from easyrl.agents.ppo_agent import PPOAgent -from easyrl.configs.ppo_config import ppo_cfg +from easyrl.configs import cfg from easyrl.utils.torch_util import action_entropy from easyrl.utils.torch_util import action_from_dist from easyrl.utils.torch_util import action_log_prob @@ -21,7 +21,7 @@ def get_action(self, ob, sample=True, hidden_state=None, prev_done=False, *args, self.eval_mode() hidden_state = 
self.check_hidden_state(hidden_state, prev_done) - t_ob = torch.from_numpy(ob).float().to(ppo_cfg.device).unsqueeze(dim=1) + t_ob = torch.from_numpy(ob).float().to(cfg.alg.device).unsqueeze(dim=1) act_dist, val, out_hidden_state = self.get_act_val(t_ob, hidden_state=hidden_state) action = action_from_dist(act_dist, @@ -36,7 +36,7 @@ def get_action(self, ob, sample=True, hidden_state=None, prev_done=False, *args, return torch_to_np(action.squeeze(1)), action_info, out_hidden_state def get_act_val(self, ob, hidden_state=None, done=None, *args, **kwargs): - ob = torch_float(ob, device=ppo_cfg.device) + ob = torch_float(ob, device=cfg.alg.device) act_dist, body_out, out_hidden_state = self.actor(ob, hidden_state=hidden_state, done=done) @@ -56,7 +56,7 @@ def get_val(self, ob, hidden_state=None, prev_done=False, *args, **kwargs): self.eval_mode() hidden_state = self.check_hidden_state(hidden_state, prev_done) - ob = torch_float(ob, device=ppo_cfg.device).unsqueeze(dim=1) + ob = torch_float(ob, device=cfg.alg.device).unsqueeze(dim=1) val, body_out, out_hidden_state = self.critic(x=ob, hidden_state=hidden_state) val = val.squeeze(-1) @@ -64,7 +64,7 @@ def get_val(self, ob, hidden_state=None, prev_done=False, *args, **kwargs): def optim_preprocess(self, data): for key, val in data.items(): - data[key] = torch_float(val, device=ppo_cfg.device) + data[key] = torch_float(val, device=cfg.alg.device) ob = data['ob'] action = data['action'] ret = data['ret'] diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index ccdb67f..8b8f8b5 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -7,7 +7,7 @@ import torch.optim as optim from easyrl.agents.base_agent import BaseAgent -from easyrl.configs.sac_config import sac_cfg +from easyrl.configs import cfg from easyrl.utils.common import load_from_pickle from easyrl.utils.common import save_to_pickle from easyrl.utils.gym_util import num_space_dim @@ -24,14 +24,16 @@ from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np from easyrl.utils.torch_util import unfreeze_model - +import gym +from typing import Any class SACAgent(BaseAgent): - def __init__(self, actor, q1, q2, env, memory): - self.actor = actor - self.q1 = q1 - self.q2 = q2 - self.memory = memory + actor: nn.Module + env: gym.Env + memory: Any + q1: nn.Module = None + q2: nn.Module = None + def __post_init__(self): self.q1_tgt = deepcopy(self.q1) self.q2_tgt = deepcopy(self.q2) freeze_model(self.q1_tgt) @@ -40,12 +42,12 @@ def __init__(self, actor, q1, q2, env, memory): self.q2_tgt.eval() move_to([self.actor, self.q1, self.q2, self.q1_tgt, self.q2_tgt], - device=sac_cfg.device) - self.mem_file = sac_cfg.model_dir.joinpath('mem.pkl') + device=cfg.alg.device) + self.mem_file = cfg.alg.model_dir.joinpath('mem.pkl') optim_args = dict( - lr=sac_cfg.actor_lr, - weight_decay=sac_cfg.weight_decay, - amsgrad=sac_cfg.use_amsgrad + lr=cfg.alg.actor_lr, + weight_decay=cfg.alg.weight_decay, + amsgrad=cfg.alg.use_amsgrad ) self.pi_optimizer = optim.Adam(self.actor.parameters(), @@ -53,30 +55,30 @@ def __init__(self, actor, q1, q2, env, memory): q_params = list(self.q1.parameters()) + list(self.q2.parameters()) # keep unique elements only. 
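The frozen q1_tgt and q2_tgt copies created above are only nudged toward the live critics by the polyak-averaged soft_update call in optimize() further down. That helper is not shown in this patch; the standard polyak update it presumably implements is sketched here:

    import torch

    @torch.no_grad()
    def soft_update_sketch(tgt_net, src_net, polyak):
        # tgt <- polyak * tgt + (1 - polyak) * src, parameter by parameter.
        for tgt_p, src_p in zip(tgt_net.parameters(), src_net.parameters()):
            tgt_p.data.mul_(polyak)
            tgt_p.data.add_((1.0 - polyak) * src_p.data)

    # Called after every optimization step, e.g. with cfg.alg.polyak = 0.995:
    #     soft_update_sketch(agent.q1_tgt, agent.q1, 0.995)
    #     soft_update_sketch(agent.q2_tgt, agent.q2, 0.995)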
self.q_params = dict.fromkeys(q_params).keys() - optim_args['lr'] = sac_cfg.critic_lr + optim_args['lr'] = cfg.alg.critic_lr self.q_optimizer = optim.Adam(self.q_params, **optim_args) - if sac_cfg.alpha is None: - if sac_cfg.tgt_entropy is None: - self.tgt_entropy = -float(num_space_dim(env.action_space)) + if cfg.alg.alpha is None: + if cfg.alg.tgt_entropy is None: + self.tgt_entropy = -float(num_space_dim(self.env.action_space)) else: - self.tgt_entropy = sac_cfg.tgt_entropy - self.log_alpha = nn.Parameter(torch.zeros(1, device=sac_cfg.device)) + self.tgt_entropy = cfg.alg.tgt_entropy + self.log_alpha = nn.Parameter(torch.zeros(1, device=cfg.alg.device)) self.alpha_optimizer = optim.Adam( [self.log_alpha], - lr=sac_cfg.actor_lr, + lr=cfg.alg.actor_lr, ) @property def alpha(self): - if sac_cfg.alpha is None: + if cfg.alg.alpha is None: return self.log_alpha.exp().item() else: - return sac_cfg.alpha + return cfg.alg.alpha @torch.no_grad() def get_action(self, ob, sample=True, *args, **kwargs): self.eval_mode() - ob = torch_float(ob, device=sac_cfg.device) + ob = torch_float(ob, device=cfg.alg.device) act_dist = self.actor(ob)[0] action = action_from_dist(act_dist, sample=sample) @@ -84,11 +86,11 @@ def get_action(self, ob, sample=True, *args, **kwargs): return torch_to_np(action), action_info @torch.no_grad() - def get_val(self, ob, action, tgt=False, q1=True, *args, **kwargs): + def get_val(self, ob, action, tgt=False, first=True, *args, **kwargs): self.eval_mode() - ob = torch_float(ob, device=sac_cfg.device) - action = torch_float(action, device=sac_cfg.device) - idx = 1 if q1 else 2 + ob = torch_float(ob, device=cfg.alg.device) + action = torch_float(action, device=cfg.alg.device) + idx = 1 if first else 2 tgt_suffix = '_tgt' if tgt else '' q_func = getattr(self, f'q{idx}{tgt_suffix}') val = q_func((ob, action))[0] @@ -98,26 +100,26 @@ def get_val(self, ob, action, tgt=False, q1=True, *args, **kwargs): def optimize(self, data, *args, **kwargs): self.train_mode() for key, val in data.items(): - data[key] = torch_float(val, device=sac_cfg.device) - obs = data['obs'].squeeze(1) - actions = data['actions'].squeeze(1) - next_obs = data['next_obs'].squeeze(1) - rewards = data['rewards'] - dones = data['dones'] + data[key] = torch_float(val, device=cfg.alg.device) + obs = data['obs'] + actions = data['actions'] + next_obs = data['next_obs'] + rewards = data['rewards'].unsqueeze(-1) + dones = data['dones'].unsqueeze(-1) q_info = self.update_q(obs=obs, actions=actions, next_obs=next_obs, rewards=rewards, dones=dones) pi_info = self.update_pi(obs=obs) - alpha_info = self.update_alpha(pi_info['pi_entropy']) + alpha_info = self.update_alpha(pi_info['pi_neg_log_prob']) optim_info = {**q_info, **pi_info, **alpha_info} optim_info['alpha'] = self.alpha if hasattr(self, 'log_alpha'): optim_info['log_alpha'] = self.log_alpha.item() - soft_update(self.q1_tgt, self.q1, sac_cfg.polyak) - soft_update(self.q2_tgt, self.q2, sac_cfg.polyak) + soft_update(self.q1_tgt, self.q1, cfg.alg.polyak) + soft_update(self.q2_tgt, self.q2, cfg.alg.polyak) return optim_info def update_q(self, obs, actions, next_obs, rewards, dones): @@ -131,19 +133,20 @@ def update_q(self, obs, actions, next_obs, rewards, dones): nq1_tgt_val = self.q1_tgt((next_obs, next_actions))[0] nq2_tgt_val = self.q2_tgt((next_obs, next_actions))[0] nq_tgt_val = torch.min(nq1_tgt_val, nq2_tgt_val) - self.alpha * nlog_prob - q_tgt_val = rewards + sac_cfg.rew_discount * (1 - dones) * nq_tgt_val + q_tgt_val = rewards + cfg.alg.rew_discount * (1 - dones) * 
nq_tgt_val loss_q1 = F.mse_loss(q1, q_tgt_val) loss_q2 = F.mse_loss(q2, q_tgt_val) loss_q = loss_q1 + loss_q2 self.q_optimizer.zero_grad() loss_q.backward() - grad_norm = clip_grad(self.q_params, sac_cfg.max_grad_norm) + grad_norm = clip_grad(self.q_params, cfg.alg.max_grad_norm) self.q_optimizer.step() q_info = dict( q1_loss=loss_q1.item(), q2_loss=loss_q2.item(), - q1_val=torch_to_np(q1), - q2_val=torch_to_np(q2) + vec_q1_val=torch_to_np(q1), + vec_q2_val=torch_to_np(q2), + vec_q_tgt_val=torch_to_np(q_tgt_val), ) q_info['q_grad_norm'] = grad_norm return q_info @@ -162,20 +165,20 @@ def update_pi(self, obs): self.q_optimizer.zero_grad() self.pi_optimizer.zero_grad() loss_pi.backward() - grad_norm = clip_grad(self.actor.parameters(), sac_cfg.max_grad_norm) + grad_norm = clip_grad(self.actor.parameters(), cfg.alg.max_grad_norm) self.pi_optimizer.step() pi_info = dict( pi_loss=loss_pi.item(), - pi_entropy=-new_log_prob.mean().item() + pi_neg_log_prob=-new_log_prob.mean().item() ) pi_info['pi_grad_norm'] = grad_norm unfreeze_model([self.q1, self.q2]) return pi_info - def update_alpha(self, pi_entropy): - if sac_cfg.alpha is not None: + def update_alpha(self, pi_neg_log_prob): + if cfg.alg.alpha is not None: return dict() - alpha_loss = self.log_alpha.exp() * (pi_entropy - self.tgt_entropy) + alpha_loss = self.log_alpha.exp() * (pi_neg_log_prob - self.tgt_entropy) self.alpha_optimizer.zero_grad() alpha_loss.backward() self.alpha_optimizer.step() @@ -205,16 +208,16 @@ def save_model(self, is_best=False, step=None): 'pi_optim_state_dict': self.pi_optimizer.state_dict(), 'q_optim_state_dict': self.q_optimizer.state_dict(), } - if sac_cfg.alpha is None: + if cfg.alg.alpha is None: data_to_save['log_alpha'] = self.log_alpha data_to_save['alpha_optim_state_dict'] = self.alpha_optimizer.state_dict() - save_model(data_to_save, sac_cfg, is_best=is_best, step=step) + save_model(data_to_save, cfg.alg, is_best=is_best, step=step) logger.info(f'Saving the replay buffer to: {self.mem_file}.') save_to_pickle(self.memory, self.mem_file) logger.info('The replay buffer is saved.') def load_model(self, step=None, pretrain_model=None): - ckpt_data = load_ckpt_data(sac_cfg, step=step, + ckpt_data = load_ckpt_data(cfg.alg, step=step, pretrain_model=pretrain_model) load_state_dict(self.actor, ckpt_data['actor_state_dict']) @@ -226,13 +229,13 @@ def load_model(self, step=None, pretrain_model=None): ckpt_data['q2_state_dict']) load_state_dict(self.q2_tgt, ckpt_data['q2_tgt_state_dict']) - if sac_cfg.alpha is None: + if cfg.alg.alpha is None: self.log_alpha = ckpt_data['log_alpha'] if pretrain_model is not None: return self.pi_optimizer.load_state_dict(ckpt_data['pi_optim_state_dict']) self.q_optimizer.load_state_dict(ckpt_data['q_optim_state_dict']) - if sac_cfg.alpha is None: + if cfg.alg.alpha is None: self.alpha_optimizer.load_state_dict(ckpt_data['alpha_optim_state_dict']) logger.info(f'Loading the replay buffer from: {self.mem_file}.') diff --git a/easyrl/configs/__init__.py b/easyrl/configs/__init__.py index e69de29..a392972 100644 --- a/easyrl/configs/__init__.py +++ b/easyrl/configs/__init__.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass +from typing import Any + +from easyrl.configs.ppo_config import PPOConfig +from easyrl.configs.sac_config import SACConfig +from easyrl.utils.rl_logger import logger + + +@dataclass +class CFG: + alg: Any = None + + +cfg = CFG() + + +def set_config(alg): + global cfg + if alg == 'ppo': + cfg.alg = PPOConfig() + elif alg == 'sac': + cfg.alg = SACConfig() + elif alg == 
'sac_adv': + cfg.alg = SACAdvConfig() + elif alg == 'redq': + cfg.alg = REQDConfig() + elif alg == 'offppo': + cfg.alg = OffPPOConfig() + else: + raise ValueError(f'Unimplemented algorithm: {alg}') + logger.info(f'Alogrithm type:{type(cfg.alg)}') diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index c6a7550..2423c07 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -19,7 +19,7 @@ class BasicConfig: log_interval: int = 10 weight_decay: float = 0.00 max_grad_norm: float = None - batch_size: int = 128 + batch_size: int = 256 save_best_only: bool = False episode_steps: int = 1000 max_steps: int = 1e6 diff --git a/easyrl/configs/sac_config.py b/easyrl/configs/sac_config.py index 377c23f..110eb05 100644 --- a/easyrl/configs/sac_config.py +++ b/easyrl/configs/sac_config.py @@ -10,12 +10,15 @@ class SACConfig(BasicConfig): warmup_steps: int = 10000 use_amsgrad: bool = True opt_interval: int = 50 # perform optimization every n environment steps - opt_num: int = 50 # how many optimization loops in every optimization stage + opt_num: int = 25 # how many optimization loops in every optimization stage + # Increase this number if num_envs > 1 so that the data is updated more often + # as the data collection is also faster alpha: float = None rew_discount: float = 0.99 replay_size: int = 1000000 polyak: float = 0.995 tgt_entropy: float = None + num_envs: int = 1 def __post_init__(self): self.eval_interval = 300 diff --git a/easyrl/engine/basic_engine.py b/easyrl/engine/basic_engine.py index 6c1c45b..e2119a9 100644 --- a/easyrl/engine/basic_engine.py +++ b/easyrl/engine/basic_engine.py @@ -1,13 +1,33 @@ +from collections import deque +from dataclasses import dataclass +from typing import Any + import numpy as np +from easyrl.configs import cfg +from easyrl.utils.rl_logger import TensorboardLogger +@dataclass class BasicEngine: - def __init__(self, agent, runner, **kwargs): - self.agent = agent - self.runner = runner + agent: Any + runner: Any + + def __post_init__(self): self.cur_step = 0 self._best_eval_ret = -np.inf self._eval_is_best = False + if cfg.alg.test or cfg.alg.resume: + self.cur_step = self.agent.load_model(step=cfg.alg.resume_step) + else: + if cfg.alg.pretrain_model is not None: + self.agent.load_model(pretrain_model=cfg.alg.pretrain_model) + cfg.alg.create_model_log_dir() + self.train_ep_return = deque(maxlen=100) + self.smooth_eval_return = None + self.smooth_tau = cfg.alg.smooth_eval_tau + self.optim_stime = None + if not cfg.alg.test: + self.tf_logger = TensorboardLogger(log_dir=cfg.alg.log_dir) def train(self, **kwargs): raise NotImplementedError diff --git a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index ae07d37..5e26797 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -8,7 +8,7 @@ from torch.utils.data import DataLoader from tqdm import tqdm -from easyrl.configs.ppo_config import ppo_cfg +from easyrl.configs import cfg from easyrl.engine.basic_engine import BasicEngine from easyrl.utils.common import get_list_stats from easyrl.utils.common import save_traj @@ -18,63 +18,55 @@ class PPOEngine(BasicEngine): - def __init__(self, agent, runner): - super().__init__(agent=agent, - runner=runner) - if ppo_cfg.test or ppo_cfg.resume: - self.cur_step = self.agent.load_model(step=ppo_cfg.resume_step) - else: - if ppo_cfg.pretrain_model is not None: - self.agent.load_model(pretrain_model=ppo_cfg.pretrain_model) - ppo_cfg.create_model_log_dir() - self.train_ep_return = deque(maxlen=100) - 
self.smooth_eval_return = None - self.smooth_tau = ppo_cfg.smooth_eval_tau - self.optim_stime = None - if not ppo_cfg.test: - self.tf_logger = TensorboardLogger(log_dir=ppo_cfg.log_dir) - def train(self): for iter_t in count(): - if iter_t % ppo_cfg.eval_interval == 0: - eval_log_info, _ = self.eval() + if iter_t % cfg.alg.eval_interval == 0: + det_log_info, _ = self.eval(eval_num=cfg.alg.test_num, + sample=False, smooth=True) + sto_log_info, _ = self.eval(eval_num=cfg.alg.test_num, + sample=True, smooth=False) + + det_log_info = {f'det/{k}': v for k, v in det_log_info.items()} + sto_log_info = {f'sto/{k}': v for k, v in sto_log_info.items()} + eval_log_info = {**det_log_info, **sto_log_info} self.agent.save_model(is_best=self._eval_is_best, step=self.cur_step) else: eval_log_info = None traj, rollout_time = self.rollout_once(sample=True, - time_steps=ppo_cfg.episode_steps) + time_steps=cfg.alg.episode_steps) train_log_info = self.train_once(traj) - if iter_t % ppo_cfg.log_interval == 0: + if iter_t % cfg.alg.log_interval == 0: train_log_info['train/rollout_time'] = rollout_time if eval_log_info is not None: train_log_info.update(eval_log_info) - if ppo_cfg.linear_decay_lr: + if cfg.alg.linear_decay_lr: train_log_info.update(self.agent.get_lr()) - if ppo_cfg.linear_decay_clip_range: - train_log_info.update(dict(clip_range=ppo_cfg.clip_range)) + if cfg.alg.linear_decay_clip_range: + train_log_info.update(dict(clip_range=cfg.alg.clip_range)) scalar_log = {'scalar': train_log_info} self.tf_logger.save_dict(scalar_log, step=self.cur_step) - if self.cur_step > ppo_cfg.max_steps: + if self.cur_step > cfg.alg.max_steps: break - if ppo_cfg.linear_decay_lr: + if cfg.alg.linear_decay_lr: self.agent.decay_lr() - if ppo_cfg.linear_decay_clip_range: + if cfg.alg.linear_decay_clip_range: self.agent.decay_clip_range() @torch.no_grad() - def eval(self, render=False, save_eval_traj=False, eval_num=1, sleep_time=0, smooth=True, no_tqdm=None): + def eval(self, render=False, save_eval_traj=False, eval_num=1, + sleep_time=0, sample=True, smooth=True, no_tqdm=None): time_steps = [] rets = [] lst_step_infos = [] if no_tqdm: disable_tqdm = bool(no_tqdm) else: - disable_tqdm = not ppo_cfg.test + disable_tqdm = not cfg.alg.test for idx in tqdm(range(eval_num), disable=disable_tqdm): - traj, _ = self.rollout_once(time_steps=ppo_cfg.episode_steps, + traj, _ = self.rollout_once(time_steps=cfg.alg.episode_steps, return_on_done=True, - sample=ppo_cfg.sample_action, + sample=cfg.alg.sample_action and sample, render=render, sleep_time=sleep_time, render_image=save_eval_traj, @@ -88,7 +80,7 @@ def eval(self, render=False, save_eval_traj=False, eval_num=1, sleep_time=0, smo lst_step_infos.append(infos[tsps[ej] - 1][ej]) time_steps.extend(tsps) if save_eval_traj: - save_traj(traj, ppo_cfg.eval_dir) + save_traj(traj, cfg.alg.eval_dir) raw_traj_info = {'return': rets, 'episode_length': time_steps, @@ -127,7 +119,7 @@ def train_once(self, traj): self.cur_step += traj.total_steps rollout_dataloader = self.traj_preprocess(traj) optim_infos = [] - for oe in range(ppo_cfg.opt_epochs): + for oe in range(cfg.alg.opt_epochs): for batch_ndx, batch_data in enumerate(rollout_dataloader): optim_info = self.agent.optimize(batch_data) optim_infos.append(optim_info) @@ -139,7 +131,7 @@ def traj_preprocess(self, traj): log_prob = np.array([ainfo['log_prob'] for ainfo in action_infos]) adv = self.cal_advantages(traj) ret = adv + vals - if ppo_cfg.normalize_adv: + if cfg.alg.normalize_adv: adv = adv.astype(np.float64) adv = (adv - np.mean(adv)) 
/ (np.std(adv) + 1e-8) data = dict( @@ -152,7 +144,7 @@ def traj_preprocess(self, traj): ) rollout_dataset = EpisodeDataset(**data) rollout_dataloader = DataLoader(rollout_dataset, - batch_size=ppo_cfg.batch_size, + batch_size=cfg.alg.batch_size, shuffle=True) return rollout_dataloader @@ -161,8 +153,8 @@ def cal_advantages(self, traj): action_infos = traj.action_infos vals = np.array([ainfo['val'] for ainfo in action_infos]) last_val = traj.extra_data['last_val'] - adv = cal_gae(gamma=ppo_cfg.rew_discount, - lam=ppo_cfg.gae_lambda, + adv = cal_gae(gamma=cfg.alg.rew_discount, + lam=cfg.alg.gae_lambda, rewards=rewards, value_estimates=vals, last_value=last_val, diff --git a/easyrl/engine/ppo_rnn_engine.py b/easyrl/engine/ppo_rnn_engine.py index fcbd22d..d7194fd 100644 --- a/easyrl/engine/ppo_rnn_engine.py +++ b/easyrl/engine/ppo_rnn_engine.py @@ -3,7 +3,7 @@ import numpy as np from torch.utils.data import DataLoader -from easyrl.configs.ppo_config import ppo_cfg +from easyrl.configs import cfg from easyrl.engine.ppo_engine import PPOEngine from easyrl.utils.torch_util import DictDataset @@ -19,7 +19,7 @@ def traj_preprocess(self, traj): log_prob = np.array([ainfo['log_prob'] for ainfo in action_infos]) adv = self.cal_advantages(traj) ret = adv + vals - if ppo_cfg.normalize_adv: + if cfg.alg.normalize_adv: adv = adv.astype(np.float64) adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8) # TxN --> NxT @@ -34,6 +34,6 @@ def traj_preprocess(self, traj): ) rollout_dataset = DictDataset(**data) rollout_dataloader = DataLoader(rollout_dataset, - batch_size=ppo_cfg.batch_size, + batch_size=cfg.alg.batch_size, shuffle=True) return rollout_dataloader diff --git a/easyrl/engine/sac_engine.py b/easyrl/engine/sac_engine.py index 6dc943f..a218ae1 100644 --- a/easyrl/engine/sac_engine.py +++ b/easyrl/engine/sac_engine.py @@ -1,5 +1,4 @@ import time -from collections import deque from copy import deepcopy from itertools import count @@ -7,47 +6,33 @@ import torch from tqdm import tqdm -from easyrl.configs.sac_config import sac_cfg +from easyrl.configs import cfg from easyrl.engine.basic_engine import BasicEngine from easyrl.utils.common import get_list_stats from easyrl.utils.common import save_traj +from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory -from easyrl.utils.rl_logger import TensorboardLogger class SACEngine(BasicEngine): - def __init__(self, agent, runner): - super().__init__(agent=agent, - runner=runner) - if sac_cfg.test or sac_cfg.resume: - self.cur_step = self.agent.load_model(step=sac_cfg.resume_step) - else: - if sac_cfg.pretrain_model is not None: - self.agent.load_model(pretrain_model=sac_cfg.pretrain_model) - sac_cfg.create_model_log_dir() - self.train_ep_return = deque(maxlen=100) - self.smooth_eval_return = None - self.smooth_tau = sac_cfg.smooth_eval_tau - self.optim_stime = None - if not sac_cfg.test: - self.tf_logger = TensorboardLogger(log_dir=sac_cfg.log_dir) def train(self): - if len(self.agent.memory) < sac_cfg.warmup_steps: + if len(self.agent.memory) < cfg.alg.warmup_steps: self.runner.reset() + rollout_steps = int((cfg.alg.warmup_steps - len(self.agent.memory)) / cfg.alg.num_envs) traj, _ = self.rollout_once(random_action=True, - time_steps=sac_cfg.warmup_steps - len(self.agent.memory)) + time_steps=rollout_steps) self.add_traj_to_memory(traj) self.runner.reset() for iter_t in count(): traj, rollout_time = self.rollout_once(sample=True, - time_steps=sac_cfg.opt_interval) + time_steps=cfg.alg.opt_interval) self.add_traj_to_memory(traj) 
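SACEngine.train interleaves data collection and optimization: a random-action warmup until the buffer holds roughly cfg.alg.warmup_steps transitions, then cfg.alg.opt_interval vec-env steps of collection before each call to train_once, which in turn runs cfg.alg.opt_num sampled batches, matching the opt_interval/opt_num comments in sac_config.py. Schematically, with runner, memory, and agent as generic stand-ins rather than the easyrl classes:

    def sac_training_loop(runner, agent, memory, num_envs=1,
                          warmup_steps=10000, opt_interval=50,
                          opt_num=25, batch_size=256, max_steps=1_000_000):
        # Warm up the buffer with random actions before any gradient step.
        warmup = max(0, (warmup_steps - len(memory)) // num_envs)
        for transition in runner.rollout(warmup, random_action=True):
            memory.append(transition)
        env_steps = 0
        while env_steps < max_steps:
            # Collect a short segment, then take several gradient steps on
            # batches sampled from the whole buffer.
            for transition in runner.rollout(opt_interval, random_action=False):
                memory.append(transition)
            env_steps += opt_interval * num_envs
            for _ in range(opt_num):
                agent.optimize(memory.sample(batch_size))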
train_log_info = self.train_once() - if iter_t % sac_cfg.eval_interval == 0: - det_log_info, _ = self.eval(eval_num=sac_cfg.test_num, + if iter_t % cfg.alg.eval_interval == 0: + det_log_info, _ = self.eval(eval_num=cfg.alg.test_num, sample=False, smooth=True) - sto_log_info, _ = self.eval(eval_num=sac_cfg.test_num, + sto_log_info, _ = self.eval(eval_num=cfg.alg.test_num, sample=True, smooth=False) det_log_info = {f'det/{k}': v for k, v in det_log_info.items()} sto_log_info = {f'sto/{k}': v for k, v in sto_log_info.items()} @@ -56,13 +41,14 @@ def train(self): step=self.cur_step) else: eval_log_info = None - if iter_t % sac_cfg.log_interval == 0: + if iter_t % cfg.alg.log_interval == 0: train_log_info['train/rollout_time'] = rollout_time + train_log_info['memory_size'] = len(self.agent.memory) if eval_log_info is not None: train_log_info.update(eval_log_info) scalar_log = {'scalar': train_log_info} self.tf_logger.save_dict(scalar_log, step=self.cur_step) - if self.cur_step > sac_cfg.max_steps: + if self.cur_step > cfg.alg.max_steps: break @torch.no_grad() @@ -74,11 +60,11 @@ def eval(self, render=False, save_eval_traj=False, sample=True, if no_tqdm: disable_tqdm = bool(no_tqdm) else: - disable_tqdm = not sac_cfg.test + disable_tqdm = not cfg.alg.test for idx in tqdm(range(eval_num), disable=disable_tqdm): - traj, _ = self.rollout_once(time_steps=sac_cfg.episode_steps, + traj, _ = self.rollout_once(time_steps=cfg.alg.episode_steps, return_on_done=True, - sample=sac_cfg.sample_action and sample, + sample=cfg.alg.sample_action and sample, render=render, sleep_time=sleep_time, render_image=save_eval_traj, @@ -92,7 +78,7 @@ def eval(self, render=False, save_eval_traj=False, sample=True, lst_step_infos.append(infos[tsps[ej] - 1][ej]) time_steps.extend(tsps) if save_eval_traj: - save_traj(traj, sac_cfg.eval_dir) + save_traj(traj, cfg.alg.eval_dir) raw_traj_info = {'return': rets, 'episode_length': time_steps, @@ -129,8 +115,8 @@ def rollout_once(self, *args, **kwargs): def train_once(self): self.optim_stime = time.perf_counter() optim_infos = [] - for oe in range(sac_cfg.opt_num): - sampled_data = self.agent.memory.sample(batch_size=sac_cfg.batch_size) + for oe in range(cfg.alg.opt_num): + sampled_data = self.agent.memory.sample(batch_size=cfg.alg.batch_size) sampled_data = Trajectory(traj_data=sampled_data) batch_data = dict( obs=sampled_data.obs, @@ -145,12 +131,19 @@ def train_once(self): def get_train_log(self, optim_infos): log_info = dict() - for key in optim_infos[0].keys(): - if 'val' in key: - continue + vector_keys = set() + scalar_keys = set() + for oinf in optim_infos: + for key in oinf.keys(): + if 'vec_' in key: + vector_keys.add(key) + else: + scalar_keys.add(key) + + for key in scalar_keys: log_info[key] = np.mean([inf[key] for inf in optim_infos if key in inf]) - for key in ['q1_val', 'q2_val']: + for key in vector_keys: k_stats = get_list_stats([inf[key] for inf in optim_infos if key in inf]) for sk, sv in k_stats.items(): log_info[f'{key}/' + sk] = sv @@ -163,6 +156,20 @@ def get_train_log(self, optim_infos): return train_log_info def add_traj_to_memory(self, traj): - for sd in traj.traj_data: + obs = traj.obs + actions = traj.actions + next_obs = traj.next_obs + rewards = traj.rewards + dones = traj.dones + rets = map(lambda x: x.swapaxes(0, 1).reshape(x.shape[0] * x.shape[1], + *x.shape[2:]), + (obs, actions, next_obs, rewards, dones)) + obs, actions, next_obs, rewards, dones = rets + for i in range(obs.shape[0]): + sd = StepData(ob=obs[i], + action=actions[i], + 
next_ob=next_obs[i], + reward=rewards[i], + done=dones[i]) self.agent.memory.append(deepcopy(sd)) self.cur_step += traj.total_steps diff --git a/easyrl/envs/vec_env.py b/easyrl/envs/vec_env.py index a3a64af..5845879 100644 --- a/easyrl/envs/vec_env.py +++ b/easyrl/envs/vec_env.py @@ -6,6 +6,8 @@ from abc import ABC from abc import abstractmethod +import numpy as np + from easyrl.utils.common import tile_images @@ -50,6 +52,43 @@ def __init__(self, num_envs, observation_space, action_space): self.observation_space = observation_space self.action_space = action_space + def random_actions(self): + """ + Return randomly sampled actions (shape: [num_envs, action_shape]) + """ + if isinstance(self.action_space, Discrete): + return np.random.randint(self.action_space.n, size=(self.num_envs, 1)) + elif isinstance(self.action_space, Box): + high = self.action_space.high if self.action_space.dtype.kind == 'f' \ + else self.action_space.high.astype('int64') + 1 + sample = np.empty((self.num_envs,) + self.action_space.shape) + + # Masking arrays which classify the coordinates according to interval + # type + unbounded = ~self.action_space.bounded_below & ~self.action_space.bounded_above + upp_bounded = ~self.action_space.bounded_below & self.action_space.bounded_above + low_bounded = self.action_space.bounded_below & ~self.action_space.bounded_above + bounded = self.action_space.bounded_below & self.action_space.bounded_above + + # Vectorized sampling by interval type + sample[:, unbounded] = np.random.normal( + size=(self.num_envs,) + unbounded[unbounded].shape) + + sample[:, low_bounded] = np.random.exponential( + size=(self.num_envs,) + low_bounded[low_bounded].shape) + self.action_space.low[low_bounded] + + sample[:, upp_bounded] = -np.random.exponential( + size=(self.num_envs,) + upp_bounded[upp_bounded].shape) + self.action_space.high[upp_bounded] + + sample[:, bounded] = np.random.uniform(low=self.action_space.low[bounded], + high=high[bounded], + size=(self.num_envs,) + bounded[bounded].shape) + if self.action_space.dtype.kind == 'i': + sample = np.floor(sample) + return sample.astype(self.action_space.dtype) + else: + raise TypeError(f'Unknown data type of action space: {type(self.action_space)}') + @abstractmethod def reset(self): """ diff --git a/easyrl/runner/episodic_runner.py b/easyrl/runner/episodic_runner.py index 9b6b803..448833d 100644 --- a/easyrl/runner/episodic_runner.py +++ b/easyrl/runner/episodic_runner.py @@ -8,13 +8,14 @@ from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory from easyrl.utils.torch_util import torch_to_np - +from easyrl.utils.gym_util import get_render_images class EpisodicRunner(BasicRunner): @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, - sleep_time=0, reset_kwargs=None, action_kwargs=None): + sleep_time=0, reset_kwargs=None, action_kwargs=None, + random_action=False): traj = Trajectory() if reset_kwargs is None: reset_kwargs = {} @@ -41,11 +42,15 @@ def __call__(self, time_steps, sample=True, evaluation=False, time.sleep(sleep_time) if render_image: # get render images at the same time step as ob - imgs = deepcopy(env.get_images()) + imgs = get_render_images(env) - action, action_info = self.agent.get_action(ob, - sample=sample, - **action_kwargs) + if random_action: + action = env.random_actions() + action_info = dict() + else: + action, action_info = self.agent.get_action(ob, + sample=sample, + **action_kwargs) next_ob, reward, done, info = 
env.step(action) next_ob = deepcopy(next_ob) if render_image: @@ -57,12 +62,17 @@ def __call__(self, time_steps, sample=True, evaluation=False, # vec env automatically resets the environment when it's done # so the returned next_ob is not actually the next observation all_dones[done_idx] = True + + true_done = deepcopy(done) + for iidx, inf in enumerate(info): + true_done[iidx] = true_done[iidx] and not inf.get('TimeLimit.truncated', + False) sd = StepData(ob=ob, action=deepcopy(action), action_info=deepcopy(action_info), next_ob=next_ob, reward=deepcopy(reward), - done=deepcopy(done), + done=true_done, info=deepcopy(info)) ob = next_ob traj.add(sd) @@ -72,3 +82,6 @@ def __call__(self, time_steps, sample=True, evaluation=False, last_val = self.agent.get_val(traj[-1].next_ob) traj.add_extra('last_val', torch_to_np(last_val)) return traj + + def reset(self, *args, **kwargs): + pass diff --git a/easyrl/runner/step_runner.py b/easyrl/runner/step_runner.py index 9510593..aed3d8e 100644 --- a/easyrl/runner/step_runner.py +++ b/easyrl/runner/step_runner.py @@ -2,51 +2,51 @@ from copy import deepcopy import torch -from gym.wrappers.time_limit import TimeLimit from easyrl.runner.base_runner import BasicRunner -from easyrl.utils.common import list_to_numpy from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory - +from easyrl.utils.gym_util import get_render_images +from easyrl.utils.gym_util import is_time_limit_env +from easyrl.utils.common import list_to_numpy +import numpy as np class StepRunner(BasicRunner): # Simulate the environment for T steps, # and in the next call, the environment will continue # from where it's left in the previous call. - # only single env (no parallel envs) is supported for now. - # we also assume the environment is wrapped by TimeLimit - # from https://github.com/openai/gym/blob/master/gym/wrappers/time_limit.py - def __init__(self, agent, env, eval_env=None, max_steps=None): + # we also assume the environment is wrapped by VecEnv + def __init__(self, agent, env, eval_env=None): super().__init__(agent=agent, env=env, eval_env=eval_env) self.cur_ob = None - self.max_steps = max_steps - self.cur_step = 0 - if not (isinstance(env, TimeLimit) and isinstance(eval_env, TimeLimit)): - raise TypeError('Please add TimeLimit wrapper on the environment.') @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, - sleep_time=0, reset_kwargs=None, + sleep_time=0, env_reset_kwargs=None, agent_reset_kwargs=None, action_kwargs=None, random_action=False): traj = Trajectory() - if reset_kwargs is None: - reset_kwargs = {} + if env_reset_kwargs is None: + env_reset_kwargs = {} + if agent_reset_kwargs is None: + agent_reset_kwargs = {} if action_kwargs is None: action_kwargs = {} + action_kwargs['eval'] = evaluation if evaluation: env = self.eval_env else: env = self.train_env if self.cur_ob is None or evaluation: - ob = env.reset(**reset_kwargs) - self.cur_step = 0 - else: - ob = self.cur_ob + self.reset(env=env, + env_reset_kwargs=env_reset_kwargs, + agent_reset_kwargs=agent_reset_kwargs) + ob = self.cur_ob ob = deepcopy(ob) + if return_on_done: + all_dones = np.zeros(env.num_envs, dtype=bool) for t in range(time_steps): if render: env.render() @@ -54,48 +54,53 @@ def __call__(self, time_steps, sample=True, evaluation=False, time.sleep(sleep_time) if render_image: # get render images at the same time step as ob - imgs = deepcopy(env.get_images()) + imgs = get_render_images(env) if 
random_action: - action = env.action_space.sample() + action = env.random_actions() action_info = dict() else: action, action_info = self.agent.get_action(ob, sample=sample, **action_kwargs) next_ob, reward, done, info = env.step(action) - self.cur_step += 1 next_ob = deepcopy(next_ob) if render_image: for img, inf in zip(imgs, info): inf['render_image'] = deepcopy(img) - true_done = done and not info.get('TimeLimit.truncated', - False) - sd = StepData(ob=list_to_numpy(deepcopy(ob), - expand_dims=0), - action=list_to_numpy(deepcopy(action), - expand_dims=0), - action_info=[deepcopy(action_info)], - next_ob=list_to_numpy(deepcopy(next_ob), - expand_dims=0), - reward=list_to_numpy(reward), - done=list_to_numpy(true_done), - info=[deepcopy(info)]) + + done_idx = np.argwhere(done).flatten() + if done_idx.size > 0 and return_on_done: + # vec env automatically resets the environment when it's done + # so the returned next_ob is not actually the next observation + all_dones[done_idx] = True + + true_done = deepcopy(done) + for iidx, inf in enumerate(info): + true_done[iidx] = true_done[iidx] and not inf.get('TimeLimit.truncated', + False) + sd = StepData(ob=ob, + action=deepcopy(action), + action_info=deepcopy(action_info), + next_ob=next_ob, + reward=reward, + done=true_done, + info=info) ob = next_ob traj.add(sd) - if return_on_done and done: + if return_on_done and np.all(all_dones): break - need_reset = done - if self.max_steps is not None: - need_reset = need_reset or self.cur_step > self.max_steps - if need_reset: - ob = deepcopy(env.reset(**reset_kwargs)) - self.cur_step = 0 - self.cur_ob = deepcopy(ob) + self.cur_ob = None if evaluation else deepcopy(ob) return traj - def reset(self, reset_kwargs=None): - if reset_kwargs is None: - reset_kwargs = {} - ob = self.train_env.reset(**reset_kwargs) - self.cur_step = 0 + def reset(self, env=None, env_reset_kwargs=None, agent_reset_kwargs=None): + if env is None: + env = self.train_env + if env_reset_kwargs is None: + env_reset_kwargs = {} + if agent_reset_kwargs is None: + agent_reset_kwargs = {} + ob = env.reset(**env_reset_kwargs) + if hasattr(self.agent, 'reset'): + self.agent.reset(**agent_reset_kwargs) self.cur_ob = deepcopy(ob) + return ob diff --git a/easyrl/utils/gym_util.py b/easyrl/utils/gym_util.py index d59c257..d5cf854 100644 --- a/easyrl/utils/gym_util.py +++ b/easyrl/utils/gym_util.py @@ -73,6 +73,8 @@ def get_render_images(env): def is_time_limit_env(env): if not (isinstance(env, TimeLimit)): - if not hasattr(env, 'env') or (hasattr(env, 'env') and not isinstance(env.env, TimeLimit)): + if not hasattr(env, 'env'): return False + else: + return is_time_limit_env(env.env) return True diff --git a/easyrl/utils/torch_util.py b/easyrl/utils/torch_util.py index 0450ed1..104952f 100644 --- a/easyrl/utils/torch_util.py +++ b/easyrl/utils/torch_util.py @@ -34,11 +34,13 @@ def clip_grad(params, max_grad_norm): return grad_norm -def freeze_model(model): +def freeze_model(model, eval=True): if isinstance(model, list) or isinstance(model, tuple): for md in model: freeze_model(md) else: + if eval: + model.eval() for param in model.parameters(): param.requires_grad = False diff --git a/examples/ppo.py b/examples/ppo.py index 06ca565..dfce0fa 100644 --- a/examples/ppo.py +++ b/examples/ppo.py @@ -2,8 +2,9 @@ import torch.nn as nn from easyrl.agents.ppo_agent import PPOAgent +from easyrl.configs import cfg +from easyrl.configs import set_config from easyrl.configs.command_line import cfg_from_cmd -from easyrl.configs.ppo_config import ppo_cfg 
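The true_done bookkeeping added to both runners above separates real terminations from TimeLimit truncations, so the (1 - done) factor in bootstrapped targets (e.g. the SAC q_tgt_val earlier in this patch) only zeroes out genuinely terminal states. A sketch of that logic for a single vec-env step, assuming gym's TimeLimit wrapper sets info['TimeLimit.truncated']:

    import numpy as np

    def terminal_mask(done, infos):
        # done: per-env done flags from a vec env step; infos: matching info dicts.
        # An env that only hit its time limit is done for rollout purposes but
        # should still be bootstrapped, so it is not marked terminal here.
        true_done = np.asarray(done, dtype=bool).copy()
        for i, info in enumerate(infos):
            if info.get('TimeLimit.truncated', False):
                true_done[i] = False
        return true_done

    # q_target = reward + gamma * (1 - terminal_mask(done, infos)) * next_value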
from easyrl.engine.ppo_engine import PPOEngine from easyrl.models.categorical_policy import CategoricalPolicy from easyrl.models.diag_gaussian_policy import DiagGaussianPolicy @@ -15,9 +16,10 @@ def main(): - cfg_from_cmd(ppo_cfg) - if ppo_cfg.resume or ppo_cfg.test: - if ppo_cfg.test: + set_config('ppo') + cfg_from_cmd(cfg.alg) + if cfg.alg.resume or cfg.alg.test: + if cfg.alg.test: skip_params = [ 'test_num', 'num_envs', @@ -25,13 +27,13 @@ def main(): ] else: skip_params = [] - ppo_cfg.restore_cfg(skip_params=skip_params) - if ppo_cfg.env_name is None: - ppo_cfg.env_name = 'Ant-v2' - set_random_seed(ppo_cfg.seed) - env = make_vec_env(ppo_cfg.env_name, - ppo_cfg.num_envs, - seed=ppo_cfg.seed) + cfg.alg.restore_cfg(skip_params=skip_params) + if cfg.alg.env_name is None: + cfg.alg.env_name = 'HalfCheetah-v2' + set_random_seed(cfg.alg.seed) + env = make_vec_env(cfg.alg.env_name, + cfg.alg.num_envs, + seed=cfg.alg.seed) env.reset() ob_size = env.observation_space.shape[0] @@ -51,23 +53,22 @@ def main(): elif isinstance(env.action_space, gym.spaces.Box): act_size = env.action_space.shape[0] actor = DiagGaussianPolicy(actor_body, action_dim=act_size, - tanh_on_dist=ppo_cfg.tanh_on_dist, - std_cond_in=ppo_cfg.std_cond_in) + tanh_on_dist=cfg.alg.tanh_on_dist, + std_cond_in=cfg.alg.std_cond_in) else: - raise TypeError(f'Unknown action space ' - f'type: {env.action_space}') + raise TypeError(f'Unknown action space type: {env.action_space}') critic = ValueNet(critic_body) agent = PPOAgent(actor, critic) runner = EpisodicRunner(agent=agent, env=env) engine = PPOEngine(agent=agent, runner=runner) - if not ppo_cfg.test: + if not cfg.alg.test: engine.train() else: - stat_info, raw_traj_info = engine.eval(render=ppo_cfg.render, - save_eval_traj=ppo_cfg.save_test_traj, - eval_num=ppo_cfg.test_num, + stat_info, raw_traj_info = engine.eval(render=cfg.alg.render, + save_eval_traj=cfg.alg.save_test_traj, + eval_num=cfg.alg.test_num, sleep_time=0.04) import pprint pprint.pprint(stat_info) diff --git a/examples/sac.py b/examples/sac.py index cb3c39f..50e5929 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -3,22 +3,25 @@ import torch.nn as nn from easyrl.agents.sac_agent import SACAgent +from easyrl.configs import cfg +from easyrl.configs import set_config from easyrl.configs.command_line import cfg_from_cmd -from easyrl.configs.sac_config import sac_cfg from easyrl.engine.sac_engine import SACEngine +from easyrl.envs.single_env_wrapper import SingleEnvWrapper from easyrl.models.diag_gaussian_policy import DiagGaussianPolicy from easyrl.models.mlp import MLP from easyrl.models.value_net import ValueNet from easyrl.replays.circular_buffer import CyclicBuffer -from easyrl.runner.step_runner import StepRunner +from easyrl.runner.nenv_step_runner import StepRunner from easyrl.utils.common import set_random_seed - +from easyrl.utils.gym_util import make_vec_env def main(): torch.set_num_threads(1) - cfg_from_cmd(sac_cfg) - if sac_cfg.resume or sac_cfg.test: - if sac_cfg.test: + set_config('sac') + cfg_from_cmd(cfg.alg) + if cfg.alg.resume or cfg.alg.test: + if cfg.alg.test: skip_params = [ 'test_num', 'num_envs', @@ -26,15 +29,19 @@ def main(): ] else: skip_params = [] - sac_cfg.restore_cfg(skip_params=skip_params) - if sac_cfg.env_name is None: - sac_cfg.env_name = 'HalfCheetah-v2' - if not sac_cfg.test: - sac_cfg.test_num = 10 - set_random_seed(sac_cfg.seed) - env = gym.make(sac_cfg.env_name) - env.seed(sac_cfg.seed) - eval_env = gym.make(sac_cfg.env_name) + cfg.alg.restore_cfg(skip_params=skip_params) + if 
cfg.alg.env_name is None: + cfg.alg.env_name = 'HalfCheetah-v2' + if not cfg.alg.test: + cfg.alg.test_num = 10 + set_random_seed(cfg.alg.seed) + env = make_vec_env(cfg.alg.env_name, + cfg.alg.num_envs, + seed=cfg.alg.seed) + # env = SingleEnvWrapper(gym.make(cfg.alg.env_name)) + eval_env = make_vec_env(cfg.alg.env_name, + cfg.alg.num_envs, + seed=cfg.alg.seed) ob_size = env.observation_space.shape[0] act_size = env.action_space.shape[0] @@ -59,18 +66,18 @@ def main(): clamp_log_std=True) q1 = ValueNet(q1_body) q2 = ValueNet(q2_body) - memory = CyclicBuffer(capacity=sac_cfg.replay_size) + memory = CyclicBuffer(capacity=cfg.alg.replay_size) agent = SACAgent(actor, q1=q1, q2=q2, env=env, memory=memory) runner = StepRunner(agent=agent, env=env, eval_env=eval_env) engine = SACEngine(agent=agent, runner=runner) - if not sac_cfg.test: + if not cfg.alg.test: engine.train() else: - stat_info, raw_traj_info = engine.eval(render=sac_cfg.render, - save_eval_traj=sac_cfg.save_test_traj, - eval_num=sac_cfg.test_num, + stat_info, raw_traj_info = engine.eval(render=cfg.alg.render, + save_eval_traj=cfg.alg.save_test_traj, + eval_num=cfg.alg.test_num, sleep_time=0.04) import pprint pprint.pprint(stat_info) From 84e0550debf7bc7be6decdc5a0610cbc39d7377a Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 21 Dec 2020 17:51:05 -0500 Subject: [PATCH 21/35] fix minor bugs --- easyrl/agents/ppo_rl2_agent.py | 19 +++++++++++----- easyrl/agents/ppo_rnn_agent.py | 19 +++++++++++----- easyrl/agents/sac_agent.py | 8 +++++-- easyrl/configs/sac_config.py | 4 ++-- easyrl/engine/basic_engine.py | 1 + easyrl/engine/ppo_engine.py | 2 -- easyrl/engine/ppo_rnn_engine.py | 2 -- easyrl/envs/vec_env.py | 2 ++ easyrl/models/rnn_base.py | 2 +- easyrl/runner/episodic_runner.py | 3 ++- easyrl/runner/step_runner.py | 5 ++--- examples/rnn_ppo.py | 37 ++++++++++++++++---------------- examples/sac.py | 5 ++--- examples/sac_sweeper.yml | 2 +- 14 files changed, 66 insertions(+), 45 deletions(-) diff --git a/easyrl/agents/ppo_rl2_agent.py b/easyrl/agents/ppo_rl2_agent.py index 66c514d..21882f0 100644 --- a/easyrl/agents/ppo_rl2_agent.py +++ b/easyrl/agents/ppo_rl2_agent.py @@ -1,3 +1,5 @@ +from dataclasses import dataclass + import torch from easyrl.agents.ppo_agent import PPOAgent @@ -9,11 +11,8 @@ from easyrl.utils.torch_util import torch_to_np +@dataclass class PPORNNAgent(PPOAgent): - def __init__(self, actor, critic, same_body=False): - super().__init__(actor=actor, - critic=critic, - same_body=same_body) @torch.no_grad() def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): @@ -52,6 +51,7 @@ def get_val(self, ob, hidden_state=None, *args, **kwargs): return val, out_hidden_state def optim_preprocess(self, data): + self.train_mode() for key, val in data.items(): data[key] = torch_float(val, device=cfg.alg.device) ob = data['ob'] @@ -64,4 +64,13 @@ def optim_preprocess(self, data): act_dist, val, out_hidden_state = self.get_act_val(ob) log_prob = action_log_prob(action, act_dist) entropy = action_entropy(act_dist, log_prob) - return val, old_val, ret, log_prob, old_log_prob, adv, entropy + processed_data = dict( + val=val, + old_val=old_val, + ret=ret, + log_prob=log_prob, + old_log_prob=old_log_prob, + adv=adv, + entropy=entropy + ) + return processed_data diff --git a/easyrl/agents/ppo_rnn_agent.py b/easyrl/agents/ppo_rnn_agent.py index 4f70610..72de591 100644 --- a/easyrl/agents/ppo_rnn_agent.py +++ b/easyrl/agents/ppo_rnn_agent.py @@ -1,3 +1,5 @@ +from dataclasses import dataclass + import numpy as np import 
torch @@ -10,11 +12,8 @@ from easyrl.utils.torch_util import torch_to_np +@dataclass class PPORNNAgent(PPOAgent): - def __init__(self, actor, critic, same_body=False): - super().__init__(actor=actor, - critic=critic, - same_body=same_body) @torch.no_grad() def get_action(self, ob, sample=True, hidden_state=None, prev_done=False, *args, **kwargs): @@ -63,6 +62,7 @@ def get_val(self, ob, hidden_state=None, prev_done=False, *args, **kwargs): return val, out_hidden_state def optim_preprocess(self, data): + self.train_mode() for key, val in data.items(): data[key] = torch_float(val, device=cfg.alg.device) ob = data['ob'] @@ -76,7 +76,16 @@ def optim_preprocess(self, data): act_dist, val, out_hidden_state = self.get_act_val(ob, done=done) log_prob = action_log_prob(action, act_dist) entropy = action_entropy(act_dist, log_prob) - return val, old_val, ret, log_prob, old_log_prob, adv, entropy + processed_data = dict( + val=val, + old_val=old_val, + ret=ret, + log_prob=log_prob, + old_log_prob=old_log_prob, + adv=adv, + entropy=entropy + ) + return processed_data def check_hidden_state(self, hidden_state, prev_done): if prev_done is not None: diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index 8b8f8b5..c948894 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -1,6 +1,9 @@ import pickle from copy import deepcopy +from dataclasses import dataclass +from typing import Any +import gym import torch import torch.nn as nn import torch.nn.functional as F @@ -24,15 +27,16 @@ from easyrl.utils.torch_util import torch_float from easyrl.utils.torch_util import torch_to_np from easyrl.utils.torch_util import unfreeze_model -import gym -from typing import Any + +@dataclass class SACAgent(BaseAgent): actor: nn.Module env: gym.Env memory: Any q1: nn.Module = None q2: nn.Module = None + def __post_init__(self): self.q1_tgt = deepcopy(self.q1) self.q2_tgt = deepcopy(self.q2) diff --git a/easyrl/configs/sac_config.py b/easyrl/configs/sac_config.py index 110eb05..abaf0b9 100644 --- a/easyrl/configs/sac_config.py +++ b/easyrl/configs/sac_config.py @@ -11,8 +11,8 @@ class SACConfig(BasicConfig): use_amsgrad: bool = True opt_interval: int = 50 # perform optimization every n environment steps opt_num: int = 25 # how many optimization loops in every optimization stage - # Increase this number if num_envs > 1 so that the data is updated more often - # as the data collection is also faster + # Increase this number if num_envs > 1 so that the data is updated more often + # as the data collection is also faster alpha: float = None rew_discount: float = 0.99 replay_size: int = 1000000 diff --git a/easyrl/engine/basic_engine.py b/easyrl/engine/basic_engine.py index e2119a9..c72d6f2 100644 --- a/easyrl/engine/basic_engine.py +++ b/easyrl/engine/basic_engine.py @@ -3,6 +3,7 @@ from typing import Any import numpy as np + from easyrl.configs import cfg from easyrl.utils.rl_logger import TensorboardLogger diff --git a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index 5e26797..4df9ff0 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -1,5 +1,4 @@ import time -from collections import deque from itertools import chain from itertools import count @@ -13,7 +12,6 @@ from easyrl.utils.common import get_list_stats from easyrl.utils.common import save_traj from easyrl.utils.gae import cal_gae -from easyrl.utils.rl_logger import TensorboardLogger from easyrl.utils.torch_util import EpisodeDataset diff --git a/easyrl/engine/ppo_rnn_engine.py 
b/easyrl/engine/ppo_rnn_engine.py index d7194fd..1eda31d 100644 --- a/easyrl/engine/ppo_rnn_engine.py +++ b/easyrl/engine/ppo_rnn_engine.py @@ -1,5 +1,3 @@ -import time - import numpy as np from torch.utils.data import DataLoader diff --git a/easyrl/envs/vec_env.py b/easyrl/envs/vec_env.py index 5845879..ff81e90 100644 --- a/easyrl/envs/vec_env.py +++ b/easyrl/envs/vec_env.py @@ -7,6 +7,8 @@ from abc import abstractmethod import numpy as np +from gym.spaces import Box +from gym.spaces import Discrete from easyrl.utils.common import tile_images diff --git a/easyrl/models/rnn_base.py b/easyrl/models/rnn_base.py index 055a955..1170534 100644 --- a/easyrl/models/rnn_base.py +++ b/easyrl/models/rnn_base.py @@ -30,7 +30,7 @@ def forward(self, x=None, hidden_state=None, done=None): obs_feature = obs_feature.view(b, t, *obs_feature.shape[1:]) if self.training: - done_ts = (done == 1).any(dim=0).nonzero().squeeze(dim=-1).cpu().numpy() + 1 + done_ts = (done == 1).any(dim=0).nonzero(as_tuple=False).squeeze(dim=-1).cpu().numpy() + 1 done_ts = done_ts.tolist() done_ts = [0] + done_ts if done_ts[-1] != t: diff --git a/easyrl/runner/episodic_runner.py b/easyrl/runner/episodic_runner.py index 448833d..7554aef 100644 --- a/easyrl/runner/episodic_runner.py +++ b/easyrl/runner/episodic_runner.py @@ -7,8 +7,9 @@ from easyrl.runner.base_runner import BasicRunner from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory -from easyrl.utils.torch_util import torch_to_np from easyrl.utils.gym_util import get_render_images +from easyrl.utils.torch_util import torch_to_np + class EpisodicRunner(BasicRunner): @torch.no_grad() diff --git a/easyrl/runner/step_runner.py b/easyrl/runner/step_runner.py index aed3d8e..73f456e 100644 --- a/easyrl/runner/step_runner.py +++ b/easyrl/runner/step_runner.py @@ -1,15 +1,14 @@ import time from copy import deepcopy +import numpy as np import torch from easyrl.runner.base_runner import BasicRunner from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory from easyrl.utils.gym_util import get_render_images -from easyrl.utils.gym_util import is_time_limit_env -from easyrl.utils.common import list_to_numpy -import numpy as np + class StepRunner(BasicRunner): # Simulate the environment for T steps, diff --git a/examples/rnn_ppo.py b/examples/rnn_ppo.py index 8f254a9..89b0652 100644 --- a/examples/rnn_ppo.py +++ b/examples/rnn_ppo.py @@ -2,8 +2,9 @@ import torch.nn as nn from easyrl.agents.ppo_rnn_agent import PPORNNAgent +from easyrl.configs import cfg +from easyrl.configs import set_config from easyrl.configs.command_line import cfg_from_cmd -from easyrl.configs.ppo_config import ppo_cfg from easyrl.engine.ppo_rnn_engine import PPORNNEngine from easyrl.models.mlp import MLP from easyrl.models.rnn_base import RNNBase @@ -16,24 +17,24 @@ def main(): - cfg_from_cmd(ppo_cfg) - if ppo_cfg.resume or ppo_cfg.test: - if ppo_cfg.test: + set_config('ppo') + cfg_from_cmd(cfg.alg) + if cfg.alg.resume or cfg.alg.test: + if cfg.alg.test: skip_params = [ 'test_num', 'num_envs', 'sample_action', - 'seed' ] else: skip_params = [] - ppo_cfg.restore_cfg(skip_params=skip_params) - if ppo_cfg.env_name is None: - ppo_cfg.env_name = 'Hopper-v2' - set_random_seed(ppo_cfg.seed) - env = make_vec_env(ppo_cfg.env_name, - ppo_cfg.num_envs, - seed=ppo_cfg.seed) + cfg.alg.restore_cfg(skip_params=skip_params) + if cfg.alg.env_name is None: + cfg.alg.env_name = 'HalfCheetah-v2' + set_random_seed(cfg.alg.seed) + env = make_vec_env(cfg.alg.env_name, + cfg.alg.num_envs, + 
seed=cfg.alg.seed) env.reset() ob_size = env.observation_space.shape[0] @@ -63,8 +64,8 @@ def main(): elif isinstance(env.action_space, gym.spaces.Box): act_size = env.action_space.shape[0] actor = RNNDiagGaussianPolicy(actor_body, action_dim=act_size, - tanh_on_dist=ppo_cfg.tanh_on_dist, - std_cond_in=ppo_cfg.std_cond_in) + tanh_on_dist=cfg.alg.tanh_on_dist, + std_cond_in=cfg.alg.std_cond_in) else: raise TypeError(f'Unknown action space ' f'type: {env.action_space}') @@ -74,12 +75,12 @@ def main(): runner = RNNRunner(agent=agent, env=env) engine = PPORNNEngine(agent=agent, runner=runner) - if not ppo_cfg.test: + if not cfg.alg.test: engine.train() else: - stat_info, raw_traj_info = engine.eval(render=ppo_cfg.render, - save_eval_traj=ppo_cfg.save_test_traj, - eval_num=ppo_cfg.test_num, + stat_info, raw_traj_info = engine.eval(render=cfg.alg.render, + save_eval_traj=cfg.alg.save_test_traj, + eval_num=cfg.alg.test_num, sleep_time=0.04) import pprint pprint.pprint(stat_info) diff --git a/examples/sac.py b/examples/sac.py index 50e5929..63df734 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -1,21 +1,20 @@ -import gym import torch import torch.nn as nn +from easyrl.runner.step_runner import StepRunner from easyrl.agents.sac_agent import SACAgent from easyrl.configs import cfg from easyrl.configs import set_config from easyrl.configs.command_line import cfg_from_cmd from easyrl.engine.sac_engine import SACEngine -from easyrl.envs.single_env_wrapper import SingleEnvWrapper from easyrl.models.diag_gaussian_policy import DiagGaussianPolicy from easyrl.models.mlp import MLP from easyrl.models.value_net import ValueNet from easyrl.replays.circular_buffer import CyclicBuffer -from easyrl.runner.nenv_step_runner import StepRunner from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env + def main(): torch.set_num_threads(1) set_config('sac') diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml index e7bc612..4f183c6 100644 --- a/examples/sac_sweeper.yml +++ b/examples/sac_sweeper.yml @@ -25,7 +25,7 @@ hparams: # actor_lr: [0.001, 0.0003] # critic_lr: [0.001, 0.0003] # alpha: [0.2, None] - # default_true: + # default_true: # tgt_sample: [True, False] default_false: # no_q2: [True] From 6c67ad9ad2ca98f9518039191a8cc7e58d498269 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 21 Dec 2020 23:46:27 -0500 Subject: [PATCH 22/35] update config func --- easyrl/configs/__init__.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/easyrl/configs/__init__.py b/easyrl/configs/__init__.py index a392972..89127e7 100644 --- a/easyrl/configs/__init__.py +++ b/easyrl/configs/__init__.py @@ -14,18 +14,16 @@ class CFG: cfg = CFG() -def set_config(alg): +def set_config(alg, config_func=None): global cfg + if config_func is not None: + cfg.alg = config_func() + logger.info(f'Alogrithm type:{config_func.__name__}') + return if alg == 'ppo': cfg.alg = PPOConfig() elif alg == 'sac': cfg.alg = SACConfig() - elif alg == 'sac_adv': - cfg.alg = SACAdvConfig() - elif alg == 'redq': - cfg.alg = REQDConfig() - elif alg == 'offppo': - cfg.alg = OffPPOConfig() else: raise ValueError(f'Unimplemented algorithm: {alg}') logger.info(f'Alogrithm type:{type(cfg.alg)}') From 16246476ea505d8f07208bf16ff3c3e8564a3b47 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 21 Dec 2020 23:47:12 -0500 Subject: [PATCH 23/35] update config func --- easyrl/configs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/easyrl/configs/__init__.py b/easyrl/configs/__init__.py index 89127e7..4e20e9f 100644 --- a/easyrl/configs/__init__.py +++ b/easyrl/configs/__init__.py @@ -14,7 +14,7 @@ class CFG: cfg = CFG() -def set_config(alg, config_func=None): +def set_config(alg=None, config_func=None): global cfg if config_func is not None: cfg.alg = config_func() From 8f2188ea0b2e64506fe65db66fc26b1413d959b4 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 28 Dec 2020 11:09:01 -0500 Subject: [PATCH 24/35] fix rnn --- easyrl/agents/base_agent.py | 16 ++++ easyrl/agents/ppo_rnn_agent.py | 51 +++++----- easyrl/engine/basic_engine.py | 36 ++++++- easyrl/engine/ppo_engine.py | 1 + easyrl/engine/ppo_rnn_engine.py | 9 +- easyrl/envs/dummy_vec_env.py | 1 + easyrl/envs/reward_wrapper.py | 18 +++- easyrl/envs/shmem_vec_env.py | 2 +- easyrl/envs/timeout.py | 4 +- easyrl/envs/vec_env.py | 3 +- easyrl/envs/vec_normalize.py | 30 +++++- easyrl/models/mlp.py | 18 ++-- easyrl/models/name_wrapper.py | 13 +++ easyrl/models/rnn_base.py | 7 +- easyrl/runner/base_runner.py | 47 +++++++++ .../{episodic_runner.py => nstep_runner.py} | 54 ++++++----- easyrl/runner/rnn_runner.py | 96 ++++++++++++++----- ...p_runner.py => single_env_nstep_runner.py} | 53 +++++----- easyrl/utils/common.py | 41 +++++++- easyrl/utils/data.py | 5 + easyrl/utils/gym_util.py | 38 ++++++-- easyrl/utils/hp_sweeper.py | 19 ++-- examples/README.md | 6 +- examples/ppo.py | 23 +++-- examples/rnn_ppo.py | 50 +++++----- examples/sac.py | 13 ++- setup.py | 2 +- 27 files changed, 470 insertions(+), 186 deletions(-) create mode 100644 easyrl/models/name_wrapper.py rename easyrl/runner/{episodic_runner.py => nstep_runner.py} (65%) rename easyrl/runner/{step_runner.py => single_env_nstep_runner.py} (67%) diff --git a/easyrl/agents/base_agent.py b/easyrl/agents/base_agent.py index 1be30c5..0b01b25 100644 --- a/easyrl/agents/base_agent.py +++ b/easyrl/agents/base_agent.py @@ -1,7 +1,23 @@ +from dataclasses import dataclass +import gym +from easyrl.envs.vec_normalize import VecNormalize +from easyrl.utils.gym_util import save_vec_normalized_env +from easyrl.utils.gym_util import load_vec_normalized_env + +@dataclass class BaseAgent: + env: gym.Env def get_action(self, ob, sample=True, **kwargs): raise NotImplementedError def optimize(self, data, **kwargs): raise NotImplementedError + + def save_env(self, save_dir): + if isinstance(self.env, VecNormalize): + save_vec_normalized_env(self.env, save_dir) + + def load_env(self, save_dir): + if isinstance(self.env, VecNormalize): + load_vec_normalized_env(self.env, save_dir) diff --git a/easyrl/agents/ppo_rnn_agent.py b/easyrl/agents/ppo_rnn_agent.py index 72de591..c4e6871 100644 --- a/easyrl/agents/ppo_rnn_agent.py +++ b/easyrl/agents/ppo_rnn_agent.py @@ -2,7 +2,6 @@ import numpy as np import torch - from easyrl.agents.ppo_agent import PPOAgent from easyrl.configs import cfg from easyrl.utils.torch_util import action_entropy @@ -14,11 +13,20 @@ @dataclass class PPORNNAgent(PPOAgent): + def __post_init__(self): + if not self.same_body: + raise TypeError('EasyRL only supports RNN policy that shares the ' + 'same body with value function at this moment. 
' + 'If you do want to use different body networks for' + 'the actor and critic, then you need to modify the code ' + 'such that the hidden states are handled correctly for each' + 'network (the actor and the critic have different hidden ' + 'states in this case).') + super(PPORNNAgent, self).__post_init__() @torch.no_grad() - def get_action(self, ob, sample=True, hidden_state=None, prev_done=False, *args, **kwargs): + def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs): self.eval_mode() - hidden_state = self.check_hidden_state(hidden_state, prev_done) t_ob = torch.from_numpy(ob).float().to(cfg.alg.device).unsqueeze(dim=1) act_dist, val, out_hidden_state = self.get_act_val(t_ob, @@ -27,10 +35,12 @@ def get_action(self, ob, sample=True, hidden_state=None, prev_done=False, *args, sample=sample) log_prob = action_log_prob(action, act_dist) entropy = action_entropy(act_dist, log_prob) + in_hidden_state = torch_to_np(hidden_state) if hidden_state is not None else hidden_state action_info = dict( log_prob=torch_to_np(log_prob.squeeze(1)), entropy=torch_to_np(entropy.squeeze(1)), val=torch_to_np(val.squeeze(1)), + in_hidden_state=in_hidden_state ) return torch_to_np(action.squeeze(1)), action_info, out_hidden_state @@ -39,21 +49,16 @@ def get_act_val(self, ob, hidden_state=None, done=None, *args, **kwargs): act_dist, body_out, out_hidden_state = self.actor(ob, hidden_state=hidden_state, done=done) - if self.same_body: - val, body_out, out_hidden_state = self.critic(body_x=body_out, - hidden_state=hidden_state, - done=done) - else: - val, body_out, out_hidden_state = self.critic(x=ob, - hidden_state=hidden_state, - done=done) + + val, body_out, _ = self.critic(body_x=body_out, + hidden_state=hidden_state, + done=done) val = val.squeeze(-1) return act_dist, val, out_hidden_state @torch.no_grad() - def get_val(self, ob, hidden_state=None, prev_done=False, *args, **kwargs): + def get_val(self, ob, hidden_state=None, *args, **kwargs): self.eval_mode() - hidden_state = self.check_hidden_state(hidden_state, prev_done) ob = torch_float(ob, device=cfg.alg.device).unsqueeze(dim=1) val, body_out, out_hidden_state = self.critic(x=ob, @@ -64,7 +69,8 @@ def get_val(self, ob, hidden_state=None, prev_done=False, *args, **kwargs): def optim_preprocess(self, data): self.train_mode() for key, val in data.items(): - data[key] = torch_float(val, device=cfg.alg.device) + if val is not None: + data[key] = torch_float(val, device=cfg.alg.device) ob = data['ob'] action = data['action'] ret = data['ret'] @@ -72,8 +78,12 @@ def optim_preprocess(self, data): old_log_prob = data['log_prob'] old_val = data['val'] done = data['done'] + hidden_state = data['hidden_state'] + hidden_state = hidden_state.permute(1, 0, 2) - act_dist, val, out_hidden_state = self.get_act_val(ob, done=done) + act_dist, val, out_hidden_state = self.get_act_val(ob, + hidden_state=hidden_state, + done=done) log_prob = action_log_prob(action, act_dist) entropy = action_entropy(act_dist, log_prob) processed_data = dict( @@ -86,14 +96,3 @@ def optim_preprocess(self, data): entropy=entropy ) return processed_data - - def check_hidden_state(self, hidden_state, prev_done): - if prev_done is not None: - # if the last step is the end of an episode, - # then reset hidden state - done_idx = np.argwhere(prev_done).flatten() - if done_idx.size > 0: - ld, b, hz = hidden_state.shape - hidden_state[:, done_idx] = torch.zeros(ld, done_idx.size, hz, - device=hidden_state.device) - return hidden_state diff --git a/easyrl/engine/basic_engine.py 
b/easyrl/engine/basic_engine.py index c72d6f2..393d245 100644 --- a/easyrl/engine/basic_engine.py +++ b/easyrl/engine/basic_engine.py @@ -6,7 +6,7 @@ from easyrl.configs import cfg from easyrl.utils.rl_logger import TensorboardLogger - +from easyrl.utils.common import get_list_stats @dataclass class BasicEngine: @@ -35,3 +35,37 @@ def train(self, **kwargs): def eval(self, **kwargs): raise NotImplementedError + + def get_train_log(self, optim_infos, traj=None): + log_info = dict() + vector_keys = set() + scalar_keys = set() + for oinf in optim_infos: + for key in oinf.keys(): + if 'vec_' in key: + vector_keys.add(key) + else: + scalar_keys.add(key) + + for key in scalar_keys: + log_info[key] = np.mean([inf[key] for inf in optim_infos if key in inf]) + + for key in vector_keys: + k_stats = get_list_stats([inf[key] for inf in optim_infos if key in inf]) + for sk, sv in k_stats.items(): + log_info[f'{key}/' + sk] = sv + + if traj is not None: + actions_stats = get_list_stats(traj.actions) + for sk, sv in actions_stats.items(): + log_info['rollout_action/' + sk] = sv + log_info['rollout_steps_per_iter'] = traj.total_steps + + ep_returns_stats = get_list_stats(self.runner.train_ep_return) + for sk, sv in ep_returns_stats.items(): + log_info['episode_return/' + sk] = sv + + train_log_info = dict() + for key, val in log_info.items(): + train_log_info['train/' + key] = val + return train_log_info \ No newline at end of file diff --git a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index 4df9ff0..65bbd1a 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -32,6 +32,7 @@ def train(self): else: eval_log_info = None traj, rollout_time = self.rollout_once(sample=True, + get_last_val=True, time_steps=cfg.alg.episode_steps) train_log_info = self.train_once(traj) if iter_t % cfg.alg.log_interval == 0: diff --git a/easyrl/engine/ppo_rnn_engine.py b/easyrl/engine/ppo_rnn_engine.py index 1eda31d..d7dd677 100644 --- a/easyrl/engine/ppo_rnn_engine.py +++ b/easyrl/engine/ppo_rnn_engine.py @@ -15,6 +15,12 @@ def traj_preprocess(self, traj): action_infos = traj.action_infos vals = np.array([ainfo['val'] for ainfo in action_infos]) log_prob = np.array([ainfo['log_prob'] for ainfo in action_infos]) + hidden_state = action_infos[0]['in_hidden_state'] + if hidden_state is not None: + hidden_state = hidden_state.swapaxes(0, 1) + else: + hidden_state_shape = self.runner.hidden_state_shape + hidden_state = np.zeros((vals.shape[1], hidden_state_shape[0], hidden_state_shape[2])) adv = self.cal_advantages(traj) ret = adv + vals if cfg.alg.normalize_adv: @@ -28,7 +34,8 @@ def traj_preprocess(self, traj): adv=adv.swapaxes(0, 1), log_prob=log_prob.swapaxes(0, 1), val=vals.swapaxes(0, 1), - done=traj.dones.swapaxes(0, 1) + done=traj.step_extras.swapaxes(0, 1), # we use the mask here instead of true_done + hidden_state=hidden_state, ) rollout_dataset = DictDataset(**data) rollout_dataloader = DataLoader(rollout_dataset, diff --git a/easyrl/envs/dummy_vec_env.py b/easyrl/envs/dummy_vec_env.py index 0750f11..1f89933 100644 --- a/easyrl/envs/dummy_vec_env.py +++ b/easyrl/envs/dummy_vec_env.py @@ -62,6 +62,7 @@ def step_wait(self): res = self.envs[e].step(action) obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = res if self.buf_dones[e]: + self.buf_infos[e]['true_next_ob'] = obs obs = self.envs[e].reset() self._save_obs(e, obs) return (self._obs_from_buf(), np.copy(self.buf_rews), diff --git a/easyrl/envs/reward_wrapper.py b/easyrl/envs/reward_wrapper.py index 6168d4f..f7c7fe6 
100644 --- a/easyrl/envs/reward_wrapper.py +++ b/easyrl/envs/reward_wrapper.py @@ -15,7 +15,7 @@ def __init__(self, venv, scale=0.01, observation_space=None, action_space=None): self.scale = scale def step(self, action): - observation, reward, done, info = self.venv.step(action) + observation, reward, done, info = super().step(action) for idx, inf in enumerate(info): inf['raw_reward'] = reward[idx] return observation, self.reward(reward), done, info @@ -23,6 +23,12 @@ def step(self, action): def reward(self, reward): return reward * self.scale + def reset(self): + return self.venv.reset() + + def step_wait(self): + return self.venv.step_wait() + class RewardMinMaxNorm(VecEnvWrapper): def __init__(self, venv, min_rew, max_rew, observation_space=None, action_space=None): @@ -33,11 +39,17 @@ def __init__(self, venv, min_rew, max_rew, observation_space=None, action_space= self.max_rew = max_rew def step(self, action): - observation, reward, done, info = self.venv.step(action) + observation, reward, done, info = super().step(action) for idx, inf in enumerate(info): inf['raw_reward'] = reward[idx] return observation, self.reward(reward), done, info def reward(self, reward): reward = (reward - self.min_rew) / (self.max_rew - self.min_rew) - return reward * self.scale + return reward + + def reset(self): + return self.venv.reset() + + def step_wait(self): + return self.venv.step_wait() diff --git a/easyrl/envs/shmem_vec_env.py b/easyrl/envs/shmem_vec_env.py index ce69938..c75dae4 100644 --- a/easyrl/envs/shmem_vec_env.py +++ b/easyrl/envs/shmem_vec_env.py @@ -7,7 +7,6 @@ import multiprocessing as mp import numpy as np - from easyrl.envs.vec_env import CloudpickleWrapper from easyrl.envs.vec_env import VecEnv from easyrl.envs.vec_env import clear_mpi_env_vars @@ -153,6 +152,7 @@ def _write_obs(maybe_dict_obs): elif cmd == 'step': obs, reward, done, info = env.step(data) if done: + info['true_next_ob'] = obs obs = env.reset() pipe.send((_write_obs(obs), reward, done, info)) elif cmd == 'render': diff --git a/easyrl/envs/timeout.py b/easyrl/envs/timeout.py index b2222c4..9b3fd5c 100644 --- a/easyrl/envs/timeout.py +++ b/easyrl/envs/timeout.py @@ -1,9 +1,9 @@ import gym -class TimeOutEnv(gym.Wrapper): +class NoTimeOutEnv(gym.Wrapper): def __init__(self, env): - super(TimeOutEnv, self).__init__(env) + super(NoTimeOutEnv, self).__init__(env) def step(self, action): observation, reward, done, info = self.env.step(action) diff --git a/easyrl/envs/vec_env.py b/easyrl/envs/vec_env.py index ff81e90..4459ef6 100644 --- a/easyrl/envs/vec_env.py +++ b/easyrl/envs/vec_env.py @@ -7,11 +7,10 @@ from abc import abstractmethod import numpy as np +from easyrl.utils.common import tile_images from gym.spaces import Box from gym.spaces import Discrete -from easyrl.utils.common import tile_images - class AlreadySteppingError(Exception): """ diff --git a/easyrl/envs/vec_normalize.py b/easyrl/envs/vec_normalize.py index 6d38d2a..27cb7ab 100644 --- a/easyrl/envs/vec_normalize.py +++ b/easyrl/envs/vec_normalize.py @@ -10,7 +10,7 @@ class VecNormalize(VecEnvWrapper): and returns from an environment. 
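The reward-wrapper hunks above scale the reward the agent trains on but stash the unscaled value in `info['raw_reward']`, which the runner later uses for logging episode returns. A hedged sketch of that pattern with a plain `gym.Wrapper` (the easyrl versions wrap a VecEnv instead, and `ScaledReward` is a hypothetical name used only here):

```python
import gym


class ScaledReward(gym.Wrapper):
    def __init__(self, env, scale=0.01):
        super().__init__(env)
        self.scale = scale

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        info['raw_reward'] = reward        # keep the unscaled reward for logging
        return ob, reward * self.scale, done, info


env = ScaledReward(gym.make('Pendulum-v0'), scale=0.01)
```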
""" - def __init__(self, venv, ob=True, ret=True, clipob=10., + def __init__(self, venv, training=True, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): VecEnvWrapper.__init__(self, venv) self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None @@ -20,6 +20,7 @@ def __init__(self, venv, ob=True, ret=True, clipob=10., self.ret = np.zeros(self.num_envs) self.gamma = gamma self.epsilon = epsilon + self.training = training def step_wait(self): obs, rews, news, infos = self.venv.step_wait() @@ -28,7 +29,8 @@ def step_wait(self): if self.ret_rms: for idx, inf in enumerate(infos): inf['raw_reward'] = rews[idx] - self.ret_rms.update(self.ret) + if self.training: + self.ret_rms.update(self.ret) rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) self.ret[news] = 0. @@ -36,7 +38,8 @@ def step_wait(self): def _obfilt(self, obs): if self.ob_rms: - self.ob_rms.update(obs) + if self.training: + self.ob_rms.update(obs) obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs @@ -47,3 +50,24 @@ def reset(self): self.ret = np.zeros(self.num_envs) obs = self.venv.reset() return self._obfilt(obs) + + def get_states(self): + data = dict( + ob_rms=self.ob_rms, + ret_rms=self.ret_rms, + clipob=self.clipob, + cliprew=self.cliprew, + gamma=self.gamma, + epsilon=self.epsilon + ) + return data + + def set_states(self, data): + assert isinstance(data, dict) + keys = ['ob_rms', 'ret_rms', 'clipob', + 'cliprew', 'gamma', 'epsilon'] + for key in keys: + if key in data: + setattr(self, key, data[key]) + else: + print(f'Warning: {key} does not exist in data.') diff --git a/easyrl/models/mlp.py b/easyrl/models/mlp.py index 8ca34e6..10eb8d3 100644 --- a/easyrl/models/mlp.py +++ b/easyrl/models/mlp.py @@ -12,31 +12,31 @@ def __init__(self, output_size, hidden_act=nn.ReLU, output_act=None, - add_layer_norm=False, - add_spectral_norm=False): + hid_layer_norm=False, + hid_spectral_norm=False, + out_layer_norm=False, + out_spectral_norm=False): super().__init__() if not isinstance(hidden_sizes, list): raise TypeError('hidden_sizes should be a list') - if add_spectral_norm: - logger.info('Spectral Normalization on!') - if add_layer_norm: - logger.info('Layer Normalization on!') in_size = input_size self.fcs = nn.ModuleList() for i, hid_size in enumerate(hidden_sizes): fc = nn.Linear(in_size, hid_size) - if add_spectral_norm: + if hid_spectral_norm: fc = spectral_norm(fc) in_size = hid_size self.fcs.append(fc) - if add_layer_norm: + if hid_layer_norm: self.fcs.append(nn.LayerNorm(hid_size)) self.fcs.append(hidden_act()) last_fc = nn.Linear(in_size, output_size) - if add_spectral_norm: + if out_spectral_norm: last_fc = spectral_norm(last_fc) self.fcs.append(last_fc) + if out_layer_norm: + self.fcs.append(nn.LayerNorm(output_size)) if output_act is not None: self.fcs.append(output_act()) diff --git a/easyrl/models/name_wrapper.py b/easyrl/models/name_wrapper.py new file mode 100644 index 0000000..a34c4ae --- /dev/null +++ b/easyrl/models/name_wrapper.py @@ -0,0 +1,13 @@ +import torch.nn as nn + + +class AddNameWrapper(nn.Module): + def __init__(self, + model, + name): + super().__init__() + self.model_name = name + setattr(self, name, model) + + def forward(self, *args, **kwargs): + return getattr(self, self.model_name)(*args, **kwargs) diff --git a/easyrl/models/rnn_base.py b/easyrl/models/rnn_base.py index 1170534..f9e66e5 100644 --- a/easyrl/models/rnn_base.py +++ 
b/easyrl/models/rnn_base.py @@ -15,9 +15,7 @@ def __init__(self, hidden_size=rnn_features, num_layers=rnn_layers, batch_first=True) - self.fcs = nn.Linear(rnn_features, rnn_features) self.fcs = nn.Sequential( - nn.ELU(), nn.Linear(in_features=rnn_features, out_features=rnn_features), nn.ELU() ) @@ -36,11 +34,12 @@ def forward(self, x=None, hidden_state=None, done=None): if done_ts[-1] != t: done_ts = done_ts + [t] rnn_features = [] + for idx in range(len(done_ts) - 1): sid = done_ts[idx] eid = done_ts[idx + 1] - if hidden_state is not None: - hidden_state = hidden_state * (1 - done[:, sid]).view(1, -1, 1) + if hidden_state is not None and sid > 0: + hidden_state = hidden_state * (1 - done[:, sid-1]).view(1, -1, 1) rfeatures, hidden_state = self.gru(obs_feature[:, sid:eid], hidden_state) rnn_features.append(rfeatures) diff --git a/easyrl/runner/base_runner.py b/easyrl/runner/base_runner.py index c8344e0..09c1443 100644 --- a/easyrl/runner/base_runner.py +++ b/easyrl/runner/base_runner.py @@ -1,8 +1,55 @@ +import numpy as np +from copy import deepcopy +from easyrl.utils.gym_util import get_true_done +from collections import deque + class BasicRunner: def __init__(self, agent, env, eval_env=None): self.agent = agent self.train_env = env + self.num_train_envs = env.num_envs + self.obs = None self.eval_env = env if eval_env is None else eval_env + self.train_ep_return = deque(maxlen=100) + self.train_ep_len = deque(maxlen=100) + self.reset_record() def __call__(self, **kwargs): raise NotImplementedError + + def reset(self, env=None, *args, **kwargs): + if env is None: + env = self.train_env + self.obs = env.reset(*args, **kwargs) + self.reset_record() + + def reset_record(self): + self.cur_ep_len = np.zeros(self.num_train_envs) + self.cur_ep_return = np.zeros(self.num_train_envs) + + def get_true_done_next_ob(self, next_ob, done, reward, info, all_dones): + done_idx = np.argwhere(done).flatten() + self.cur_ep_len += 1 + if 'raw_reward' in info[0]: + self.cur_ep_return += np.array([x['raw_reward'] for x in info]) + else: + self.cur_ep_return += reward + if done_idx.size > 0: + # vec env automatically resets the environment when it's done + # so the returned next_ob is not actually the next observation + true_next_ob = deepcopy(next_ob) + true_next_ob[done_idx] = np.array([info[i]['true_next_ob'] for i in done_idx]) + if all_dones is not None: + all_dones[done_idx] = True + for dix in done_idx: + self.train_ep_return.append(self.cur_ep_return[dix]) + self.train_ep_len.append(self.cur_ep_len[dix]) + self.cur_ep_return[done_idx] = 0 + self.cur_ep_len[done_idx] = 0 + true_done = deepcopy(done) + for iidx, inf in enumerate(info): + true_done[iidx] = get_true_done(true_done[iidx], inf) + else: + true_next_ob = next_ob + true_done = done + return true_next_ob, true_done, all_dones diff --git a/easyrl/runner/episodic_runner.py b/easyrl/runner/nstep_runner.py similarity index 65% rename from easyrl/runner/episodic_runner.py rename to easyrl/runner/nstep_runner.py index 7554aef..097d1d6 100644 --- a/easyrl/runner/episodic_runner.py +++ b/easyrl/runner/nstep_runner.py @@ -3,7 +3,6 @@ import numpy as np import torch - from easyrl.runner.base_runner import BasicRunner from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory @@ -12,11 +11,20 @@ class EpisodicRunner(BasicRunner): + """ + This only applies to environments that are wrapped by VecEnv. 
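The vec-env and `BasicRunner.get_true_done_next_ob` changes above cooperate on auto-reset bookkeeping: a VecEnv resets a sub-environment as soon as it reports done, so the returned `next_ob` is already the first observation of the new episode, while the true terminal observation is stored in `info['true_next_ob']`. A small sketch of the swap-back step; `recover_terminal_obs` is a hypothetical helper, and `next_ob` is assumed to be an array of shape `(num_envs, ob_dim)`:

```python
from copy import deepcopy

import numpy as np


def recover_terminal_obs(next_ob, done, info):
    # replace the post-reset observations with the stored terminal ones
    true_next_ob = deepcopy(next_ob)
    done_idx = np.argwhere(done).flatten()
    if done_idx.size > 0:
        true_next_ob[done_idx] = np.array([info[i]['true_next_ob'] for i in done_idx])
    return true_next_ob
```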
+ It assumes the environment is automatically reset if done=True + """ + + def __init__(self, *args, **kwargs): + super(EpisodicRunner, self).__init__(*args, **kwargs) + @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, - sleep_time=0, reset_kwargs=None, action_kwargs=None, - random_action=False): + sleep_time=0, reset_first=False, + reset_kwargs=None, action_kwargs=None, + random_action=False, get_last_val=False): traj = Trajectory() if reset_kwargs is None: reset_kwargs = {} @@ -26,7 +34,9 @@ def __call__(self, time_steps, sample=True, evaluation=False, env = self.eval_env else: env = self.train_env - ob = env.reset(**reset_kwargs) + if self.obs is None or reset_first or evaluation: + self.reset(env=env, **reset_kwargs) + ob = self.obs # this is critical for some environments depending # on the returned ob data. use deepcopy() to avoid # adding the same ob to the traj @@ -36,6 +46,8 @@ def __call__(self, time_steps, sample=True, evaluation=False, ob = deepcopy(ob) if return_on_done: all_dones = np.zeros(env.num_envs, dtype=bool) + else: + all_dones = None for t in range(time_steps): if render: env.render() @@ -53,36 +65,30 @@ def __call__(self, time_steps, sample=True, evaluation=False, sample=sample, **action_kwargs) next_ob, reward, done, info = env.step(action) - next_ob = deepcopy(next_ob) + if render_image: for img, inf in zip(imgs, info): inf['render_image'] = deepcopy(img) - done_idx = np.argwhere(done).flatten() - if done_idx.size > 0 and return_on_done: - # vec env automatically resets the environment when it's done - # so the returned next_ob is not actually the next observation - all_dones[done_idx] = True - - true_done = deepcopy(done) - for iidx, inf in enumerate(info): - true_done[iidx] = true_done[iidx] and not inf.get('TimeLimit.truncated', - False) + true_next_ob, true_done, all_dones = self.get_true_done_next_ob(next_ob, + done, + reward, + info, + all_dones) sd = StepData(ob=ob, - action=deepcopy(action), - action_info=deepcopy(action_info), - next_ob=next_ob, - reward=deepcopy(reward), + action=action, + action_info=action_info, + next_ob=true_next_ob, + reward=reward, done=true_done, - info=deepcopy(info)) + info=info) ob = next_ob traj.add(sd) if return_on_done and np.all(all_dones): break - if not evaluation: + + if get_last_val and not evaluation: last_val = self.agent.get_val(traj[-1].next_ob) traj.add_extra('last_val', torch_to_np(last_val)) + self.obs = ob if not evaluation else None return traj - - def reset(self, *args, **kwargs): - pass diff --git a/easyrl/runner/rnn_runner.py b/easyrl/runner/rnn_runner.py index 526bafa..e4ee2bc 100644 --- a/easyrl/runner/rnn_runner.py +++ b/easyrl/runner/rnn_runner.py @@ -1,9 +1,8 @@ import time from copy import deepcopy - +from collections import deque import numpy as np import torch - from easyrl.runner.base_runner import BasicRunner from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory @@ -11,10 +10,19 @@ class RNNRunner(BasicRunner): + def __init__(self, *args, **kwargs): + super(RNNRunner, self).__init__(*args, **kwargs) + self.hidden_states = None + self.hidden_state_shape = None + + @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, - return_on_done=False, render=False, render_image=False, - sleep_time=0, reset_kwargs=None, action_kwargs=None): + return_on_done=False, render=False, + render_image=False, + sleep_time=0, reset_first=False, + reset_kwargs=None, action_kwargs=None, + 
get_last_val=False): traj = Trajectory() if reset_kwargs is None: reset_kwargs = {} @@ -24,7 +32,11 @@ def __call__(self, time_steps, sample=True, evaluation=False, env = self.eval_env else: env = self.train_env - ob = env.reset(**reset_kwargs) + # In RL^2, we should always reset in the begining of a rollout + if self.obs is None or reset_first or evaluation: + self.reset(**reset_kwargs) + ob = self.obs + hidden_state = self.hidden_states # this is critical for some environments depending # on the returned ob data. use deepcopy() to avoid # adding the same ob to the traj @@ -34,7 +46,8 @@ def __call__(self, time_steps, sample=True, evaluation=False, ob = deepcopy(ob) if return_on_done: all_dones = np.zeros(env.num_envs, dtype=bool) - hidden_state = None + else: + all_dones = None done = None for t in range(time_steps): if render: @@ -48,33 +61,68 @@ def __call__(self, time_steps, sample=True, evaluation=False, action, action_info, hidden_state = self.agent.get_action(ob, sample=sample, hidden_state=hidden_state, - prev_done=done, **action_kwargs) + if self.hidden_state_shape is None: + self.hidden_state_shape = hidden_state.shape next_ob, reward, done, info = env.step(action) - next_ob = deepcopy(next_ob) + if render_image: for img, inf in zip(imgs, info): inf['render_image'] = deepcopy(img) - done_idx = np.argwhere(done).flatten() - if done_idx.size > 0 and return_on_done: - # vec env automatically resets the environment when it's done - # so the returned next_ob is not actually the next observation - all_dones[done_idx] = True + true_next_ob, true_done, all_dones = self.get_true_done_next_ob(next_ob, + done, + reward, + info, + all_dones) + sd = StepData(ob=ob, - action=deepcopy(action), - action_info=deepcopy(action_info), - next_ob=next_ob, - reward=deepcopy(reward), - done=deepcopy(done), - info=deepcopy(info)) + action=action, + action_info=action_info, + next_ob=true_next_ob, + reward=reward, + done=true_done, + info=info, + extra=done, # this is a flag that can tell whether the environment + # is reset or not so that we know whether we need to + # reset the hidden state or not. 
We save it in "extra" + ) ob = next_ob traj.add(sd) if return_on_done and np.all(all_dones): break - if not evaluation: - last_val, _ = self.agent.get_val(traj[-1].next_ob, - hidden_state=hidden_state, - prev_done=done) - traj.add_extra('last_val', torch_to_np(last_val)) + + # the order of next few lines matter, do not exchange + if get_last_val and not evaluation and t == time_steps - 1: + last_val, _ = self.agent.get_val(traj[-1].next_ob, + hidden_state=hidden_state) + traj.add_extra('last_val', torch_to_np(last_val)) + hidden_state = self.check_hidden_state(hidden_state, done=done) + self.obs = ob if not evaluation else None + self.hidden_states = hidden_state.detach() if not evaluation else None return traj + + def reset(self, env=None, *args, **kwargs): + super().reset(env, *args, **kwargs) + self.hidden_states = None + + def get_hidden_state_shape(self): + obs = self.train_env.reset() + done = None + action, action_info, hidden_state = self.agent.get_action(ob, + sample=True, + hidden_state=None, + prev_done=done) + self.hidden_state_shape = hidden_state.shape + return self.hidden_state_shape + + def check_hidden_state(self, hidden_state, done=None): + if done is not None: + # if the last step is the end of an episode, + # then reset hidden state + done_idx = np.argwhere(done).flatten() + if done_idx.size > 0: + ld, b, hz = hidden_state.shape + hidden_state[:, done_idx] = torch.zeros(ld, done_idx.size, hz, + device=hidden_state.device) + return hidden_state diff --git a/easyrl/runner/step_runner.py b/easyrl/runner/single_env_nstep_runner.py similarity index 67% rename from easyrl/runner/step_runner.py rename to easyrl/runner/single_env_nstep_runner.py index 73f456e..b75f6c0 100644 --- a/easyrl/runner/step_runner.py +++ b/easyrl/runner/single_env_nstep_runner.py @@ -1,30 +1,33 @@ import time from copy import deepcopy -import numpy as np import torch from easyrl.runner.base_runner import BasicRunner from easyrl.utils.data import StepData from easyrl.utils.data import Trajectory from easyrl.utils.gym_util import get_render_images - +from easyrl.utils.gym_util import is_time_limit_env +from easyrl.utils.common import list_to_numpy class StepRunner(BasicRunner): # Simulate the environment for T steps, # and in the next call, the environment will continue # from where it's left in the previous call. - # we also assume the environment is wrapped by VecEnv + # only single env (no parallel envs) is supported for now. 
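The `check_hidden_state` logic added to `RNNRunner` above zeroes the slice of the recurrent state belonging to any sub-environment that just finished, so the next episode starts from a fresh hidden state. An illustrative stand-alone sketch (shapes and values are made up for the example):

```python
import numpy as np
import torch

num_layers, num_envs, hidden_size = 1, 4, 64
hidden_state = torch.randn(num_layers, num_envs, hidden_size)  # GRU state: [layers, envs, hidden]
done = np.array([False, True, False, True])

done_idx = np.argwhere(done).flatten().tolist()
if done_idx:
    # zero only the columns of the finished environments
    hidden_state[:, done_idx] = 0.0
```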
+ # we also assume the environment is wrapped by TimeLimit + # from https://github.com/openai/gym/blob/master/gym/wrappers/time_limit.py def __init__(self, agent, env, eval_env=None): super().__init__(agent=agent, env=env, eval_env=eval_env) - self.cur_ob = None + self.obs = None @torch.no_grad() def __call__(self, time_steps, sample=True, evaluation=False, return_on_done=False, render=False, render_image=False, - sleep_time=0, env_reset_kwargs=None, agent_reset_kwargs=None, + sleep_time=0, reset_first=False, + env_reset_kwargs=None, agent_reset_kwargs=None, action_kwargs=None, random_action=False): traj = Trajectory() if env_reset_kwargs is None: @@ -38,14 +41,12 @@ def __call__(self, time_steps, sample=True, evaluation=False, env = self.eval_env else: env = self.train_env - if self.cur_ob is None or evaluation: + if self.obs is None or reset_first or evaluation: self.reset(env=env, env_reset_kwargs=env_reset_kwargs, agent_reset_kwargs=agent_reset_kwargs) - ob = self.cur_ob + ob = self.obs ob = deepcopy(ob) - if return_on_done: - all_dones = np.zeros(env.num_envs, dtype=bool) for t in range(time_steps): if render: env.render() @@ -55,51 +56,41 @@ def __call__(self, time_steps, sample=True, evaluation=False, # get render images at the same time step as ob imgs = get_render_images(env) if random_action: - action = env.random_actions() + action = env.action_space.sample() + if len(action.shape) == 1: + # the first dim is num_envs + action = list_to_numpy(action, expand_dims=0) action_info = dict() else: action, action_info = self.agent.get_action(ob, sample=sample, **action_kwargs) next_ob, reward, done, info = env.step(action) - next_ob = deepcopy(next_ob) if render_image: for img, inf in zip(imgs, info): inf['render_image'] = deepcopy(img) - - done_idx = np.argwhere(done).flatten() - if done_idx.size > 0 and return_on_done: - # vec env automatically resets the environment when it's done - # so the returned next_ob is not actually the next observation - all_dones[done_idx] = True - true_done = deepcopy(done) for iidx, inf in enumerate(info): true_done[iidx] = true_done[iidx] and not inf.get('TimeLimit.truncated', False) sd = StepData(ob=ob, - action=deepcopy(action), - action_info=deepcopy(action_info), + action=action, + action_info=action_info, next_ob=next_ob, reward=reward, done=true_done, info=info) ob = next_ob traj.add(sd) - if return_on_done and np.all(all_dones): + if return_on_done and done: break - self.cur_ob = None if evaluation else deepcopy(ob) + if done: + ob = self.reset(env, env_reset_kwargs, agent_reset_kwargs) + self.obs = None if evaluation else deepcopy(ob) return traj - def reset(self, env=None, env_reset_kwargs=None, agent_reset_kwargs=None): + def reset(self, env=None, *args, **kwargs): if env is None: env = self.train_env - if env_reset_kwargs is None: - env_reset_kwargs = {} - if agent_reset_kwargs is None: - agent_reset_kwargs = {} - ob = env.reset(**env_reset_kwargs) - if hasattr(self.agent, 'reset'): - self.agent.reset(**agent_reset_kwargs) - self.cur_ob = deepcopy(ob) + self.obs = env.reset(*args, **kwargs) return ob diff --git a/easyrl/utils/common.py b/easyrl/utils/common.py index afb2bfd..7ef5015 100644 --- a/easyrl/utils/common.py +++ b/easyrl/utils/common.py @@ -1,3 +1,4 @@ +import importlib import json import numbers import pickle as pkl @@ -10,7 +11,6 @@ import numpy as np import torch import yaml - from easyrl.utils.rl_logger import logger @@ -21,6 +21,29 @@ def set_random_seed(seed): torch.cuda.manual_seed_all(seed) +def module_available(module_path: 
str) -> bool: + """Testing if given module is avalaible in your env. + + Copied from https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/utilities/__init__.py. + + >>> module_available('os') + True + >>> module_available('bla.bla') + False + """ + try: + mods = module_path.split('.') + assert mods, 'nothing given to test' + # it has to be tested as per partets + for i in range(len(mods)): + module_path = '.'.join(mods[:i + 1]) + if importlib.util.find_spec(module_path) is None: + return False + return True + except AttributeError: + return False + + def list_to_numpy(data, expand_dims=None): if isinstance(data, numbers.Number): data = np.array([data]) @@ -133,6 +156,14 @@ def load_from_yaml(file_name): return data +def save_to_yaml(data, file_name): + file_name = pathlib_file(file_name) + if not file_name.parent.exists(): + Path.mkdir(file_name.parent, parents=True) + with file_name.open('w') as f: + yaml.dump(data, f, default_flow_style=False) + + def save_to_pickle(data, file_name): file_name = pathlib_file(file_name) if not file_name.parent.exists(): @@ -238,6 +269,14 @@ def update_from_moments(self, batch_mean, batch_var, batch_count): self.mean, self.var, self.count = update_mean_var_count_from_moments( self.mean, self.var, self.count, batch_mean, batch_var, batch_count) + def get_states(self): + return self.mean, self.var, self.count + + def set_states(self, mean, var, count): + self.mean = mean + self.var = var + self.count = count + def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): delta = batch_mean - mean diff --git a/easyrl/utils/data.py b/easyrl/utils/data.py index 3490d1f..e06e50b 100644 --- a/easyrl/utils/data.py +++ b/easyrl/utils/data.py @@ -19,6 +19,7 @@ class StepData: reward: Any = None done: Any = None info: Any = None + extra: Any = None def __post_init__(self): """ @@ -92,6 +93,10 @@ def next_states(self): def rewards(self): return np.array([step_data.reward for step_data in self.traj_data]) + @property + def step_extras(self): + return np.array([step_data.extra for step_data in self.traj_data]) + @property def raw_rewards(self): if len(self.traj_data) > 0 and 'raw_reward' in self.traj_data[0].info[0]: diff --git a/easyrl/utils/gym_util.py b/easyrl/utils/gym_util.py index d5cf854..2c45818 100644 --- a/easyrl/utils/gym_util.py +++ b/easyrl/utils/gym_util.py @@ -2,6 +2,14 @@ import gym import numpy as np +from easyrl.envs.dummy_vec_env import DummyVecEnv +from easyrl.envs.shmem_vec_env import ShmemVecEnv +from easyrl.envs.timeout import NoTimeOutEnv +from easyrl.envs.vec_normalize import VecNormalize +from easyrl.utils.common import load_from_pickle +from easyrl.utils.common import pathlib_file +from easyrl.utils.common import save_to_pickle +from easyrl.utils.rl_logger import logger from gym.spaces import Box from gym.spaces import Dict from gym.spaces import Discrete @@ -10,11 +18,6 @@ from gym.spaces import Tuple from gym.wrappers.time_limit import TimeLimit -from easyrl.envs.dummy_vec_env import DummyVecEnv -from easyrl.envs.shmem_vec_env import ShmemVecEnv -from easyrl.envs.timeout import TimeOutEnv -from easyrl.utils.rl_logger import logger - def num_space_dim(space): if isinstance(space, Box): @@ -33,7 +36,8 @@ def num_space_dim(space): raise NotImplementedError -def make_vec_env(env_id, num_envs, seed=1, no_timeout=True, env_kwargs=None): +def make_vec_env(env_id, num_envs, seed=1, no_timeout=False, + env_kwargs=None): logger.info(f'Creating {num_envs} environments.') if env_kwargs is 
None: env_kwargs = {} @@ -42,7 +46,7 @@ def make_env(env_id, rank, seed, no_timeout, env_kwargs): def _thunk(): env = gym.make(env_id, **env_kwargs) if no_timeout: - env = TimeOutEnv(env) + env = NoTimeOutEnv(env) env.seed(seed + rank) return env @@ -78,3 +82,23 @@ def is_time_limit_env(env): else: return is_time_limit_env(env.env) return True + + +def save_vec_normalized_env(env, save_dir): + save_dir = pathlib_file(save_dir) + save_file = save_dir.joinpath('vecnorm_env.pkl') + assert isinstance(env, VecNormalize) + data = env.get_states() + save_to_pickle(data, save_file) + + +def load_vec_normalized_env(env, save_dir): + save_dir = pathlib_file(save_dir) + save_file = save_dir.joinpath('vecnorm_env.pkl') + assert isinstance(env, VecNormalize) + data = load_from_pickle(save_file) + env.set_states(data) + + +def get_true_done(done, info): + return done and not info.get('TimeLimit.truncated', False) diff --git a/easyrl/utils/hp_sweeper.py b/easyrl/utils/hp_sweeper.py index 074ff6f..0a249b6 100644 --- a/easyrl/utils/hp_sweeper.py +++ b/easyrl/utils/hp_sweeper.py @@ -8,7 +8,7 @@ from pathlib import Path import GPUtil - +import numpy as np from easyrl.utils.common import load_from_yaml from easyrl.utils.non_block_streamreader import NonBlockingStreamReader as NBSR from easyrl.utils.rl_logger import logger @@ -111,6 +111,7 @@ def get_sweep_cmds(yaml_file): excludeUUID=[]) num_exps = len(cmds) gpus_free_mem = [all_gpus_stats[x].memoryFree for x in gpus_to_use] + sorted_gpu_ids = np.argsort(gpus_free_mem)[::-1] allowable_gpu_jobs = [int(math.floor(x / gpu_mem_per_job)) for x in gpus_free_mem] jobs_run_on_gpu = [0 for i in range(len(gpus_to_use))] can_run_on_gpu = [True for i in range(len(gpus_to_use))] @@ -120,11 +121,13 @@ def get_sweep_cmds(yaml_file): if not any(can_run_on_gpu): logger.warning(f'Run out of GPUs!') break - while not can_run_on_gpu[gpu_id]: + sorted_gpu_id = sorted_gpu_ids[gpu_id] + while not can_run_on_gpu[sorted_gpu_id]: gpu_id = (gpu_id + 1) % len(gpus_to_use) - final_cmds.append(cmds[idx] + f' --device=cuda:{gpus_to_use[gpu_id]}') - jobs_run_on_gpu[gpu_id] += 1 - can_run_on_gpu[gpu_id] = jobs_run_on_gpu[gpu_id] < allowable_gpu_jobs[gpu_id] + sorted_gpu_id = sorted_gpu_ids[gpu_id] + final_cmds.append(cmds[idx] + f' --device=cuda:{gpus_to_use[sorted_gpu_id]}') + jobs_run_on_gpu[sorted_gpu_id] += 1 + can_run_on_gpu[sorted_gpu_id] = jobs_run_on_gpu[sorted_gpu_id] < allowable_gpu_jobs[sorted_gpu_id] gpu_id = (gpu_id + 1) % len(gpus_to_use) return final_cmds @@ -142,8 +145,8 @@ def run_sweep_cmds(cmds): processes.append(p) nbsrs.append(NBSR(p.stdout)) try: + all_done = [False for i in range(len(processes))] while True: - all_done = [False for i in range(len(processes))] for idx, p in enumerate(processes): stime = time.time() proc_print = False @@ -154,17 +157,17 @@ def run_sweep_cmds(cmds): logger.info(f'====================================') logger.info(f'Process {idx}:') proc_print = True - print(lines.decode('utf-8')) + logger.info(lines.decode('utf-8')) if time.time() - stime > 10: break else: break if p.poll() is not None: all_done[idx] = True - break if all(all_done): break time.sleep(2) + logger.info('All processes are completed.') except KeyboardInterrupt: logger.warning('Keyboard interruption.') finally: diff --git a/examples/README.md b/examples/README.md index 097d98b..1728862 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,6 +1,8 @@ ### Hyperparameter Sweep -First, define a yaml file (e.g., `sac_sweeper.yml`) that specifies the search values for each 
hyperparameter. And run the following command: +First, define a yaml file (e.g., `sac_sweeper.yml`) that specifies the search values for each hyperparameter. And run +the following command: + ```bash -hp_sweep --cfg_file sac_sweeper.yml +hpsweep -f sweepers/sac_sweeper.yml ``` \ No newline at end of file diff --git a/examples/ppo.py b/examples/ppo.py index dfce0fa..70e3d61 100644 --- a/examples/ppo.py +++ b/examples/ppo.py @@ -10,7 +10,8 @@ from easyrl.models.diag_gaussian_policy import DiagGaussianPolicy from easyrl.models.mlp import MLP from easyrl.models.value_net import ValueNet -from easyrl.runner.episodic_runner import EpisodicRunner +from easyrl.runner.nstep_runner import EpisodicRunner +from easyrl.utils.common import check_if_run_distributed from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env @@ -29,11 +30,14 @@ def main(): skip_params = [] cfg.alg.restore_cfg(skip_params=skip_params) if cfg.alg.env_name is None: - cfg.alg.env_name = 'HalfCheetah-v2' + cfg.alg.env_name = 'HalfCheetah-v3' + set_random_seed(cfg.alg.seed) + check_if_run_distributed(cfg.alg) env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, - seed=cfg.alg.seed) + seed=cfg.alg.seed, + distributed=cfg.alg.distributed) env.reset() ob_size = env.observation_space.shape[0] @@ -42,6 +46,7 @@ def main(): output_size=64, hidden_act=nn.ReLU, output_act=nn.ReLU) + critic_body = MLP(input_size=ob_size, hidden_sizes=[64], output_size=64, @@ -49,17 +54,21 @@ def main(): output_act=nn.ReLU) if isinstance(env.action_space, gym.spaces.Discrete): act_size = env.action_space.n - actor = CategoricalPolicy(actor_body, action_dim=act_size) + actor = CategoricalPolicy(actor_body, + in_features=64, + action_dim=act_size) elif isinstance(env.action_space, gym.spaces.Box): act_size = env.action_space.shape[0] - actor = DiagGaussianPolicy(actor_body, action_dim=act_size, + actor = DiagGaussianPolicy(actor_body, + in_features=64, + action_dim=act_size, tanh_on_dist=cfg.alg.tanh_on_dist, std_cond_in=cfg.alg.std_cond_in) else: raise TypeError(f'Unknown action space type: {env.action_space}') - critic = ValueNet(critic_body) - agent = PPOAgent(actor, critic) + critic = ValueNet(critic_body, in_features=64) + agent = PPOAgent(actor=actor, critic=critic, env=env) runner = EpisodicRunner(agent=agent, env=env) engine = PPOEngine(agent=agent, runner=runner) diff --git a/examples/rnn_ppo.py b/examples/rnn_ppo.py index 89b0652..64522f8 100644 --- a/examples/rnn_ppo.py +++ b/examples/rnn_ppo.py @@ -14,7 +14,8 @@ from easyrl.runner.rnn_runner import RNNRunner from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env - +from easyrl.envs.vec_normalize import VecNormalize +from easyrl.envs.reward_wrapper import RewardScaler def main(): set_config('ppo') @@ -30,54 +31,55 @@ def main(): skip_params = [] cfg.alg.restore_cfg(skip_params=skip_params) if cfg.alg.env_name is None: - cfg.alg.env_name = 'HalfCheetah-v2' + cfg.alg.env_name = 'HalfCheetah-v3' set_random_seed(cfg.alg.seed) env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, seed=cfg.alg.seed) + + # (1) VecNormalize turns out to be very important in RNN policy for mujoco gym environments + # env = VecNormalize(env, gamma=cfg.alg.rew_discount) + # (2): It works as well if we scale the reward by 0.01. 
Either works + env = RewardScaler(env, scale=0.01) env.reset() ob_size = env.observation_space.shape[0] - actor_body = MLP(input_size=ob_size, - hidden_sizes=[256], - output_size=256, - hidden_act=nn.ReLU, - output_act=nn.ReLU) - actor_body = RNNBase(body_net=actor_body, - rnn_features=256, - in_features=256, - rnn_layers=1, - ) - critic_body = MLP(input_size=ob_size, - hidden_sizes=[256], - output_size=256, - hidden_act=nn.ReLU, - output_act=nn.ReLU) - critic_body = RNNBase(body_net=critic_body, - rnn_features=256, - in_features=256, + ac_body = MLP(input_size=ob_size, + hidden_sizes=[64], + output_size=64, + hidden_act=nn.ELU, + # hid_layer_norm=True, + output_act=None) + ac_rnn_body = RNNBase(body_net=ac_body, + rnn_features=64, + in_features=64, rnn_layers=1, ) if isinstance(env.action_space, gym.spaces.Discrete): act_size = env.action_space.n - actor = RNNCategoricalPolicy(actor_body, action_dim=act_size) + actor = RNNCategoricalPolicy(ac_rnn_body, action_dim=act_size) elif isinstance(env.action_space, gym.spaces.Box): act_size = env.action_space.shape[0] - actor = RNNDiagGaussianPolicy(actor_body, action_dim=act_size, + actor = RNNDiagGaussianPolicy(ac_rnn_body, + action_dim=act_size, tanh_on_dist=cfg.alg.tanh_on_dist, + init_log_std=0., std_cond_in=cfg.alg.std_cond_in) else: raise TypeError(f'Unknown action space ' f'type: {env.action_space}') - critic = RNNValueNet(critic_body) - agent = PPORNNAgent(actor, critic) + critic = RNNValueNet(ac_rnn_body) + agent = PPORNNAgent(actor=actor, critic=critic, + env=env, same_body=True) runner = RNNRunner(agent=agent, env=env) engine = PPORNNEngine(agent=agent, runner=runner) if not cfg.alg.test: engine.train() else: + # set env.training to False so that the states in the VecNormalize env are not updated + env.training = False stat_info, raw_traj_info = engine.eval(render=cfg.alg.render, save_eval_traj=cfg.alg.save_test_traj, eval_num=cfg.alg.test_num, diff --git a/examples/sac.py b/examples/sac.py index 63df734..c072ab2 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -1,6 +1,5 @@ import torch import torch.nn as nn -from easyrl.runner.step_runner import StepRunner from easyrl.agents.sac_agent import SACAgent from easyrl.configs import cfg @@ -11,6 +10,8 @@ from easyrl.models.mlp import MLP from easyrl.models.value_net import ValueNet from easyrl.replays.circular_buffer import CyclicBuffer +from easyrl.runner.nstep_runner import EpisodicRunner +from easyrl.utils.common import check_if_run_distributed from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env @@ -30,17 +31,19 @@ def main(): skip_params = [] cfg.alg.restore_cfg(skip_params=skip_params) if cfg.alg.env_name is None: - cfg.alg.env_name = 'HalfCheetah-v2' + cfg.alg.env_name = 'HalfCheetah-v3' if not cfg.alg.test: cfg.alg.test_num = 10 set_random_seed(cfg.alg.seed) + check_if_run_distributed(cfg.alg) env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, seed=cfg.alg.seed) # env = SingleEnvWrapper(gym.make(cfg.alg.env_name)) eval_env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, - seed=cfg.alg.seed) + seed=cfg.alg.seed, + distributed=cfg.alg.distributed) ob_size = env.observation_space.shape[0] act_size = env.action_space.shape[0] @@ -66,8 +69,8 @@ def main(): q1 = ValueNet(q1_body) q2 = ValueNet(q2_body) memory = CyclicBuffer(capacity=cfg.alg.replay_size) - agent = SACAgent(actor, q1=q1, q2=q2, env=env, memory=memory) - runner = StepRunner(agent=agent, env=env, eval_env=eval_env) + agent = SACAgent(actor=actor, q1=q1, q2=q2, env=env, 
memory=memory) + runner = EpisodicRunner(agent=agent, env=env, eval_env=eval_env) engine = SACEngine(agent=agent, runner=runner) diff --git a/setup.py b/setup.py index 6576840..e9cb3fd 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,6 @@ def read_requirements_file(filename): packages=easyrl_pkgs, install_requires=read_requirements_file('requirements.txt'), entry_points={ - 'console_scripts': ['hp_sweep=easyrl.utils.hp_sweeper:main'] + 'console_scripts': ['hpsweep=easyrl.utils.hp_sweeper:main'] } ) From 44888683f855d4b83094ad0842653ee915373b93 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Mon, 28 Dec 2020 11:11:35 -0500 Subject: [PATCH 25/35] save vecnorm env --- easyrl/agents/ppo_agent.py | 2 ++ easyrl/agents/sac_agent.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/easyrl/agents/ppo_agent.py b/easyrl/agents/ppo_agent.py index 7dc225d..f8b0b4b 100644 --- a/easyrl/agents/ppo_agent.py +++ b/easyrl/agents/ppo_agent.py @@ -220,6 +220,7 @@ def decay_clip_range(self): cfg.alg.clip_range -= self.clip_range_decay_rate def save_model(self, is_best=False, step=None): + self.save_env(cfg.alg.model_dir) data_to_save = { 'step': step, 'actor_state_dict': self.actor.state_dict(), @@ -234,6 +235,7 @@ def save_model(self, is_best=False, step=None): save_model(data_to_save, cfg.alg, is_best=is_best, step=step) def load_model(self, step=None, pretrain_model=None): + self.load_env(cfg.alg.model_dir) ckpt_data = load_ckpt_data(cfg.alg, step=step, pretrain_model=pretrain_model) load_state_dict(self.actor, diff --git a/easyrl/agents/sac_agent.py b/easyrl/agents/sac_agent.py index c948894..596daca 100644 --- a/easyrl/agents/sac_agent.py +++ b/easyrl/agents/sac_agent.py @@ -202,6 +202,7 @@ def eval_mode(self): self.q2.eval() def save_model(self, is_best=False, step=None): + self.save_env(cfg.alg.model_dir) data_to_save = { 'step': step, 'actor_state_dict': self.actor.state_dict(), @@ -221,6 +222,7 @@ def save_model(self, is_best=False, step=None): logger.info('The replay buffer is saved.') def load_model(self, step=None, pretrain_model=None): + self.load_env(cfg.alg.model_dir) ckpt_data = load_ckpt_data(cfg.alg, step=step, pretrain_model=pretrain_model) load_state_dict(self.actor, From c94085b89140196badb79d00660cb673b3e6be2a Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Fri, 29 Jan 2021 09:45:02 -0500 Subject: [PATCH 26/35] Update sac_sweeper.yml --- examples/sac_sweeper.yml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/examples/sac_sweeper.yml b/examples/sac_sweeper.yml index 4f183c6..ad46909 100644 --- a/examples/sac_sweeper.yml +++ b/examples/sac_sweeper.yml @@ -6,25 +6,8 @@ hparams: save_dir_root: data max_steps: 3000000 env_name: [Walker2d-v3, Hopper-v3, Humanoid-v3] - # no_pretrain_actor: True - # warmup_steps: 256 - # pretrain_model: None - # freeze_q: True - # polyak: [0.99, 0.92] - # alpha: [0.] 
- # opt_interval: [1000] - # opt_num: [500, 1000] - # batch_size: [256, 512] - # no_q2: [True] - # no_qent: [True, False] - # no_pent: [True, False] - # no_tgt: [True, False] - # hard_update: [1000, 10000, 5000] seed: [1, 0] - # actor_lr: [0.001, 0.0003] - # critic_lr: [0.001, 0.0003] - # alpha: [0.2, None] # default_true: # tgt_sample: [True, False] default_false: From ec06e31643c865f7e1083d17f20b5aec76daecbf Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Wed, 3 Feb 2021 21:18:31 -0500 Subject: [PATCH 27/35] Update ppo.py --- examples/ppo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/ppo.py b/examples/ppo.py index 70e3d61..d4f9363 100644 --- a/examples/ppo.py +++ b/examples/ppo.py @@ -11,7 +11,6 @@ from easyrl.models.mlp import MLP from easyrl.models.value_net import ValueNet from easyrl.runner.nstep_runner import EpisodicRunner -from easyrl.utils.common import check_if_run_distributed from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env @@ -33,7 +32,6 @@ def main(): cfg.alg.env_name = 'HalfCheetah-v3' set_random_seed(cfg.alg.seed) - check_if_run_distributed(cfg.alg) env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, seed=cfg.alg.seed, From e3b18fd2659e3464bfe75966bbf93b6a5f5aab0d Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Mon, 15 Mar 2021 15:43:28 -0400 Subject: [PATCH 28/35] Update sac.py --- examples/sac.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/sac.py b/examples/sac.py index c072ab2..ce1b0f0 100644 --- a/examples/sac.py +++ b/examples/sac.py @@ -11,7 +11,6 @@ from easyrl.models.value_net import ValueNet from easyrl.replays.circular_buffer import CyclicBuffer from easyrl.runner.nstep_runner import EpisodicRunner -from easyrl.utils.common import check_if_run_distributed from easyrl.utils.common import set_random_seed from easyrl.utils.gym_util import make_vec_env @@ -35,15 +34,13 @@ def main(): if not cfg.alg.test: cfg.alg.test_num = 10 set_random_seed(cfg.alg.seed) - check_if_run_distributed(cfg.alg) env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, seed=cfg.alg.seed) # env = SingleEnvWrapper(gym.make(cfg.alg.env_name)) eval_env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, - seed=cfg.alg.seed, - distributed=cfg.alg.distributed) + seed=cfg.alg.seed) ob_size = env.observation_space.shape[0] act_size = env.action_space.shape[0] From f380449e9c431ac8f0dda0ab868e85504c555d26 Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Mon, 15 Mar 2021 17:01:32 -0400 Subject: [PATCH 29/35] Update ppo.py --- examples/ppo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/ppo.py b/examples/ppo.py index d4f9363..dbae29d 100644 --- a/examples/ppo.py +++ b/examples/ppo.py @@ -34,8 +34,7 @@ def main(): set_random_seed(cfg.alg.seed) env = make_vec_env(cfg.alg.env_name, cfg.alg.num_envs, - seed=cfg.alg.seed, - distributed=cfg.alg.distributed) + seed=cfg.alg.seed) env.reset() ob_size = env.observation_space.shape[0] From 9577418bcaac5c831c9cfe6052469781215c4781 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Tue, 16 Mar 2021 12:07:06 -0400 Subject: [PATCH 30/35] add success log --- easyrl/engine/ppo_engine.py | 5 ++ easyrl/envs/subproc_vec_env.py | 138 +++++++++++++++++++++++++++++++++ easyrl/utils/gym_util.py | 30 +++++-- 3 files changed, 165 insertions(+), 8 deletions(-) create mode 100644 easyrl/envs/subproc_vec_env.py diff --git 
a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index 65bbd1a..6b5b73f 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -58,6 +58,7 @@ def eval(self, render=False, save_eval_traj=False, eval_num=1, time_steps = [] rets = [] lst_step_infos = [] + successes = [] if no_tqdm: disable_tqdm = bool(no_tqdm) else: @@ -80,6 +81,8 @@ def eval(self, render=False, save_eval_traj=False, eval_num=1, time_steps.extend(tsps) if save_eval_traj: save_traj(traj, cfg.alg.eval_dir) + if 'success' in infos[0][0]: + successes.extend([infos[tsps[ej] - 1][ej]['success'] for ej in range(rewards.shape[1])]) raw_traj_info = {'return': rets, 'episode_length': time_steps, @@ -91,6 +94,8 @@ def eval(self, render=False, save_eval_traj=False, eval_num=1, val_stats = get_list_stats(val) for sk, sv in val_stats.items(): log_info['eval/' + key + '/' + sk] = sv + if len(successes) > 0: + log_info['eval/success'] = np.mean(successes) if smooth: if self.smooth_eval_return is None: self.smooth_eval_return = log_info['eval/return/mean'] diff --git a/easyrl/envs/subproc_vec_env.py b/easyrl/envs/subproc_vec_env.py new file mode 100644 index 0000000..168c89f --- /dev/null +++ b/easyrl/envs/subproc_vec_env.py @@ -0,0 +1,138 @@ +import multiprocessing as mp + +import numpy as np +from easyrl.envs.vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars + + +def worker(remote, parent_remote, env_fn_wrappers): + def step_env(env, action): + ob, reward, done, info = env.step(action) + if done: + info['true_next_ob'] = ob + ob = env.reset() + return ob, reward, done, info + + parent_remote.close() + envs = [env_fn_wrapper() for env_fn_wrapper in env_fn_wrappers.x] + try: + while True: + cmd, data = remote.recv() + if cmd == 'step': + remote.send([step_env(env, action) for env, action in zip(envs, data)]) + elif cmd == 'reset': + remote.send([env.reset() for env in envs]) + elif cmd == 'render': + remote.send([env.render(mode='rgb_array') for env in envs]) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces_spec': + remote.send(CloudpickleWrapper((envs[0].observation_space, envs[0].action_space, envs[0].spec))) + else: + raise NotImplementedError + except KeyboardInterrupt: + print('SubprocVecEnv worker: got KeyboardInterrupt') + finally: + for env in envs: + env.close() + + +class SubprocVecEnv(VecEnv): + """ + VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes. + Recommended to use when num_envs > 1 and step() can be a bottleneck. + """ + def __init__(self, env_fns, spaces=None, context='spawn', in_series=1): + """ + Arguments: + env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable + in_series: number of environments to run in series in a single process + (e.g. 
when len(env_fns) == 12 and in_series == 3, it will run 4 processes, each running 3 envs in series) + """ + self.waiting = False + self.closed = False + self.in_series = in_series + nenvs = len(env_fns) + assert nenvs % in_series == 0, "Number of envs must be divisible by number of envs to run in series" + self.nremotes = nenvs // in_series + env_fns = np.array_split(env_fns, self.nremotes) + ctx = mp.get_context(context) + self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(self.nremotes)]) + self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = True # if the main process crashes, we should not cause things to hang + with clear_mpi_env_vars(): + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces_spec', None)) + observation_space, action_space, self.spec = self.remotes[0].recv().x + self.viewer = None + VecEnv.__init__(self, nenvs, observation_space, action_space) + + def step_async(self, actions): + self._assert_not_closed() + actions = np.array_split(actions, self.nremotes) + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + self._assert_not_closed() + results = [remote.recv() for remote in self.remotes] + results = _flatten_list(results) + self.waiting = False + obs, rews, dones, infos = zip(*results) + return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + self._assert_not_closed() + for remote in self.remotes: + remote.send(('reset', None)) + obs = [remote.recv() for remote in self.remotes] + obs = _flatten_list(obs) + return _flatten_obs(obs) + + def close_extras(self): + self.closed = True + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + + def get_images(self): + self._assert_not_closed() + for pipe in self.remotes: + pipe.send(('render', None)) + imgs = [pipe.recv() for pipe in self.remotes] + imgs = _flatten_list(imgs) + return imgs + + def _assert_not_closed(self): + assert not self.closed, "Trying to operate on a SubprocVecEnv after calling close()" + + def __del__(self): + if not self.closed: + self.close() + +def _flatten_obs(obs): + assert isinstance(obs, (list, tuple)) + assert len(obs) > 0 + + if isinstance(obs[0], dict): + keys = obs[0].keys() + return {k: np.stack([o[k] for o in obs]) for k in keys} + else: + return np.stack(obs) + +def _flatten_list(l): + assert isinstance(l, (list, tuple)) + assert len(l) > 0 + assert all([len(l_) > 0 for l_ in l]) + + return [l__ for l_ in l for l__ in l_] \ No newline at end of file diff --git a/easyrl/utils/gym_util.py b/easyrl/utils/gym_util.py index 2c45818..94d6604 100644 --- a/easyrl/utils/gym_util.py +++ b/easyrl/utils/gym_util.py @@ -4,6 +4,7 @@ import numpy as np from easyrl.envs.dummy_vec_env import DummyVecEnv from easyrl.envs.shmem_vec_env import ShmemVecEnv +from easyrl.envs.subproc_vec_env import SubprocVecEnv from easyrl.envs.timeout import NoTimeOutEnv from easyrl.envs.vec_normalize import VecNormalize from easyrl.utils.common import load_from_pickle @@ -35,28 +36,41 @@ def num_space_dim(space): else: raise NotImplementedError - -def make_vec_env(env_id, num_envs, seed=1, no_timeout=False, - env_kwargs=None): +def make_vec_env(env_id=None, num_envs=1, seed=1, env_func=None, 
no_timeout=False, + env_kwargs=None, distributed=False, + extra_wrapper=None, wrapper_kwargs=None): logger.info(f'Creating {num_envs} environments.') if env_kwargs is None: env_kwargs = {} - - def make_env(env_id, rank, seed, no_timeout, env_kwargs): + if wrapper_kwargs is None: + wrapper_kwargs = {} + if distributed: + import horovod.torch as hvd + seed_offset = hvd.rank() * 100000 + seed += seed_offset + + def make_env(env_id, rank, seed, no_timeout, env_kwargs, extra_wrapper, wrapper_kwargs): def _thunk(): - env = gym.make(env_id, **env_kwargs) + if env_func is not None: + env = env_func(**env_kwargs) + else: + env = gym.make(env_id, **env_kwargs) if no_timeout: env = NoTimeOutEnv(env) + if extra_wrapper is not None: + env = extra_wrapper(env, **wrapper_kwargs) env.seed(seed + rank) return env - return _thunk envs = [make_env(env_id, idx, seed, no_timeout, - env_kwargs) for idx in range(num_envs)] + env_kwargs, + extra_wrapper, + wrapper_kwargs + ) for idx in range(num_envs)] if num_envs > 1: envs = ShmemVecEnv(envs, context='spawn') else: From cc0b9fd4372a6911164fd4b0cc93485c4f55c194 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Tue, 16 Mar 2021 12:15:33 -0400 Subject: [PATCH 31/35] add success log --- easyrl/configs/basic_config.py | 1 + easyrl/runner/base_runner.py | 16 ++++++++++------ easyrl/runner/nstep_runner.py | 3 ++- easyrl/runner/rnn_runner.py | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/easyrl/configs/basic_config.py b/easyrl/configs/basic_config.py index 2423c07..52ec339 100644 --- a/easyrl/configs/basic_config.py +++ b/easyrl/configs/basic_config.py @@ -17,6 +17,7 @@ class BasicConfig: save_dir_root: str = None eval_interval: int = 100 log_interval: int = 10 + deque_size: int = 100 weight_decay: float = 0.00 max_grad_norm: float = None batch_size: int = 256 diff --git a/easyrl/runner/base_runner.py b/easyrl/runner/base_runner.py index 09c1443..1a97789 100644 --- a/easyrl/runner/base_runner.py +++ b/easyrl/runner/base_runner.py @@ -2,6 +2,7 @@ from copy import deepcopy from easyrl.utils.gym_util import get_true_done from collections import deque +from easyrl.configs import cfg class BasicRunner: def __init__(self, agent, env, eval_env=None): @@ -10,8 +11,9 @@ def __init__(self, agent, env, eval_env=None): self.num_train_envs = env.num_envs self.obs = None self.eval_env = env if eval_env is None else eval_env - self.train_ep_return = deque(maxlen=100) - self.train_ep_len = deque(maxlen=100) + self.train_ep_return = deque(maxlen=cfg.alg.deque_size) + self.train_ep_len = deque(maxlen=cfg.alg.deque_size) + self.train_success = deque(maxlen=cfg.alg.deque_size) self.reset_record() def __call__(self, **kwargs): @@ -27,7 +29,7 @@ def reset_record(self): self.cur_ep_len = np.zeros(self.num_train_envs) self.cur_ep_return = np.zeros(self.num_train_envs) - def get_true_done_next_ob(self, next_ob, done, reward, info, all_dones): + def get_true_done_next_ob(self, next_ob, done, reward, info, all_dones, skip_record=False): done_idx = np.argwhere(done).flatten() self.cur_ep_len += 1 if 'raw_reward' in info[0]: @@ -41,14 +43,16 @@ def get_true_done_next_ob(self, next_ob, done, reward, info, all_dones): true_next_ob[done_idx] = np.array([info[i]['true_next_ob'] for i in done_idx]) if all_dones is not None: all_dones[done_idx] = True - for dix in done_idx: - self.train_ep_return.append(self.cur_ep_return[dix]) - self.train_ep_len.append(self.cur_ep_len[dix]) self.cur_ep_return[done_idx] = 0 self.cur_ep_len[done_idx] = 0 true_done = deepcopy(done) for iidx, inf in 
enumerate(info): true_done[iidx] = get_true_done(true_done[iidx], inf) + if not skip_record: + self.train_ep_return.extend([self.cur_ep_return[dix] for dix in done_idx]) + self.train_ep_len.extend([self.cur_ep_len[dix] for dix in done_idx]) + if 'success' in info[0]: + self.train_success.extend([info[i]['success'] for i in done_idx]) else: true_next_ob = next_ob true_done = done diff --git a/easyrl/runner/nstep_runner.py b/easyrl/runner/nstep_runner.py index 097d1d6..e41019d 100644 --- a/easyrl/runner/nstep_runner.py +++ b/easyrl/runner/nstep_runner.py @@ -74,7 +74,8 @@ def __call__(self, time_steps, sample=True, evaluation=False, done, reward, info, - all_dones) + all_dones, + skip_record=evaluation) sd = StepData(ob=ob, action=action, action_info=action_info, diff --git a/easyrl/runner/rnn_runner.py b/easyrl/runner/rnn_runner.py index e4ee2bc..92f8243 100644 --- a/easyrl/runner/rnn_runner.py +++ b/easyrl/runner/rnn_runner.py @@ -74,7 +74,8 @@ def __call__(self, time_steps, sample=True, evaluation=False, done, reward, info, - all_dones) + all_dones, + skip_record=evaluation) sd = StepData(ob=ob, action=action, From 27adbaa888d629987bf46b62a0053a7341a3b085 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Tue, 16 Mar 2021 12:31:22 -0400 Subject: [PATCH 32/35] log success --- easyrl/engine/basic_engine.py | 4 +++- easyrl/engine/ppo_engine.py | 21 --------------------- easyrl/engine/sac_engine.py | 26 -------------------------- 3 files changed, 3 insertions(+), 48 deletions(-) diff --git a/easyrl/engine/basic_engine.py b/easyrl/engine/basic_engine.py index 393d245..c1e56ea 100644 --- a/easyrl/engine/basic_engine.py +++ b/easyrl/engine/basic_engine.py @@ -54,7 +54,6 @@ def get_train_log(self, optim_infos, traj=None): k_stats = get_list_stats([inf[key] for inf in optim_infos if key in inf]) for sk, sv in k_stats.items(): log_info[f'{key}/' + sk] = sv - if traj is not None: actions_stats = get_list_stats(traj.actions) for sk, sv in actions_stats.items(): @@ -65,6 +64,9 @@ def get_train_log(self, optim_infos, traj=None): for sk, sv in ep_returns_stats.items(): log_info['episode_return/' + sk] = sv + if len(self.runner.train_success) > 0: + log_info['episode_success'] = np.mean(self.runner.train_success) + train_log_info = dict() for key, val in log_info.items(): train_log_info['train/' + key] = val diff --git a/easyrl/engine/ppo_engine.py b/easyrl/engine/ppo_engine.py index 6b5b73f..e67e56b 100644 --- a/easyrl/engine/ppo_engine.py +++ b/easyrl/engine/ppo_engine.py @@ -165,24 +165,3 @@ def cal_advantages(self, traj): dones=traj.dones) return adv - def get_train_log(self, optim_infos, traj): - log_info = dict() - for key in optim_infos[0].keys(): - log_info[key] = np.mean([inf[key] for inf in optim_infos if key in inf]) - t1 = time.perf_counter() - actions_stats = get_list_stats(traj.actions) - for sk, sv in actions_stats.items(): - log_info['rollout_action/' + sk] = sv - log_info['optim_time'] = t1 - self.optim_stime - log_info['rollout_steps_per_iter'] = traj.total_steps - ep_returns = list(chain(*traj.episode_returns)) - for epr in ep_returns: - self.train_ep_return.append(epr) - ep_returns_stats = get_list_stats(self.train_ep_return) - for sk, sv in ep_returns_stats.items(): - log_info['episode_return/' + sk] = sv - - train_log_info = dict() - for key, val in log_info.items(): - train_log_info['train/' + key] = val - return train_log_info diff --git a/easyrl/engine/sac_engine.py b/easyrl/engine/sac_engine.py index a218ae1..fdbf790 100644 --- a/easyrl/engine/sac_engine.py +++ 
b/easyrl/engine/sac_engine.py @@ -129,32 +129,6 @@ def train_once(self): optim_infos.append(optim_info) return self.get_train_log(optim_infos) - def get_train_log(self, optim_infos): - log_info = dict() - vector_keys = set() - scalar_keys = set() - for oinf in optim_infos: - for key in oinf.keys(): - if 'vec_' in key: - vector_keys.add(key) - else: - scalar_keys.add(key) - - for key in scalar_keys: - log_info[key] = np.mean([inf[key] for inf in optim_infos if key in inf]) - - for key in vector_keys: - k_stats = get_list_stats([inf[key] for inf in optim_infos if key in inf]) - for sk, sv in k_stats.items(): - log_info[f'{key}/' + sk] = sv - - t1 = time.perf_counter() - log_info['optim_time'] = t1 - self.optim_stime - train_log_info = dict() - for key, val in log_info.items(): - train_log_info['train/' + key] = val - return train_log_info - def add_traj_to_memory(self, traj): obs = traj.obs actions = traj.actions From 9bc4122351191a683bb4797559cded78f6505956 Mon Sep 17 00:00:00 2001 From: taochenshh Date: Tue, 16 Mar 2021 14:50:47 -0400 Subject: [PATCH 33/35] fix return log --- easyrl/runner/base_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/easyrl/runner/base_runner.py b/easyrl/runner/base_runner.py index 1a97789..3ab97a1 100644 --- a/easyrl/runner/base_runner.py +++ b/easyrl/runner/base_runner.py @@ -43,8 +43,7 @@ def get_true_done_next_ob(self, next_ob, done, reward, info, all_dones, skip_rec true_next_ob[done_idx] = np.array([info[i]['true_next_ob'] for i in done_idx]) if all_dones is not None: all_dones[done_idx] = True - self.cur_ep_return[done_idx] = 0 - self.cur_ep_len[done_idx] = 0 + true_done = deepcopy(done) for iidx, inf in enumerate(info): true_done[iidx] = get_true_done(true_done[iidx], inf) @@ -53,6 +52,8 @@ def get_true_done_next_ob(self, next_ob, done, reward, info, all_dones, skip_rec self.train_ep_len.extend([self.cur_ep_len[dix] for dix in done_idx]) if 'success' in info[0]: self.train_success.extend([info[i]['success'] for i in done_idx]) + self.cur_ep_return[done_idx] = 0 + self.cur_ep_len[done_idx] = 0 else: true_next_ob = next_ob true_done = done From 47d8b1697c4cb0ca640f60b13aa62a90782c0747 Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Thu, 18 Mar 2021 11:39:03 -0400 Subject: [PATCH 34/35] Update hp_sweeper.py --- easyrl/utils/hp_sweeper.py | 76 +++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/easyrl/utils/hp_sweeper.py b/easyrl/utils/hp_sweeper.py index 0a249b6..d727aca 100644 --- a/easyrl/utils/hp_sweeper.py +++ b/easyrl/utils/hp_sweeper.py @@ -85,7 +85,7 @@ def boolean_cmd(cmd, field, val, default_false=True): return cmd -def get_sweep_cmds(yaml_file): +def get_sweep_cmds(yaml_file, use_gpu=True): configs = load_from_yaml(yaml_file) base_cmd = configs['cmd'] hparams = configs['hparams'] @@ -96,39 +96,42 @@ def get_sweep_cmds(yaml_file): cmd = base_cmd + ' ' + cmd_for_hparams(hps) cmds.append(cmd) - all_gpus_stats = GPUtil.getGPUs() - exclude_gpus = configs['exclude_gpus'] - gpu_mem_per_job = configs['gpu_memory_per_job'] - gpu_mem_pct_per_job = float(gpu_mem_per_job) / all_gpus_stats[0].memoryTotal - if exclude_gpus == 'None': - exclude_gpus = [] - gpus_to_use = GPUtil.getAvailable(order='first', - limit=100, - maxLoad=0.8, - maxMemory=1 - gpu_mem_pct_per_job, - includeNan=False, - excludeID=exclude_gpus, - excludeUUID=[]) - num_exps = len(cmds) - gpus_free_mem = [all_gpus_stats[x].memoryFree for x in gpus_to_use] - sorted_gpu_ids = 
np.argsort(gpus_free_mem)[::-1] - allowable_gpu_jobs = [int(math.floor(x / gpu_mem_per_job)) for x in gpus_free_mem] - jobs_run_on_gpu = [0 for i in range(len(gpus_to_use))] - can_run_on_gpu = [True for i in range(len(gpus_to_use))] - gpu_id = 0 final_cmds = [] - for idx in range(num_exps): - if not any(can_run_on_gpu): - logger.warning(f'Run out of GPUs!') - break - sorted_gpu_id = sorted_gpu_ids[gpu_id] - while not can_run_on_gpu[sorted_gpu_id]: - gpu_id = (gpu_id + 1) % len(gpus_to_use) + if use_gpu: + all_gpus_stats = GPUtil.getGPUs() + exclude_gpus = configs['exclude_gpus'] + gpu_mem_per_job = configs['gpu_memory_per_job'] + gpu_mem_pct_per_job = float(gpu_mem_per_job) / all_gpus_stats[0].memoryTotal + if exclude_gpus == 'None': + exclude_gpus = [] + gpus_to_use = GPUtil.getAvailable(order='first', + limit=100, + maxLoad=0.8, + maxMemory=1 - gpu_mem_pct_per_job, + includeNan=False, + excludeID=exclude_gpus, + excludeUUID=[]) + num_exps = len(cmds) + gpus_free_mem = [all_gpus_stats[x].memoryFree for x in gpus_to_use] + sorted_gpu_ids = np.argsort(gpus_free_mem)[::-1] + allowable_gpu_jobs = [int(math.floor(x / gpu_mem_per_job)) for x in gpus_free_mem] + jobs_run_on_gpu = [0 for i in range(len(gpus_to_use))] + can_run_on_gpu = [True for i in range(len(gpus_to_use))] + gpu_id = 0 + for idx in range(num_exps): + if not any(can_run_on_gpu): + logger.warning(f'Run out of GPUs!') + break sorted_gpu_id = sorted_gpu_ids[gpu_id] - final_cmds.append(cmds[idx] + f' --device=cuda:{gpus_to_use[sorted_gpu_id]}') - jobs_run_on_gpu[sorted_gpu_id] += 1 - can_run_on_gpu[sorted_gpu_id] = jobs_run_on_gpu[sorted_gpu_id] < allowable_gpu_jobs[sorted_gpu_id] - gpu_id = (gpu_id + 1) % len(gpus_to_use) + while not can_run_on_gpu[sorted_gpu_id]: + gpu_id = (gpu_id + 1) % len(gpus_to_use) + sorted_gpu_id = sorted_gpu_ids[gpu_id] + final_cmds.append(cmds[idx] + f' --device=cuda:{gpus_to_use[sorted_gpu_id]}') + jobs_run_on_gpu[sorted_gpu_id] += 1 + can_run_on_gpu[sorted_gpu_id] = jobs_run_on_gpu[sorted_gpu_id] < allowable_gpu_jobs[sorted_gpu_id] + gpu_id = (gpu_id + 1) % len(gpus_to_use) + else: + final_cmds = cmds return final_cmds @@ -139,9 +142,13 @@ def run_sweep_cmds(cmds): nbsrs = [] for idx, cmd in enumerate(cmds): logger.info(f'CMD_{idx}:{cmd}') - p = subprocess.Popen(shlex.split(cmd), + p = subprocess.Popen(cmd, + shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + # p = subprocess.Popen(shlex.split(cmd), + # stderr=subprocess.STDOUT, + # stdout=subprocess.PIPE) processes.append(p) nbsrs.append(NBSR(p.stdout)) try: @@ -181,8 +188,9 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('--cfg_file', '-f', type=str, required=True, help='config file (yaml)') + parser.add_argument('--cpu', action='store_true', help='not use gpu') args = parser.parse_args() - cmds = get_sweep_cmds(args.cfg_file) + cmds = get_sweep_cmds(args.cfg_file, use_gpu=not args.cpu) run_sweep_cmds(cmds) From a0add65d55fdda13380f831fefc48d81515e5175 Mon Sep 17 00:00:00 2001 From: Tao <15166943+taochenshh@users.noreply.github.com> Date: Sat, 20 Mar 2021 22:40:20 -0400 Subject: [PATCH 35/35] Update common.py --- easyrl/utils/common.py | 97 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/easyrl/utils/common.py b/easyrl/utils/common.py index 7ef5015..e8edb57 100644 --- a/easyrl/utils/common.py +++ b/easyrl/utils/common.py @@ -11,9 +11,26 @@ import numpy as np import torch import yaml + from easyrl.utils.rl_logger import logger +def get_all_subdirs(directory): 
+ directory = pathlib_file(directory) + folders = list(directory.iterdir()) + folders = [x for x in folders if x.is_dir()] + return folders + + +def get_all_files_with_suffix(directory, suffix): + directory = pathlib_file(directory) + if not suffix.startswith('.'): + suffix = '.' + suffix + files = directory.glob(f'**/*{suffix}') + files = [x for x in files if x.is_file() and x.suffix == suffix] + return files + + def set_random_seed(seed): np.random.seed(seed) torch.manual_seed(seed) @@ -21,6 +38,11 @@ def set_random_seed(seed): torch.cuda.manual_seed_all(seed) +def chunker_list(seq_list, nchunks): + # split the list into n parts/chunks + return [seq_list[i::nchunks] for i in range(nchunks)] + + def module_available(module_path: str) -> bool: """Testing if given module is avalaible in your env. @@ -44,6 +66,50 @@ def module_available(module_path: str) -> bool: return False +def check_if_run_distributed(cfg): + from easyrl import HOROVOD_AVAILABLE + if HOROVOD_AVAILABLE: + import horovod.torch as hvd + hvd.init() + if hvd.size() > 1: + cfg.distributed = True + if cfg.distributed and not HOROVOD_AVAILABLE: + logger.error('Horovod is not installed! Will not run in distributed training') + distributed = HOROVOD_AVAILABLE and cfg.distributed + cfg.distributed = distributed + if distributed: + gpu_id = hvd.local_rank() + cfg.gpu_shift + if cfg.gpus is not None: + gpu_id = cfg.gpus[gpu_id] + logger.info(f'Rank {hvd.local_rank()} GPU ID: {gpu_id}') + torch.cuda.set_device(gpu_id) + logger.info(f'Using Horovod for distributed training, number of processes:{hvd.size()}') + return distributed + + +def is_dist_and_root_rank(cfg): + if cfg.distributed: + import horovod.torch as hvd + if hvd.rank() == 0: + return True + return False + + +def is_dist_not_root_rank(cfg): + if cfg.distributed: + import horovod.torch as hvd + if hvd.rank() != 0: + return True + return False + + +def get_horovod_size(cfg): + if cfg.distributed: + import horovod.torch as hvd + return hvd.size() + return 0 + + def list_to_numpy(data, expand_dims=None): if isinstance(data, numbers.Number): data = np.array([data]) @@ -58,8 +124,10 @@ def save_traj(traj, save_dir): save_dir = pathlib_file(save_dir) if not save_dir.exists(): Path.mkdir(save_dir, parents=True) + save_ob = traj[0].ob is not None save_state = traj[0].state is not None - ob_is_state = len(np.array(traj[0].ob[0]).shape) <= 1 + if save_ob: + ob_is_state = len(traj[0].ob[0].shape) <= 1 infos = traj.infos action_infos = traj.action_infos actions = traj.actions @@ -67,7 +135,7 @@ def save_traj(traj, save_dir): sub_dirs = sorted([x for x in save_dir.iterdir() if x.is_dir()]) folder_idx = len(sub_dirs) for ei in range(traj.num_envs): - ei_save_dir = save_dir.joinpath('{:06d}'.format(folder_idx)) + ei_save_dir = save_dir.joinpath(f'{folder_idx:06d}') ei_render_imgs = [] concise_info = {} for t in range(tsps[ei]): @@ -85,19 +153,21 @@ def save_traj(traj, save_dir): v = v[ei].tolist() c_info[k] = v concise_info[t] = c_info + if 'success' in concise_info[t]: + ei_save_dir = ei_save_dir.parent.joinpath(f'{folder_idx:06d}_success_{concise_info[t]["success"]}') if len(ei_render_imgs) > 1: img_folder = ei_save_dir.joinpath('render_imgs') save_images(ei_render_imgs, img_folder) video_file = ei_save_dir.joinpath('render_video.mp4') convert_imgs_to_video(ei_render_imgs, video_file.as_posix()) - - if ob_is_state: - ob_file = ei_save_dir.joinpath('obs.json') - save_to_json(traj.obs[:tsps[ei], ei].tolist(), - ob_file) - else: - ob_folder = ei_save_dir.joinpath('obs') - 
save_images(traj.obs[:tsps[ei], ei], ob_folder) + if save_ob: + if ob_is_state: + ob_file = ei_save_dir.joinpath('obs.json') + save_to_json(traj.obs[:tsps[ei], ei].tolist(), + ob_file) + else: + ob_folder = ei_save_dir.joinpath('obs') + save_images(traj.obs[:tsps[ei], ei], ob_folder) action_file = ei_save_dir.joinpath('actions.json') save_to_json(actions[:tsps[ei], ei].tolist(), action_file) @@ -214,6 +284,13 @@ def linear_decay_percent(epoch, total_epochs): return 1 - epoch / float(total_epochs) +def smooth_value(current_value, past_value, tau): + if past_value is None: + return current_value + else: + return past_value * tau + current_value * (1 - tau) + + def get_list_stats(data): if len(data) < 1: return dict()
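A minimal standalone sketch of two helpers introduced in the final `easyrl/utils/common.py` hunk above. The function bodies mirror the patch; the sample inputs and the `tau=0.9` setting are illustrative assumptions, not values taken from easyrl.

```python
# Usage sketch only -- not part of the patch. chunker_list and smooth_value
# are copied from the easyrl/utils/common.py additions above; the demo values
# below are made up for illustration.

def chunker_list(seq_list, nchunks):
    # split the list into n (interleaved) parts/chunks
    return [seq_list[i::nchunks] for i in range(nchunks)]

def smooth_value(current_value, past_value, tau):
    # exponential moving average; return the raw value on the first call
    if past_value is None:
        return current_value
    return past_value * tau + current_value * (1 - tau)

if __name__ == '__main__':
    print(chunker_list(list(range(10)), 3))  # [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]
    smoothed = None
    for ret in [10.0, 12.0, 8.0, 11.0]:      # e.g. per-evaluation mean returns
        smoothed = smooth_value(ret, smoothed, tau=0.9)
    print(smoothed)                          # 10.082
```

This is presumably the same tau-weighted smoothing that the PPO engine applies to its evaluation return (`smooth_eval_return`).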
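For context on `get_true_done` in `easyrl/utils/gym_util.py` and the `true_done`/`true_next_ob` bookkeeping in the runners and the subprocess worker: an episode that ends only because the `TimeLimit` wrapper ran out of steps is not a genuine terminal state, so the TD target should still bootstrap from the next state. The sketch below is illustrative; `td_target`, `q_next`, and `gamma=0.99` are hypothetical names and values, not easyrl API.

```python
# Hedged sketch of why the runners track 'true' dones. get_true_done matches
# the helper added in easyrl/utils/gym_util.py; td_target is a hypothetical
# TD(0) target used only to show the effect of the flag.

def get_true_done(done, info):
    # a time-limit "done" is not a genuine terminal state
    return done and not info.get('TimeLimit.truncated', False)

def td_target(reward, done, info, q_next, gamma=0.99):
    true_done = get_true_done(done, info)
    # keep bootstrapping from q_next unless the episode truly terminated
    return reward + gamma * (1.0 - float(true_done)) * q_next

# episode cut off by the TimeLimit wrapper: still bootstrap
print(td_target(1.0, True, {'TimeLimit.truncated': True}, q_next=5.0))  # 5.95
# genuine environment termination: no bootstrap
print(td_target(1.0, True, {}, q_next=5.0))                             # 1.0
```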