From 58b6bce87436719371b839ef4c95bf2a3ec7f914 Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Thu, 8 Apr 2021 02:30:50 +0200
Subject: [PATCH 1/9] Adding scheduler with intuitive params

---
 main.py    | 4 ++--
 trainer.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index d08ca25..d80e1e4 100644
--- a/main.py
+++ b/main.py
@@ -10,7 +10,7 @@
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 #for concurrent runs and logging
-experiment='ppo-nm'
+experiment='ppo-nm-scheduler'
 if __name__ == "__main__":
     hyperparams = {
         'num_epochs': 25000,  # Number of training episodes
@@ -27,7 +27,7 @@
         'device': device,
         'experiment':experiment,
         'params_path': f'./params/policy-params-{experiment}.dl',
-        'action_set_num': 0,
+        'action_set_num': 4,
         'train': True
     }
 
diff --git a/trainer.py b/trainer.py
index 1ed15b1..dd2ce78 100644
--- a/trainer.py
+++ b/trainer.py
@@ -22,17 +22,18 @@ def __init__(self, env, config):
         self.ppo_epochs = config['num_ppo_epochs']
         self.mini_batch = config['mini_batch_size']
         self.memory_size = config['memory_size']
+        self.experiment = config['experiment']
         self.c1, self.c2, self.eps = config['c1'], config['c2'], config['eps']
-        self.writer = SummaryWriter(flush_secs=5)
+        self.writer = SummaryWriter(log_dir=f'runs/{config["experiment"]}', flush_secs=5)
         self.action_set = get_action(config['action_set_num'])
         self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
         self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
         self.memory = ReplayMemory(self.memory_size)
         self.value_loss = nn.SmoothL1Loss()
         self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
-        self.experiment = config['experiment']
         if optim_params is not None:
             self.optimizer.load_state_dict(optim_params)
+        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, max_lr=0.01, steps_per_epoch=self.memory_size, epochs=self.epochs)
 
     def prepare_state(self, state):
         if state is None:  # First state is always None

From b4afa29dfed70544e7e960cfd13f4b59d2dd1ce8 Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Fri, 9 Apr 2021 10:10:15 +0200
Subject: [PATCH 2/9] Adding early stop to the environment

---
 environment.py         | 2 ++
 wrappers/early_stop.py | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/environment.py b/environment.py
index c00969c..1c61485 100644
--- a/environment.py
+++ b/environment.py
@@ -1,5 +1,6 @@
 import gym
 from wrappers.frame_skipper import FrameSkipper
+from wrappers.early_stop import EarlyStop
 from gym.wrappers import FrameStack, GrayScaleObservation, Monitor
 
 
@@ -20,6 +21,7 @@ def __init__(self, device, seed, stack_frames=4, train=False):
         self.env = GrayScaleObservation(self.env)
         self.env = FrameStack(self.env, stack_frames)
         self.env = FrameSkipper(self.env, 4)
+        self.env = EarlyStop(self.env, 50)
         print(self.env.observation_space)
 
     def max_episode_steps(self):
diff --git a/wrappers/early_stop.py b/wrappers/early_stop.py
index 2667033..f9b712a 100644
--- a/wrappers/early_stop.py
+++ b/wrappers/early_stop.py
@@ -21,7 +21,8 @@ def step(self, action):
         self.latest_rewards.append(reward)
         avg = 1
         if self.remaining_steps == 0:
-            avg = np.array(self.latest_rewards).sum() / self.steps
+            avg = np.array(self.latest_rewards).mean()
             if avg > 0:
                 self.remaining_steps = self.steps
+                self.latest_rewards = []
         return state, reward, avg < 0, info
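For reference on PATCH 1/9 above: it constructs a torch.optim.lr_scheduler.OneCycleLR inside the trainer, but none of the hunks in this series show where scheduler.step() is called (and PATCH 7/9 later removes the scheduler again). Below is a minimal, self-contained sketch of how OneCycleLR is usually driven, one step per optimizer update. The dummy model, dummy loss, and the epochs/steps_per_epoch values are stand-ins for the repository's Policy network, PPO loss, self.epochs and self.memory_size; they are not taken from the patches.

import torch

model = torch.nn.Linear(4, 2)                      # stand-in for the Policy network
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs, steps_per_epoch = 10, 500                  # stand-ins for self.epochs / self.memory_size
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.01, steps_per_epoch=steps_per_epoch, epochs=epochs)

for epoch in range(epochs):
    for step in range(steps_per_epoch):
        optimizer.zero_grad()
        loss = model(torch.randn(8, 4)).pow(2).mean()   # dummy loss
        loss.backward()
        optimizer.step()
        scheduler.step()   # OneCycleLR expects exactly one step per optimizer update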
From 1648b186fd0c18e852fa4aaad6ab133c097c4f04 Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 11:28:23 +0200
Subject: [PATCH 3/9] Early stop: prints, render, and fixup

---
 main.py                | 2 +-
 trainer.py             | 4 ++++
 wrappers/early_stop.py | 5 ++---
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index d80e1e4..580ae27 100644
--- a/main.py
+++ b/main.py
@@ -10,7 +10,7 @@
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 #for concurrent runs and logging
-experiment='ppo-nm-scheduler'
+experiment='ppo-nm-scheduler-early-stop'
 if __name__ == "__main__":
     hyperparams = {
         'num_epochs': 25000,  # Number of training episodes
diff --git a/trainer.py b/trainer.py
index dd2ce78..49c43bb 100644
--- a/trainer.py
+++ b/trainer.py
@@ -52,6 +52,7 @@ def select_action(self, state):
         # It prevents the model from picking always the same action
         m = torch.distributions.Categorical(probs)
         action = m.sample()
+        print(f'Action: {action.item()}')
         # We return the state in order to make sure that we operate with a valid tensor
         return action, m.log_prob(action), vs_t, m.entropy(), state
 
@@ -59,9 +60,11 @@ def run_episode(self, epoch, current_steps):
         state, ep_reward, steps = self.env.reset(), 0, 0
         for t in range(self.env.spec().max_episode_steps):  # Protecting from scenarios where you are mostly stopped
             with torch.no_grad():
+                self.env.render()
                 state = self.prepare_state(state)
                 action_id, action_log_prob, vs_t, entropy, state = self.select_action(state)
                 next_state, reward, done, _ = self.env.step(self.action_set[action_id.item()])
+                print(f'Reward: {reward}')
 
             # Store transition to memory
             self.memory.push(state, action_id, action_log_prob, entropy, reward, vs_t, next_state)
@@ -99,6 +102,7 @@ def get_new_prob_of(self, action, state):
         return m.log_prob(action)
 
     def policy_update(self, transitions, v_targ, adv, iteration):
+        print(f'Updating iteration #{iteration}')
         # Get transitions values
         batch = Transition(*zip(*transitions))
         state_batch = torch.cat(batch.state)
diff --git a/wrappers/early_stop.py b/wrappers/early_stop.py
index f9b712a..3f6ab9b 100644
--- a/wrappers/early_stop.py
+++ b/wrappers/early_stop.py
@@ -22,7 +22,6 @@ def step(self, action):
         avg = 1
         if self.remaining_steps == 0:
             avg = np.array(self.latest_rewards).mean()
-            if avg > 0:
-                self.remaining_steps = self.steps
-                self.latest_rewards = []
+            self.remaining_steps = self.steps
+            self.latest_rewards = []
         return state, reward, avg < 0, info

From f37d5e7c6b6eed7e9114cf3b31f09028ebd5a5ae Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 19:03:28 +0200
Subject: [PATCH 4/9] Remove unnecessary printing

---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index 49c43bb..58f49ea 100644
--- a/trainer.py
+++ b/trainer.py
@@ -52,7 +52,6 @@ def select_action(self, state):
         # It prevents the model from picking always the same action
         m = torch.distributions.Categorical(probs)
         action = m.sample()
-        print(f'Action: {action.item()}')
         # We return the state in order to make sure that we operate with a valid tensor
         return action, m.log_prob(action), vs_t, m.entropy(), state
 
@@ -80,6 +79,7 @@ def run_episode(self, epoch, current_steps):
 
         self.running_reward = 0.01 * ep_reward + (1 - 0.01) * self.running_reward
         self.logging_episode(epoch, ep_reward, self.running_reward)
+        print(f'Steps: {steps}')
         return steps
 
     def compute_advantages(self):
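For context on PATCH 2/9 and the fixup in PATCH 3/9 above: the EarlyStop wrapper averages the rewards of the last 50 steps (environment.py wires it in as EarlyStop(self.env, 50)) and flags the episode as done as soon as that window average turns negative; the fixup makes the window reset unconditionally instead of only when the average is positive. The sketch below shows what the full wrapper plausibly looks like once PATCH 3/9 is applied. Only step() is visible in the diffs, so the constructor, reset() and the remaining_steps bookkeeping are assumptions.

import gym
import numpy as np


class EarlyStop(gym.Wrapper):
    # Sketch: only step() appears in the patches; the rest is assumed.
    def __init__(self, env, steps=50):
        super().__init__(env)
        self.steps = steps                # size of the reward window
        self.remaining_steps = steps      # steps left until the next check
        self.latest_rewards = []          # rewards collected in the current window

    def reset(self, **kwargs):
        self.remaining_steps = self.steps
        self.latest_rewards = []
        return self.env.reset(**kwargs)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        self.remaining_steps -= 1
        self.latest_rewards.append(reward)
        avg = 1
        if self.remaining_steps == 0:
            # Once per window: average the last `steps` rewards, then reset the
            # window unconditionally (the PATCH 3/9 fixup).
            avg = np.array(self.latest_rewards).mean()
            self.remaining_steps = self.steps
            self.latest_rewards = []
        # As written in the patch, the returned done flag is driven by the
        # window average rather than by the inner environment's done flag.
        return state, reward, avg < 0, info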
From bf2d22029d74fc17a2cabd9f1cc7344b55ac86f Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 19:06:42 +0200
Subject: [PATCH 5/9] Fixup commit

---
 trainer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/trainer.py b/trainer.py
index 58f49ea..b273c59 100644
--- a/trainer.py
+++ b/trainer.py
@@ -63,7 +63,6 @@ def run_episode(self, epoch, current_steps):
                 state = self.prepare_state(state)
                 action_id, action_log_prob, vs_t, entropy, state = self.select_action(state)
                 next_state, reward, done, _ = self.env.step(self.action_set[action_id.item()])
-                print(f'Reward: {reward}')
 
             # Store transition to memory
             self.memory.push(state, action_id, action_log_prob, entropy, reward, vs_t, next_state)
@@ -79,7 +78,7 @@ def run_episode(self, epoch, current_steps):
 
         self.running_reward = 0.01 * ep_reward + (1 - 0.01) * self.running_reward
         self.logging_episode(epoch, ep_reward, self.running_reward)
-        print(f'Steps: {steps}')
+        print(f'Ep Reward: {ep_reward}, Ep Steps: {steps}')
         return steps
 
     def compute_advantages(self):

From ac9858e99a475ddf6654e9452d9a38106ab6dd86 Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 19:12:22 +0200
Subject: [PATCH 6/9] Better printing

---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index b273c59..686fbe0 100644
--- a/trainer.py
+++ b/trainer.py
@@ -101,7 +101,6 @@ def get_new_prob_of(self, action, state):
         return m.log_prob(action)
 
     def policy_update(self, transitions, v_targ, adv, iteration):
-        print(f'Updating iteration #{iteration}')
         # Get transitions values
         batch = Transition(*zip(*transitions))
         state_batch = torch.cat(batch.state)
@@ -159,6 +158,7 @@ def train(self):
             v_targ, adv = self.compute_advantages()
 
             # Train the model num_epochs time with mini-batch strategy
+            print(f'Updating iteration #{epoch}')
             for ppo_epoch in range(self.ppo_epochs):
                 # Train the model with batch-size transitions
                 for index in BatchSampler(SubsetRandomSampler(range(self.memory_size)), self.mini_batch, False):

From 76419db1500e06b45a514e05eefe43f2afd3e2f4 Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 19:22:15 +0200
Subject: [PATCH 7/9] Remove scheduler

---
 trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index 686fbe0..b084652 100644
--- a/trainer.py
+++ b/trainer.py
@@ -33,7 +33,6 @@ def __init__(self, env, config):
         self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
         if optim_params is not None:
             self.optimizer.load_state_dict(optim_params)
-        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, max_lr=0.01, steps_per_epoch=self.memory_size, epochs=self.epochs)
 
     def prepare_state(self, state):
         if state is None:  # First state is always None

From 33a790c8f396bf15b4d106c333b19bc72b1697d7 Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 19:32:40 +0200
Subject: [PATCH 8/9] Dont change tensorboard definition

---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index b084652..476a803 100644
--- a/trainer.py
+++ b/trainer.py
@@ -24,7 +24,7 @@ def __init__(self, env, config):
         self.memory_size = config['memory_size']
         self.experiment = config['experiment']
         self.c1, self.c2, self.eps = config['c1'], config['c2'], config['eps']
-        self.writer = SummaryWriter(log_dir=f'runs/{config["experiment"]}', flush_secs=5)
+        self.writer = SummaryWriter(flush_secs=5)
         self.action_set = get_action(config['action_set_num'])
         self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
         self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
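The train() context lines visible in PATCH 6/9 show the trainer's mini-batch strategy: SubsetRandomSampler shuffles the indices of the replay memory, BatchSampler groups them into mini-batches, and the whole pass is repeated ppo_epochs times. A small stand-alone sketch of that sampling pattern follows; the sizes are illustrative placeholders for self.memory_size, self.mini_batch and self.ppo_epochs, and the update itself is elided.

from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

memory_size, mini_batch, ppo_epochs = 2000, 128, 10   # placeholder sizes

for ppo_epoch in range(ppo_epochs):
    # drop_last=False: the final, smaller batch of indices is still used
    for index in BatchSampler(SubsetRandomSampler(range(memory_size)), mini_batch, False):
        # `index` is a list of up to `mini_batch` shuffled positions into the
        # replay memory; the trainer gathers those transitions and runs one
        # clipped-PPO update on them.
        pass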
From e34d194eef350b7423e7ceb0517111753e4cf78b Mon Sep 17 00:00:00 2001
From: Xavier Canal i Masjuan
Date: Sat, 10 Apr 2021 19:48:36 +0200
Subject: [PATCH 9/9] Make the runs visible

---
 helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helpers.py b/helpers.py
index 241b642..18d5329 100644
--- a/helpers.py
+++ b/helpers.py
@@ -23,7 +23,7 @@ def show_video():
 
 
 def display_start():
-    display = Display(visible=0, size=(1400, 900))
+    display = Display(visible=1, size=(1400, 900))
     display.start()
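PATCH 9/9 flips the pyvirtualdisplay flag from visible=0 to visible=1. With pyvirtualdisplay, visible=0 starts a hidden Xvfb server (suitable for headless training), while visible=1 starts a visible nested display (typically Xephyr), so the env.render() call added in PATCH 3/9 can actually be watched. A small sketch of the pattern; the visible parameter and the returned handle are illustrative additions, not part of helpers.py.

from pyvirtualdisplay import Display


def display_start(visible=1):
    # visible=0 -> hidden Xvfb backend (headless); visible=1 -> visible display
    display = Display(visible=visible, size=(1400, 900))
    display.start()
    return display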