PPO-early-stop: finish the episode after 50 steps if avg reward is negative #51

Status: Open
Wants to merge 9 commits into base: ppo-nm
Changes from 1 commit
Early stop: prints, render, and fixup
xeviknal committed Apr 10, 2021
commit 1648b186fd0c18e852fa4aaad6ab133c097c4f04
main.py: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 #for concurrent runs and logging
-experiment='ppo-nm-scheduler'
+experiment='ppo-nm-scheduler-early-stop'
 if __name__ == "__main__":
     hyperparams = {
         'num_epochs': 25000, # Number of training episodes
trainer.py: 4 additions & 0 deletions
@@ -52,16 +52,19 @@ def select_action(self, state):
         # It prevents the model from picking always the same action
         m = torch.distributions.Categorical(probs)
         action = m.sample()
+        print(f'Action: {action.item()}')
         # We return the state in order to make sure that we operate with a valid tensor
         return action, m.log_prob(action), vs_t, m.entropy(), state

     def run_episode(self, epoch, current_steps):
         state, ep_reward, steps = self.env.reset(), 0, 0
         for t in range(self.env.spec().max_episode_steps): # Protecting from scenarios where you are mostly stopped
             with torch.no_grad():
+                self.env.render()
                 state = self.prepare_state(state)
                 action_id, action_log_prob, vs_t, entropy, state = self.select_action(state)
                 next_state, reward, done, _ = self.env.step(self.action_set[action_id.item()])
+                print(f'Reward: {reward}')
             # Store transition to memory
             self.memory.push(state, action_id, action_log_prob, entropy, reward, vs_t, next_state)

@@ -99,6 +102,7 @@ def get_new_prob_of(self, action, state):
         return m.log_prob(action)

     def policy_update(self, transitions, v_targ, adv, iteration):
+        print(f'Updating iteration #{iteration}')
         # Get transitions values
         batch = Transition(*zip(*transitions))
         state_batch = torch.cat(batch.state)
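
The trainer above only consumes the standard `(state, reward, done, info)` tuple from `self.env.step(...)`, so the 50-step cutoff has to come from wrapping the environment before it reaches the trainer. That wiring is not part of this diff; the sketch below is a hypothetical illustration, where the environment id, the `EarlyStop` class name, and the `steps=50` keyword are assumptions based on the PR title and the file path in the diff:

```python
import gym

from wrappers.early_stop import EarlyStop  # module path taken from this PR; class name assumed

# Hypothetical wiring (not in this diff): wrap the base environment so that
# step() reports done=True once a full 50-step reward window averages negative.
env = EarlyStop(gym.make('CarRacing-v0'), steps=50)  # env id and kwarg are assumptions

state = env.reset()
for _ in range(1000):
    # A random policy just to exercise the wrapper; the real loop is run_episode above.
    state, reward, done, info = env.step(env.action_space.sample())
    if done:  # the wrapper cut the episode after a negative 50-step window
        break
```
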
wrappers/early_stop.py: 2 additions & 3 deletions
@@ -22,7 +22,6 @@ def step(self, action):
         avg = 1
         if self.remaining_steps == 0:
             avg = np.array(self.latest_rewards).mean()
-            if avg > 0:
-                self.remaining_steps = self.steps
-                self.latest_rewards = []
+            self.remaining_steps = self.steps
+            self.latest_rewards = []
         return state, reward, avg < 0, info
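
Only the `step()` hunk above is visible in this diff. For context, here is a minimal sketch of what the wrapper plausibly looks like after this commit; everything outside the hunk (the class name `EarlyStop`, the constructor, `reset()`, the step counter decrement, and the `steps=50` default taken from the PR title) is an assumption, not code from the repository:

```python
import gym
import numpy as np


class EarlyStop(gym.Wrapper):
    """Stops the episode when the mean reward over the last `steps` env steps
    is negative. Only the step() body inside the hunk above comes from the
    diff; the class name, constructor, reset() and decrement are assumptions."""

    def __init__(self, env, steps=50):
        super().__init__(env)
        self.steps = steps
        self.remaining_steps = steps
        self.latest_rewards = []

    def reset(self, **kwargs):
        # Clear the rolling window between episodes (assumed behaviour).
        self.remaining_steps = self.steps
        self.latest_rewards = []
        return self.env.reset(**kwargs)

    def step(self, action):
        state, reward, env_done, info = self.env.step(action)
        self.latest_rewards.append(reward)
        self.remaining_steps -= 1

        avg = 1
        if self.remaining_steps == 0:
            avg = np.array(self.latest_rewards).mean()
            # After this commit the window always resets, so the check repeats
            # every `steps` steps instead of only after a positive window.
            self.remaining_steps = self.steps
            self.latest_rewards = []
        # As in the hunk above, `done` comes purely from the rolling average;
        # the inner environment's own done flag (env_done) is not propagated.
        return state, reward, avg < 0, info
```

The behavioural change in the hunk is small but worth spelling out: before this commit the counters were only cleared when a window averaged positive, so after a negative window they kept their stale values, which could carry over into the next episode if the wrapper does not also clear them in reset(); the fixup makes the reset unconditional, so the check simply repeats every `steps` steps.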