[RL-baseline] Model v4 #38

Open · wants to merge 25 commits into base: RL-with-baseline

Commits (25)
2106b4b
Add metrics and logsoftmax
xeviknal Mar 2, 2021
0f2f195
Updating the model
xeviknal Mar 11, 2021
39b3ad3
Add new model to baseline
xeviknal Mar 11, 2021
907af7a
The line that fixes all
xeviknal Mar 14, 2021
8e4ee6c
Add mean entropy - to reduce tensorboard runs
xeviknal Mar 14, 2021
3970702
Add action prob mean: mean of prob of actions taken in the episode
xeviknal Mar 14, 2021
957a3b4
Added simple directory check to params folder
ziritrion Mar 14, 2021
0105564
Added additional param save conditions (end of log_interval, last epi…
ziritrion Mar 14, 2021
b5a5184
Merge branch 'RL-baseline-new-model' of github.com:xeviknal/aidl-2021…
ziritrion Mar 14, 2021
c6954ec
Removing old runs; they don't apply to this branch
ziritrion Mar 14, 2021
a7c907c
RL-baseline-NM-save-optim
xeviknal Mar 14, 2021
d5b676c
Load optimizer params
xeviknal Mar 14, 2021
bd7f6c0
8k runs
ziritrion Mar 15, 2021
5f246c0
Fresh start with latest checkpoint load-save changes. Also, small git…
ziritrion Mar 15, 2021
e8aa5e4
bugfix
ziritrion Mar 15, 2021
c52a4f2
10k runs
ziritrion Mar 15, 2021
192bece
Almost 20k runs. Reward is starting to improve little by little
ziritrion Mar 16, 2021
11b46c4
Fixed runner.py for generating videos
ziritrion Mar 16, 2021
befd201
25k runs. Slight improvement but far from desirable
ziritrion Mar 16, 2021
610d5a8
Adding more linear layers for each head
xeviknal Mar 23, 2021
e9ad814
Cleaned up the code and removed params and runs in order to ease merg…
ziritrion Mar 25, 2021
4409cf6
Modified actions to include the no_action possibility, as well as fin…
ziritrion Mar 25, 2021
db1bd06
Tweaked the way the params filename is generated, as well as tensorbo…
ziritrion Mar 25, 2021
e5a9707
In train_episode, moved some vars to GPU that weren't being moved before
ziritrion Mar 25, 2021
c3aa2c7
Base start for Baseline model v4
ziritrion Mar 29, 2021
5 changes: 4 additions & 1 deletion .gitignore
@@ -136,4 +136,7 @@ dmypy.json
.idea/

# MacOS bullshit
.DS_Store
.DS_Store

# nohup log files
nohup.out
52 changes: 39 additions & 13 deletions actions.py
@@ -1,14 +1,40 @@
available_actions = [
[0.0, 0.7, 0.0], # throttle
[0.0, 0.5, 0.0], # throttle
[0.0, 0.2, 0.0], # throttle
[0.0, 0.0, 0.7], # brake
[0.0, 0.0, 0.5], # brake
[0.0, 0.0, 0.2], # brake
[-0.8, 0.1, 0.0], # left
[-0.5, 0.1, 0.0], # left
[-0.2, 0.1, 0.0], # left
[0.8, 0.1, 0.0], # right
[0.5, 0.1, 0.0], # right
[0.2, 0.1, 0.0], # right
action_sets = [
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.3, 0.0], # throttle
[0.0, 0.0, 0.6], # brake
[0.0, 0.0, 0.2], # brake
[-0.9, 0.0, 0.0], # left
[-0.5, 0.0, 0.0], # left
[-0.2, 0.0, 0.0], # left
[0.9, 0.0, 0.0], # right
[0.5, 0.0, 0.0], # right
[0.2, 0.0, 0.0], # right
],
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # brake
[-0.9, 0.0, 0.0], # left
[-0.5, 0.0, 0.0], # left
[-0.2, 0.0, 0.0], # left
[0.9, 0.0, 0.0], # right
[0.5, 0.0, 0.0], # right
[0.2, 0.0, 0.0], # right
],
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # brake
[-0.9, 0.0, 0.0], # left
[0.9, 0.0, 0.0], # right
]
]


def get_action(set_num):
if set_num >= len(action_sets):
raise ValueError("Wrong available set num. It should go from 0 to {}".format(len(action_sets) - 1))
return action_sets[set_num]
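
For context, a minimal usage sketch of the new action sets (not part of the diff). It assumes the Categorical sampling done in runner.py and trainer.py; the uniform probabilities here are dummy values for illustration only.

import torch
from actions import get_action

action_set = get_action(0)                        # set 0: 11 discrete actions
# Dummy uniform probabilities; in the project they come from the actor head
probs = torch.full((1, len(action_set)), 1.0 / len(action_set))
idx = torch.distributions.Categorical(probs).sample()
steer, gas, brake = action_set[idx.item()]        # continuous triple passed to env.step(...)
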
3 changes: 3 additions & 0 deletions environment.py
@@ -21,6 +21,9 @@ def __init__(self, device, stack_frames=4, train=False):
self.env = FrameSkipper(self.env, 4)
print(self.env.observation_space)

def max_episode_steps(self):
return self.spec().max_episode_steps

def step(self, action):
return self.env.step(action)

10 changes: 9 additions & 1 deletion helpers.py
@@ -3,6 +3,7 @@
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay

@@ -27,4 +28,11 @@ def display_start():


def save_model(model, path):
torch.save(model.state_dict(), path)
torch.save(model.state_dict(), path)

def create_directory(path):
try:
os.mkdir(path)
print(f'Directory {path} has been created.')
except FileExistsError:
print(f'Directory {path} already exists.')
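
A side note on the design (not part of the diff): create_directory only creates the final path component and reports what happened. A roughly equivalent alternative, not what this PR uses, would be os.makedirs with exist_ok:

import os

os.makedirs('params', exist_ok=True)  # also creates parent dirs, silent if it already exists
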
18 changes: 13 additions & 5 deletions main.py
@@ -10,21 +10,29 @@
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#for concurrent runs and logging
experiment='RL-baseline-v4'

if __name__ == "__main__":
hyperparams = {
'num_episodes': 40000, # Number of training episodes
'lr': 1e-2, # Learning rate
'num_episodes': 20000, # Number of training episodes
'lr': 1e-3, # Learning rate
'gamma': 0.99, # Discount rate
'log_interval': 5, # controls how often we log progress
'log_interval': 10, # controls how often we log progress
'stack_frames': 4,
'device': device,
'params_path': './params/policy-params.dl',
'experiment':experiment,
'params_path': f'./params/policy-params-{experiment}.dl',
'action_set_num': 0,
'train': True
}

#make sure that params folder exists
helpers.create_directory('params')

env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
helpers.display_start()
if(hyperparams['train']):
if hyperparams['train']:
trainer = Trainer(env, hyperparams)
trainer.train()
else:
Binary file removed params/policy-params.dl
43 changes: 30 additions & 13 deletions policy.py
@@ -9,28 +9,39 @@ class Policy(nn.Module):
def __init__(self, actor_output, critic_output, inputs=4):
super(Policy, self).__init__()
self.pipeline = nn.Sequential(
nn.Conv2d(inputs, 32, 3), # [32, 94, 94]
nn.Conv2d(inputs, 12, kernel_size=3, stride=2, padding=1), # [12, 48, 48]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 47, 47]
nn.Conv2d(32, 64, 4), # [64, 44, 44]
nn.MaxPool2d(2), # [12, 24, 24]
nn.Conv2d(12, 24, kernel_size=3), # [24, 22, 22]
nn.ReLU(),
nn.MaxPool2d(2), # [64, 22, 22]
nn.MaxPool2d(2), # [24, 11, 11]
nn.Conv2d(24, 32, 4), # [32, 8, 8]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 4, 4]
nn.Flatten(),
nn.Linear(64 * 22 * 22, 512),
nn.Linear(32 * 4 * 4, 256), # [ 512, 256 ]
nn.ReLU(),
nn.Linear(256, 128),
nn.ReLU(),
nn.Linear(512, 128),
# nn.LogSoftmax(dim=-1)
nn.ReLU()
)

# actor's layer
self.actor_head = nn.Linear(128, actor_output)
self.actor_head = nn.Sequential(
nn.Linear(128, 64),
nn.ReLU(),
nn.Linear(64, actor_output)
)

# critic's layer
self.critic_head = nn.Linear(128, critic_output)
self.critic_head = nn.Sequential(
nn.Linear(128, 64),
nn.ReLU(),
nn.Linear(64, critic_output)
)

self.saved_log_probs = []
self.rewards = []
self.entropies = []

def forward(self, x):

@@ -46,28 +57,34 @@ def forward(self, x):
# 1. a list with the probability of each action over the action space
# 2. the value from state s_t
return action_prob, state_values
# return self.pipeline(x)

def load_checkpoint(self, params_path):
epoch = 0
running_reward = 10
optim_params = None
if path.exists(params_path):
params_descriptor = torch.load(params_path)
epoch = 0
running_reward = 0
if 'params' in params_descriptor:
self.load_state_dict(params_descriptor['params'])
optim_params = params_descriptor['optimizer_params']
epoch = params_descriptor['epoch']
running_reward = params_descriptor['running_reward']
else:
self.load_state_dict(params_descriptor)

print("Model params are loaded now")
else:
print("Params not found: training from scratch")

return epoch
return epoch, optim_params, running_reward

def save_checkpoint(self, params_path, epoch):
def save_checkpoint(self, params_path, epoch, running_reward, optimizer):
torch.save({
'epoch': epoch,
'params': self.state_dict(),
'running_reward': running_reward,
'optimizer_params': optimizer.state_dict(),
}, params_path)
print("Relax, params are saved now")
18 changes: 11 additions & 7 deletions runner.py
@@ -2,31 +2,35 @@
import numpy as np

from policy import Policy
from actions import available_actions
from actions import get_action

class Runner:
def __init__(self, env, config):
super().__init__()
self.env = env
self.config = config
self.input_channels = config['stack_frames']
#self.device = config['device']
self.policy = Policy(self.input_channels, len(available_actions))
self.policy.load_checkpoint(config['params_path'])
self.device = config['device']
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

def select_action(self, state):
if state is None: # First state is always None
# Adding the starting signal as a 0's tensor
state = np.zeros((self.input_channels, 96, 96))
else:
state = np.asarray(state)
state = torch.from_numpy(state).float().unsqueeze(0)
probs = self.policy(state)
state = torch.from_numpy(state).float().unsqueeze(0).view(1, self.input_channels, 96, 96).to(self.device)
probs, state_value = self.policy(state)
# We pick the action from a sample of the probabilities
# It prevents the model from picking always the same action
m = torch.distributions.Categorical(probs)
action = m.sample()
return available_actions[action.item()]
return self.action_set[action.item()]

def run(self):
state, done, total_rew = self.env.reset(), False, 0
61 changes: 33 additions & 28 deletions trainer.py
@@ -6,7 +6,7 @@


from policy import Policy
from actions import available_actions
from actions import get_action


class Trainer:
@@ -20,29 +20,29 @@ def __init__(self, env, config):
self.input_channels = config['stack_frames']
self.device = config['device']
self.writer = SummaryWriter(flush_secs=5)
self.policy = Policy(len(available_actions), 1, self.input_channels).to(self.device)
self.last_epoch = self.policy.load_checkpoint(config['params_path'])
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
self.experiment = config['experiment']
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

def select_action(self, state):
if state is None: # First state is always None
# Adding the starting signal as a 0's tensor
state = np.zeros((self.input_channels, 96, 96))
else:
state = np.asarray(state)
# state = torch.from_numpy(state).float().unsqueeze(0)
state = torch.from_numpy(state).float().unsqueeze(0).view(1, self.input_channels, 96, 96).to(self.device)
# probs = self.policy(state)
probs, state_value = self.policy(state)
# We pick the action from a sample of the probabilities
# It prevents the model from picking always the same action
m = torch.distributions.Categorical(probs)
action = m.sample()
#print(m.log_prob(action))
#self.policy.saved_log_probs.append(m.log_prob(action))
self.policy.saved_log_probs.append(self.SavedAction(m.log_prob(action), state_value))
#print(self.policy.saved_log_probs)
return available_actions[action.item()]
self.policy.entropies.append(m.entropy().item())
return self.action_set[action.item()]

def episode_train(self, iteration):
g = 0
@@ -58,37 +58,39 @@ def episode_train(self, iteration):
# Normalize returns (this usually accelerates convergence)
eps = np.finfo(np.float32).eps.item()
returns = (returns - returns.mean()) / (returns.std() + eps)
# for log_prob, G in zip(self.policy.saved_log_probs, returns):
for (log_prob, baseline) ,G in zip(self.policy.saved_log_probs, returns):
# policy_loss.append(-G * log_prob)
baseline = baseline.to(self.device)
log_prob = log_prob.to(self.device)

advantage = G - baseline.item()
# calculate actor (policy) loss

# calculate actor (policy) loss
policy_loss.append(-log_prob * advantage)

# calculate critic (value) loss using L1 smooth loss
value_losses.append(F.smooth_l1_loss(baseline, torch.tensor([G]).to(self.device)))
# calculate critic (value) loss using L1 smooth loss
value_losses.append(F.smooth_l1_loss(baseline.squeeze(), G))

# Update policy:
self.optimizer.zero_grad()
#policy_loss = torch.cat(policy_loss).sum()
policy_loss = torch.stack(policy_loss).sum() + torch.stack(value_losses).sum()
self.writer.add_scalar('loss', policy_loss.item(), iteration)
self.writer.add_scalar(f'{self.experiment}/loss', policy_loss.item(), iteration)
policy_loss.backward()
self.optimizer.step()
del self.policy.rewards[:]
del self.policy.saved_log_probs[:]
del self.policy.entropies[:]

def train(self):
# Training loop
print("Target reward: {}".format(self.env.spec().reward_threshold))
running_reward = 10
ep_rew_history = []
for i_episode in range(self.config['num_episodes'] - self.last_epoch):
# Convert to 1-indexing to reduce complexity
i_episode+=1
# The episode counting starts from last checkpoint
i_episode = i_episode + self.last_epoch
# Collect experience
state, ep_reward = self.env.reset(), 0
for t in range(self.env.spec().max_episode_steps): # Protecting from scenarios where you are mostly stopped
for t in range(self.env.max_episode_steps()): # Protecting from scenarios where you are mostly stopped
action = self.select_action(state)
state, reward, done, _ = self.env.step(action)
self.policy.rewards.append(reward)
@@ -98,20 +100,23 @@ def train(self):
break

# Update running reward
running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
self.running_reward = 0.05 * ep_reward + (1 - 0.05) * self.running_reward

# Plotting
self.writer.add_scalar(f'{self.experiment}/reward', ep_reward, i_episode)
self.writer.add_scalar(f'{self.experiment}/running reward', self.running_reward, i_episode)
self.writer.add_scalar(f'{self.experiment}/mean entropy', np.mean(self.policy.entropies), i_episode)

# Perform training step
self.episode_train(i_episode)
ep_rew_history.append((i_episode, ep_reward))
self.writer.add_scalar('reward', ep_reward, i_episode)
self.writer.add_scalar('running reward', running_reward, i_episode)
if i_episode % self.config['log_interval'] == 0:

if i_episode % self.config['log_interval'] == 0 or i_episode == self.config['num_episodes'] or self.running_reward > self.env.spec().reward_threshold:
print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
i_episode, ep_reward, running_reward))
self.policy.save_checkpoint(self.config['params_path'], i_episode)
i_episode, ep_reward, self.running_reward))
self.policy.save_checkpoint(self.config['params_path'], i_episode, self.running_reward, self.optimizer)

if running_reward > self.env.spec().reward_threshold:
if self.running_reward > self.env.spec().reward_threshold:
print("Solved!")
break

print("Finished training! Running reward is now {:.2f}".format(running_reward))
print("Finished training! Running reward is now {:.2f}".format(self.running_reward))