[RL-baseline] Model v4 #38

Open · wants to merge 25 commits into base: RL-with-baseline

Commits (25)
2106b4b
Add metrics and logsoftmax
xeviknal Mar 2, 2021
0f2f195
Updating the model
xeviknal Mar 11, 2021
39b3ad3
Add new model to baseline
xeviknal Mar 11, 2021
907af7a
The line that fixes all
xeviknal Mar 14, 2021
8e4ee6c
Add mean entropy - to reduce tensorboard runs
xeviknal Mar 14, 2021
3970702
Add action prob mean: mean of prob of actions taken in the episode
xeviknal Mar 14, 2021
957a3b4
Added simple directory check to params folder
ziritrion Mar 14, 2021
0105564
Added additional param save conditions (end of log_interval, last epi…
ziritrion Mar 14, 2021
b5a5184
Merge branch 'RL-baseline-new-model' of github.com:xeviknal/aidl-2021…
ziritrion Mar 14, 2021
c6954ec
Removing old runs; they don't apply to this branch
ziritrion Mar 14, 2021
a7c907c
RL-baseline-NM-save-optim
xeviknal Mar 14, 2021
d5b676c
Load optimizer params
xeviknal Mar 14, 2021
bd7f6c0
8k runs
ziritrion Mar 15, 2021
5f246c0
Fresh start with latest checkpoint load-save changes. Also, small git…
ziritrion Mar 15, 2021
e8aa5e4
bugfix
ziritrion Mar 15, 2021
c52a4f2
10k runs
ziritrion Mar 15, 2021
192bece
Almost 20k runs. Reward is starting to improve little by little
ziritrion Mar 16, 2021
11b46c4
Fixed runner.py for generating videos
ziritrion Mar 16, 2021
befd201
25k runs. Slight improvement but far from desirable
ziritrion Mar 16, 2021
610d5a8
Adding more linear layers for each head
xeviknal Mar 23, 2021
e9ad814
Cleaned up the code and removed params and runs in order to ease merg…
ziritrion Mar 25, 2021
4409cf6
Modified actions to include the no_action possibility, as well as fin…
ziritrion Mar 25, 2021
db1bd06
Tweaked the way the params filename is generated, as well as tensorbo…
ziritrion Mar 25, 2021
e5a9707
In train_episode, moved some vars to GPU that weren't being moved before
ziritrion Mar 25, 2021
c3aa2c7
Base start for Baseline model v4
ziritrion Mar 29, 2021
5 changes: 4 additions & 1 deletion .gitignore
@@ -136,4 +136,7 @@ dmypy.json
.idea/

# MacOS bullshit
.DS_Store
.DS_Store

# nohup log files
nohup.out
52 changes: 39 additions & 13 deletions actions.py
@@ -1,14 +1,40 @@
available_actions = [
[0.0, 0.7, 0.0], # throttle
[0.0, 0.5, 0.0], # throttle
[0.0, 0.2, 0.0], # throttle
[0.0, 0.0, 0.7], # brake
[0.0, 0.0, 0.5], # brake
[0.0, 0.0, 0.2], # brake
[-0.8, 0.1, 0.0], # left
[-0.5, 0.1, 0.0], # left
[-0.2, 0.1, 0.0], # left
[0.8, 0.1, 0.0], # right
[0.5, 0.1, 0.0], # right
[0.2, 0.1, 0.0], # right
action_sets = [
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.3, 0.0], # throttle
[0.0, 0.0, 0.6], # brake
[0.0, 0.0, 0.2], # brake
[-0.9, 0.0, 0.0], # left
[-0.5, 0.0, 0.0], # left
[-0.2, 0.0, 0.0], # left
[0.9, 0.0, 0.0], # right
[0.5, 0.0, 0.0], # right
[0.2, 0.0, 0.0], # right
],
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # brake
[-0.9, 0.0, 0.0], # left
[-0.5, 0.0, 0.0], # left
[-0.2, 0.0, 0.0], # left
[0.9, 0.0, 0.0], # right
[0.5, 0.0, 0.0], # right
[0.2, 0.0, 0.0], # right
],
[
[0.0, 0.0, 0.0], # no action
[0.0, 0.8, 0.0], # throttle
[0.0, 0.0, 0.6], # brake
[-0.9, 0.0, 0.0], # left
[0.9, 0.0, 0.0], # right
]
]


def get_action(set_num):
if set_num >= len(action_sets):
raise ValueError("Wrong available set num. It should go from 0 to {}".format(len(action_sets) - 1))
return action_sets[set_num]
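
For context, a minimal usage sketch of the new action sets (not part of the diff). It assumes the Categorical sampling done in runner.py and trainer.py; the uniform probabilities here are dummy values for illustration only.

import torch
from actions import get_action

action_set = get_action(0)                        # set 0: 11 discrete actions
# Dummy uniform probabilities; in the project they come from the actor head
probs = torch.full((1, len(action_set)), 1.0 / len(action_set))
idx = torch.distributions.Categorical(probs).sample()
steer, gas, brake = action_set[idx.item()]        # continuous triple passed to env.step(...)
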
3 changes: 3 additions & 0 deletions environment.py
@@ -21,6 +21,9 @@ def __init__(self, device, stack_frames=4, train=False):
self.env = FrameSkipper(self.env, 4)
print(self.env.observation_space)

def max_episode_steps(self):
return self.spec().max_episode_steps

def step(self, action):
return self.env.step(action)

10 changes: 9 additions & 1 deletion helpers.py
@@ -3,6 +3,7 @@
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay

@@ -27,4 +28,11 @@ def display_start():


def save_model(model, path):
torch.save(model.state_dict(), path)
torch.save(model.state_dict(), path)

def create_directory(path):
try:
os.mkdir(path)
print(f'Directory {path} has been created.')
except FileExistsError:
print(f'Directory {path} already exists.')
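
A side note on the design (not part of the diff): create_directory only creates the final path component and reports what happened. A roughly equivalent alternative, not what this PR uses, would be os.makedirs with exist_ok:

import os

os.makedirs('params', exist_ok=True)  # also creates parent dirs, silent if it already exists
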
18 changes: 13 additions & 5 deletions main.py
@@ -10,21 +10,29 @@
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#for concurrent runs and logging
experiment='RL-baseline-v4'

if __name__ == "__main__":
hyperparams = {
'num_episodes': 40000, # Number of training episodes
'lr': 1e-2, # Learning rate
'num_episodes': 20000, # Number of training episodes
'lr': 1e-3, # Learning rate
'gamma': 0.99, # Discount rate
'log_interval': 5, # controls how often we log progress
'log_interval': 10, # controls how often we log progress
'stack_frames': 4,
'device': device,
'params_path': './params/policy-params.dl',
'experiment':experiment,
'params_path': f'./params/policy-params-{experiment}.dl',
'action_set_num': 0,
'train': True
}

#make sure that params folder exists
helpers.create_directory('params')

env = CarRacingEnv(device, hyperparams['stack_frames'], hyperparams['train'])
helpers.display_start()
if(hyperparams['train']):
if hyperparams['train']:
trainer = Trainer(env, hyperparams)
trainer.train()
else:
Binary file removed params/policy-params.dl
43 changes: 30 additions & 13 deletions policy.py
@@ -9,28 +9,39 @@ class Policy(nn.Module):
def __init__(self, actor_output, critic_output, inputs=4):
super(Policy, self).__init__()
self.pipeline = nn.Sequential(
nn.Conv2d(inputs, 32, 3), # [32, 94, 94]
nn.Conv2d(inputs, 12, kernel_size=3, stride=2, padding=1), # [12, 48, 48]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 47, 47]
nn.Conv2d(32, 64, 4), # [64, 44, 44]
nn.MaxPool2d(2), # [12, 24, 24]
nn.Conv2d(12, 24, kernel_size=3), # [24, 22, 22]
nn.ReLU(),
nn.MaxPool2d(2), # [64, 22, 22]
nn.MaxPool2d(2), # [24, 11, 11]
nn.Conv2d(24, 32, 4), # [32, 8, 8]
nn.ReLU(),
nn.MaxPool2d(2), # [32, 4, 4]
nn.Flatten(),
nn.Linear(64 * 22 * 22, 512),
nn.Linear(32 * 4 * 4, 256), # [ 512, 256 ]
nn.ReLU(),
nn.Linear(256, 128),
nn.ReLU(),
nn.Linear(512, 128),
# nn.LogSoftmax(dim=-1)
nn.ReLU()
)

# actor's layer
self.actor_head = nn.Linear(128, actor_output)
self.actor_head = nn.Sequential(
nn.Linear(128, 64),
nn.ReLU(),
nn.Linear(64, actor_output)
)

# critic's layer
self.critic_head = nn.Linear(128, critic_output)
self.critic_head = nn.Sequential(
nn.Linear(128, 64),
nn.ReLU(),
nn.Linear(64, critic_output)
)

self.saved_log_probs = []
self.rewards = []
self.entropies = []

def forward(self, x):

@@ -46,28 +57,34 @@ def forward(self, x):
# 1. a list with the probability of each action over the action space
# 2. the value from state s_t
return action_prob, state_values
# return self.pipeline(x)

def load_checkpoint(self, params_path):
epoch = 0
running_reward = 10
optim_params = None
if path.exists(params_path):
params_descriptor = torch.load(params_path)
epoch = 0
running_reward = 0
if 'params' in params_descriptor:
self.load_state_dict(params_descriptor['params'])
optim_params = params_descriptor['optimizer_params']
epoch = params_descriptor['epoch']
running_reward = params_descriptor['running_reward']
else:
self.load_state_dict(params_descriptor)

print("Model params are loaded now")
else:
print("Params not found: training from scratch")

return epoch
return epoch, optim_params, running_reward

def save_checkpoint(self, params_path, epoch):
def save_checkpoint(self, params_path, epoch, running_reward, optimizer):
torch.save({
'epoch': epoch,
'params': self.state_dict(),
'running_reward': running_reward,
'optimizer_params': optimizer.state_dict(),
}, params_path)
print("Relax, params are saved now")
18 changes: 11 additions & 7 deletions runner.py
@@ -2,31 +2,35 @@
import numpy as np

from policy import Policy
from actions import available_actions
from actions import get_action

class Runner:
def __init__(self, env, config):
super().__init__()
self.env = env
self.config = config
self.input_channels = config['stack_frames']
#self.device = config['device']
self.policy = Policy(self.input_channels, len(available_actions))
self.policy.load_checkpoint(config['params_path'])
self.device = config['device']
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

def select_action(self, state):
if state is None: # First state is always None
# Adding the starting signal as a 0's tensor
state = np.zeros((self.input_channels, 96, 96))
else:
state = np.asarray(state)
state = torch.from_numpy(state).float().unsqueeze(0)
probs = self.policy(state)
state = torch.from_numpy(state).float().unsqueeze(0).view(1, self.input_channels, 96, 96).to(self.device)
probs, state_value = self.policy(state)
# We pick the action from a sample of the probabilities
# It prevents the model from picking always the same action
m = torch.distributions.Categorical(probs)
action = m.sample()
return available_actions[action.item()]
return self.action_set[action.item()]

def run(self):
state, done, total_rew = self.env.reset(), False, 0
61 changes: 33 additions & 28 deletions trainer.py
@@ -6,7 +6,7 @@


from policy import Policy
from actions import available_actions
from actions import get_action


class Trainer:
@@ -20,29 +20,29 @@ def __init__(self, env, config):
self.input_channels = config['stack_frames']
self.device = config['device']
self.writer = SummaryWriter(flush_secs=5)
self.policy = Policy(len(available_actions), 1, self.input_channels).to(self.device)
self.last_epoch = self.policy.load_checkpoint(config['params_path'])
self.action_set = get_action(config['action_set_num'])
self.policy = Policy(len(self.action_set), 1, self.input_channels).to(self.device)
self.last_epoch, optim_params, self.running_reward = self.policy.load_checkpoint(config['params_path'])
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=config['lr'])
self.experiment = config['experiment']
if optim_params is not None:
self.optimizer.load_state_dict(optim_params)

def select_action(self, state):
if state is None: # First state is always None
# Adding the starting signal as a 0's tensor
state = np.zeros((self.input_channels, 96, 96))
else:
state = np.asarray(state)
# state = torch.from_numpy(state).float().unsqueeze(0)
state = torch.from_numpy(state).float().unsqueeze(0).view(1, self.input_channels, 96, 96).to(self.device)
# probs = self.policy(state)
probs, state_value = self.policy(state)
# We pick the action from a sample of the probabilities
# It prevents the model from picking always the same action
m = torch.distributions.Categorical(probs)
action = m.sample()
#print(m.log_prob(action))
#self.policy.saved_log_probs.append(m.log_prob(action))
self.policy.saved_log_probs.append(self.SavedAction(m.log_prob(action), state_value))
#print(self.policy.saved_log_probs)
return available_actions[action.item()]
self.policy.entropies.append(m.entropy().item())
return self.action_set[action.item()]

def episode_train(self, iteration):
g = 0
@@ -58,37 +58,39 @@ def episode_train(self, iteration):
# Normalize returns (this usually accelerates convergence)
eps = np.finfo(np.float32).eps.item()
returns = (returns - returns.mean()) / (returns.std() + eps)
# for log_prob, G in zip(self.policy.saved_log_probs, returns):
for (log_prob, baseline) ,G in zip(self.policy.saved_log_probs, returns):
# policy_loss.append(-G * log_prob)
baseline = baseline.to(self.device)
log_prob = log_prob.to(self.device)

advantage = G - baseline.item()
# calculate actor (policy) loss

# calculate actor (policy) loss
policy_loss.append(-log_prob * advantage)

# calculate critic (value) loss using L1 smooth loss
value_losses.append(F.smooth_l1_loss(baseline, torch.tensor([G]).to(self.device)))
# calculate critic (value) loss using L1 smooth loss
value_losses.append(F.smooth_l1_loss(baseline.squeeze(), G))

# Update policy:
self.optimizer.zero_grad()
#policy_loss = torch.cat(policy_loss).sum()
policy_loss = torch.stack(policy_loss).sum() + torch.stack(value_losses).sum()
self.writer.add_scalar('loss', policy_loss.item(), iteration)
self.writer.add_scalar(f'{self.experiment}/loss', policy_loss.item(), iteration)
policy_loss.backward()
self.optimizer.step()
del self.policy.rewards[:]
del self.policy.saved_log_probs[:]
del self.policy.entropies[:]

def train(self):
# Training loop
print("Target reward: {}".format(self.env.spec().reward_threshold))
running_reward = 10
ep_rew_history = []
for i_episode in range(self.config['num_episodes'] - self.last_epoch):
# Convert to 1-indexing to reduce complexity
i_episode+=1
# The episode counting starts from last checkpoint
i_episode = i_episode + self.last_epoch
# Collect experience
state, ep_reward = self.env.reset(), 0
for t in range(self.env.spec().max_episode_steps): # Protecting from scenarios where you are mostly stopped
for t in range(self.env.max_episode_steps()): # Protecting from scenarios where you are mostly stopped
action = self.select_action(state)
state, reward, done, _ = self.env.step(action)
self.policy.rewards.append(reward)
@@ -98,20 +100,23 @@ def train(self):
break

# Update running reward
running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
self.running_reward = 0.05 * ep_reward + (1 - 0.05) * self.running_reward

# Plotting
self.writer.add_scalar(f'{self.experiment}/reward', ep_reward, i_episode)
self.writer.add_scalar(f'{self.experiment}/running reward', self.running_reward, i_episode)
self.writer.add_scalar(f'{self.experiment}/mean entropy', np.mean(self.policy.entropies), i_episode)

# Perform training step
self.episode_train(i_episode)
ep_rew_history.append((i_episode, ep_reward))
self.writer.add_scalar('reward', ep_reward, i_episode)
self.writer.add_scalar('running reward', running_reward, i_episode)
if i_episode % self.config['log_interval'] == 0:

if i_episode % self.config['log_interval'] == 0 or i_episode == self.config['num_episodes'] or self.running_reward > self.env.spec().reward_threshold:
print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
i_episode, ep_reward, running_reward))
self.policy.save_checkpoint(self.config['params_path'], i_episode)
i_episode, ep_reward, self.running_reward))
self.policy.save_checkpoint(self.config['params_path'], i_episode, self.running_reward, self.optimizer)

if running_reward > self.env.spec().reward_threshold:
if self.running_reward > self.env.spec().reward_threshold:
print("Solved!")
break

print("Finished training! Running reward is now {:.2f}".format(running_reward))
print("Finished training! Running reward is now {:.2f}".format(self.running_reward))