improve continuous-ppo #44

Open · wants to merge 2 commits into master
ppo-continuous.py: 203 changes (89 additions, 114 deletions)
@@ -3,87 +3,74 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np

#Hyperparameters
learning_rate = 0.0003
gamma = 0.9
lmbda = 0.9
eps_clip = 0.2
K_epoch = 10
rollout_len = 3
buffer_size = 30
minibatch_size = 32
entropy_coef = 1e-2
critic_coef = 1
learning_rate = 0.0003
gamma = 0.9
lmbda = 0.9
eps_clip = 0.2
K_epoch = 10
T_horizon = 20


class PPO(nn.Module):
def __init__(self):
super(PPO, self).__init__()
self.data = []

self.fc1 = nn.Linear(3,128)
self.fc_mu = nn.Linear(128,1)
self.fc_std = nn.Linear(128,1)
self.fc_v = nn.Linear(128,1)
self.fc1 = nn.Linear(3,64)
self.fc2 = nn.Linear(64,256)
self.fc_v = nn.Linear(256,1)
self.fc_pi = nn.Linear(256,1)
self.fc_sigma = nn.Linear(256,1)


self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
self.optimization_step = 0

def pi(self, x, softmax_dim = 0):
def pi(self, x):
x = F.relu(self.fc1(x))
mu = 2.0*torch.tanh(self.fc_mu(x))
std = F.softplus(self.fc_std(x))
return mu, std

x = F.relu(self.fc2(x))
mu = 2 * F.tanh(self.fc_pi(x))
sigma = F.softplus(self.fc_sigma(x)) +1e-3

return mu,sigma

def v(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
v = self.fc_v(x)
return v

def put_data(self, transition):
self.data.append(transition)

def make_batch(self):
s_batch, a_batch, r_batch, s_prime_batch, prob_a_batch, done_batch = [], [], [], [], [], []
data = []

for j in range(buffer_size):
for i in range(minibatch_size):
rollout = self.data.pop()
s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []

for transition in rollout:
s, a, r, s_prime, prob_a, done = transition

s_lst.append(s)
a_lst.append([a])
r_lst.append([r])
s_prime_lst.append(s_prime)
prob_a_lst.append([prob_a])
done_mask = 0 if done else 1
done_lst.append([done_mask])

s_batch.append(s_lst)
a_batch.append(a_lst)
r_batch.append(r_lst)
s_prime_batch.append(s_prime_lst)
prob_a_batch.append(prob_a_lst)
done_batch.append(done_lst)

mini_batch = torch.tensor(s_batch, dtype=torch.float), torch.tensor(a_batch, dtype=torch.float), \
torch.tensor(r_batch, dtype=torch.float), torch.tensor(s_prime_batch, dtype=torch.float), \
torch.tensor(done_batch, dtype=torch.float), torch.tensor(prob_a_batch, dtype=torch.float)
data.append(mini_batch)

return data

def calc_advantage(self, data):
data_with_adv = []
for mini_batch in data:
s, a, r, s_prime, done_mask, old_log_prob = mini_batch
with torch.no_grad():
td_target = r + gamma * self.v(s_prime) * done_mask
delta = td_target - self.v(s)
delta = delta.numpy()
s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
for transition in self.data:
s, a, r, s_prime, prob_a, done = transition

s_lst.append(s)
a_lst.append([a])
r_lst.append([r])
s_prime_lst.append(s_prime)
prob_a_lst.append([prob_a])
done_mask = 0 if done else 1
done_lst.append([done_mask])

s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
self.data = []
return s, a, r, s_prime, done_mask, prob_a

def train_net(self):
s, a, r, s_prime, done_mask, old_log_prob = self.make_batch()
for i in range(K_epoch):
td_target = r + gamma * self.v(s_prime) * done_mask
delta = td_target - self.v(s)
delta = delta.detach().numpy()

advantage_lst = []
advantage = 0.0
@@ -92,70 +79,58 @@ def calc_advantage(self, data):
advantage_lst.append([advantage])
advantage_lst.reverse()
advantage = torch.tensor(advantage_lst, dtype=torch.float)
data_with_adv.append((s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage))

return data_with_adv


def train_net(self):
if len(self.data) == minibatch_size * buffer_size:
data = self.make_batch()
data = self.calc_advantage(data)

for i in range(K_epoch):
for mini_batch in data:
s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage = mini_batch

mu, std = self.pi(s, softmax_dim=1)
dist = Normal(mu, std)
log_prob = dist.log_prob(a)
ratio = torch.exp(log_prob - old_log_prob) # a/b == exp(log(a)-log(b))

surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target)

self.optimizer.zero_grad()
loss.mean().backward()
nn.utils.clip_grad_norm_(self.parameters(), 1.0)
self.optimizer.step()
self.optimization_step += 1

def main():
env = gym.make('Pendulum-v0')
model = PPO()
curr_mu,curr_sigma = self.pi(s)

curr_dist = torch.distributions.Normal(curr_mu,curr_sigma)
curr_log_prob = curr_dist.log_prob(a)
entropy = curr_dist.entropy() * entropy_coef

ratio = torch.exp(curr_log_prob - old_log_prob.detach())
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
actor_loss = (-torch.min(surr1, surr2) - entropy).mean()
critic_loss = critic_coef * F.smooth_l1_loss(self.v(s).float() , td_target.detach().float())
loss = actor_loss + critic_loss
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()


env = gym.make('Pendulum-v0')
model = PPO()

print_interval = 20

def main(render = False):
score = 0.0
print_interval = 20
rollout = []

global_step = 0
for n_epi in range(10000):

s = env.reset()
done = False
while not done:
for t in range(rollout_len):
mu, std = model.pi(torch.from_numpy(s).float())
dist = Normal(mu, std)
a = dist.sample()
log_prob = dist.log_prob(a)
s_prime, r, done, info = env.step([a.item()])

rollout.append((s, a, r/10.0, s_prime, log_prob.item(), done))
if len(rollout) == rollout_len:
model.put_data(rollout)
rollout = []

for t in range(T_horizon):
global_step += 1
if render:
env.render()
mu,sigma = model.pi(torch.from_numpy(s).float())
dist = torch.distributions.Normal(mu,sigma)

action = dist.sample()
log_prob = dist.log_prob(action)
s_prime, r, done, info = env.step([action.item()])

model.put_data((s, action, r/10.0, s_prime, \
log_prob, done))
s = s_prime

score += r
if done:
break

model.train_net()

if n_epi%print_interval==0 and n_epi!=0:
print("# of episode :{}, avg score : {:.1f}, opt step: {}".format(n_epi, score/print_interval, model.optimization_step))
print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
score = 0.0

env.close()

if __name__ == '__main__':
main()
main()
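
Because the raw diff interleaves removed and added lines, the shape of the new update rule can be hard to read off directly. The sketch below is a minimal, self-contained restatement of what the revised train_net computes: GAE-style advantages built by a backward pass over TD errors, the clipped PPO surrogate with an entropy bonus scaled by entropy_coef, and a smooth-L1 critic loss scaled by critic_coef. Function names, tensor shapes, and the toy data in the smoke test are assumptions for illustration only, not part of the PR.

import torch
import torch.nn.functional as F
from torch.distributions import Normal

# Hyperparameters mirroring the values used in the diff.
gamma, lmbda, eps_clip = 0.9, 0.9, 0.2
entropy_coef, critic_coef = 1e-2, 1.0

def gae(deltas):
    # Backward pass over TD errors: A_t = delta_t + gamma * lmbda * A_{t+1}
    advantage, advantage_lst = 0.0, []
    for delta_t in reversed(deltas.tolist()):
        advantage = gamma * lmbda * advantage + delta_t[0]
        advantage_lst.append([advantage])
    advantage_lst.reverse()
    return torch.tensor(advantage_lst, dtype=torch.float)

def ppo_loss(mu, sigma, a, old_log_prob, advantage, v_s, td_target):
    # Clipped surrogate plus entropy bonus (actor) and smooth-L1 loss (critic).
    dist = Normal(mu, sigma)
    log_prob = dist.log_prob(a)
    ratio = torch.exp(log_prob - old_log_prob.detach())  # pi_new / pi_old
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
    actor_loss = (-torch.min(surr1, surr2) - entropy_coef * dist.entropy()).mean()
    critic_loss = critic_coef * F.smooth_l1_loss(v_s, td_target.detach())
    return actor_loss + critic_loss

if __name__ == '__main__':
    # Hypothetical rollout of length 5 with [T, 1] tensors, as a quick smoke test.
    T = 5
    mu, sigma = torch.zeros(T, 1), torch.ones(T, 1)
    a = Normal(mu, sigma).sample()
    old_log_prob = Normal(mu, sigma).log_prob(a)
    v_s, td_target = torch.zeros(T, 1), torch.randn(T, 1)
    advantage = gae(td_target - v_s)
    print(ppo_loss(mu, sigma, a, old_log_prob, advantage, v_s, td_target).item())

Keeping old_log_prob and td_target detached matches the diff's intent: gradients flow only through the current policy's log-probability and the current value estimate, not through the stored rollout data or the bootstrapped target.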