reinforce_continuous.py
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
from torch.autograd import Variable

pi = Variable(torch.FloatTensor([math.pi])).cuda()
def normal(x, mu, sigma_sq):
    """Density of a Gaussian N(mu, sigma_sq) evaluated at x."""
    a = (-1 * (Variable(x) - mu).pow(2) / (2 * sigma_sq)).exp()
    b = 1 / (2 * sigma_sq * pi.expand_as(sigma_sq)).sqrt()
    return a * b
class Policy(nn.Module):
    """Gaussian policy: maps a state to the mean and (pre-softplus) variance
    of a normal distribution over continuous actions."""

    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)   # mean head
        self.linear2_ = nn.Linear(hidden_size, num_outputs)  # variance head

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2(x)
        sigma_sq = self.linear2_(x)
        return mu, sigma_sq
class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        mu, sigma_sq = self.model(Variable(state).cuda())
        sigma_sq = F.softplus(sigma_sq)  # keep the variance positive

        # sample an action: a = mu + sigma * eps, with eps ~ N(0, 1)
        eps = torch.randn(mu.size())
        action = (mu + sigma_sq.sqrt() * Variable(eps).cuda()).data

        # log-probability of the sampled action under the current policy
        prob = normal(action, mu, sigma_sq)
        log_prob = prob.log()
        # entropy of a Gaussian: 0.5 * (log(2 * pi * sigma^2) + 1)
        entropy = 0.5 * ((2 * pi.expand_as(sigma_sq) * sigma_sq).log() + 1)

        return action, log_prob, entropy
    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        # walk backwards through the episode to accumulate discounted returns
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            # policy-gradient loss with a small entropy bonus for exploration
            loss = loss \
                - (log_probs[i] * (Variable(R).expand_as(log_probs[i])).cuda()).sum() \
                - (0.0001 * entropies[i].cuda()).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm_(self.model.parameters(), 40)
        self.optimizer.step()
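

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). It assumes the old
# Gym API (env.reset() returning only the observation, env.step() returning a
# 4-tuple) and a CUDA-capable machine, since the Policy above is moved to the
# GPU unconditionally. The environment name "MountainCarContinuous-v0" and
# the hyperparameters below are illustrative assumptions only.
if __name__ == "__main__":
    import gym

    env = gym.make("MountainCarContinuous-v0")
    agent = REINFORCE(hidden_size=128,
                      num_inputs=env.observation_space.shape[0],
                      action_space=env.action_space)

    for episode in range(100):
        state = torch.Tensor([env.reset()])
        log_probs, rewards, entropies = [], [], []
        done = False
        while not done:
            action, log_prob, entropy = agent.select_action(state)
            next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)
            state = torch.Tensor([next_state])
        agent.update_parameters(rewards, log_probs, entropies, gamma=0.99)
        print("episode %d\treturn %.2f" % (episode, sum(rewards)))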