
Commit 1c64881

Author: zilong
Message: second commit
1 parent de7c465, commit 1c64881

12 files changed: +189 −30 lines

ActorNetwork.py

+9 −15

@@ -10,28 +10,22 @@
 HIDDEN2_UNITS = 600
 
 class ActorNetwork(nn.Module):
-    def __init__(self, sess, state_size, action_size, BATCH_SIZE, TAU, LEARNING_RATE):
-        super(Net, self).__init__()
-        self.fc1 = nn.Linear(state_size, HIDEEN1_UNITS)
+    def __init__(self, state_size):
+        super(ActorNetwork, self).__init__()
+        self.fc1 = nn.Linear(state_size, HIDDEN1_UNITS)
         self.fc2 = nn.Linear(HIDDEN1_UNITS, HIDDEN2_UNITS)
         self.steering = nn.Linear(HIDDEN2_UNITS, 1)
-        self.acceleration = nn.Linear(HIDDEN1_UNITS, 1)
-        self.brake = nn.Linear(HIDDEN1_UNITS, 1)
-
-        self.sess = sess
-        self.BATCH_SIZE = BATCH_SIZE
-        self.TAU = TAU
-        self.LEARNING_RATE = LEARNING_RATE
+        self.acceleration = nn.Linear(HIDDEN2_UNITS, 1)
+        self.brake = nn.Linear(HIDDEN2_UNITS, 1)
 
 
     def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
-        out = F.tanh(self.steering(x))
-        out2 = F.sigmoid(self.acceleration(x))
-        out3 = F.sigmoid(self.brake(x))
-        out += out2
-        out += out3
+        out1 = t.tanh(self.steering(x))
+        out2 = t.sigmoid(self.acceleration(x))
+        out3 = t.sigmoid(self.brake(x))
+        out = t.cat((out1, out2, out3), 1)
         return out
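
Note on the change: the rewritten forward pass no longer sums the three heads into one value; it concatenates them, so the actor returns a single action tensor with columns [steering, acceleration, brake], where steering is squashed by tanh into [-1, 1] and acceleration/brake by sigmoid into [0, 1]. A minimal smoke test of the new interface (a sketch; the 29-dimensional state size is taken from test.py, the dummy batch is an assumption for illustration):

    import torch
    from ActorNetwork import ActorNetwork

    state_size = 29                   # low-dimensional TORCS observation, as in test.py
    actor = ActorNetwork(state_size)

    s = torch.randn(4, state_size)    # dummy batch of 4 states
    a = actor(s)                      # shape (4, 3): [steering, acceleration, brake]
    assert a.shape == (4, 3)
    print(a)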

CriticNetwork.py

+13 −14

@@ -1,27 +1,26 @@
 import numpy as np
 import math
-import torch as t
+import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 HIDDEN1_UNITS = 300
 HIDDEN2_UNITS = 600
 
 class CriticNetwork(nn.Module):
-    def __init__(self, sess, state_size, action_size, action_dim, BATCH_SIZE, TAU, LEARNING_RATE):
-        super(Net, self).__init__()
+    def __init__(self, state_size, action_size):
+        super(CriticNetwork, self).__init__()
         self.w1 = nn.Linear(state_size, HIDDEN1_UNITS)
-        self.a1 = nn.Linear(action_dim, HIDDEN2_UNITS)
+        self.a1 = nn.Linear(action_size, HIDDEN2_UNITS)
         self.h1 = nn.Linear(HIDDEN1_UNITS, HIDDEN2_UNITS)
         self.h3 = nn.Linear(HIDDEN2_UNITS, HIDDEN2_UNITS)
-        self.V = nn.Linear(HIDDEN2_UNITS, action_dim)
+        self.V = nn.Linear(HIDDEN2_UNITS, action_size)
 
-        self.sess = sess
-        self.BATCH_SIZE = BATCH_SIZE
-        self.TAU = TAU
-        self.LEARNING_RATE = LEARNING_RATE
-        self.action_size = action_size
-
-    def forward(self, x):
-        x = F.relu(self.w1(x))
-        x =
+    def forward(self, s, a):
+        w1 = F.relu(self.w1(s))
+        a1 = self.a1(a)
+        h1 = self.h1(w1)
+        h2 = h1 + a1
+        h3 = F.relu(self.h3(h2))
+        out = self.V(h3)
+        return out
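
With this change the critic consumes the state and the action as two inputs and fuses them inside forward: the state passes through w1 and h1, the action through a1, the two branches are added, then h3 and the output layer V. A small call sketch mirroring the critic(states, actions) usage in test.py (sizes taken from test.py; the dummy tensors are assumptions for illustration):

    import torch
    from CriticNetwork import CriticNetwork

    state_size, action_size = 29, 3
    critic = CriticNetwork(state_size, action_size)

    s = torch.randn(32, state_size)   # dummy batch of states
    a = torch.randn(32, action_size)  # dummy batch of actions
    q = critic(s, a)                  # shape (32, 3), since V maps HIDDEN2_UNITS to action_size
    print(q.shape)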

ReplayBuffer.py

+1 −1

@@ -18,7 +18,7 @@ def size(self):
         return self.buffer_size
 
     def add(self, state, action, reward, new_state, done):
-        experience = (state, action, rewward, new_state, done)
+        experience = (state, action, reward, new_state, done)
         if self.num_experiences < self.buffer_size:
             self.buffer.append(experience)
             self.num_experiences += 1
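
For reference, the fixed add is exercised from test.py roughly like this (a sketch; getBatch is not part of this diff but is called in test.py and is assumed to return a list of (state, action, reward, new_state, done) tuples):

    import numpy as np
    from ReplayBuffer import ReplayBuffer

    buff = ReplayBuffer(1000)                    # BUFFER_SIZE, as in test.py
    s = np.zeros(29)                             # dummy state
    a = np.zeros(3)                              # dummy action
    buff.add(s, a, 0.0, s, False)                # (state, action, reward, new_state, done)
    batch = buff.getBatch(32)
    rewards = np.asarray([e[2] for e in batch])  # unpacking mirrors test.py
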

1.16 KB. Binary file not shown.
1.05 KB. Binary file not shown.

__pycache__/OU.cpython-36.pyc

476 Bytes. Binary file not shown.
1.36 KB. Binary file not shown.

__pycache__/gym_torcs.cpython-36.pyc

5.72 KB. Binary file not shown.
14.9 KB. Binary file not shown.

autostart.sh

+12

@@ -0,0 +1,12 @@
+#!/bin/bash
+xte 'key Return'
+xte 'usleep 100000'
+xte 'key Return'
+xte 'usleep 100000'
+xte 'key Up'
+xte 'usleep 100000'
+xte 'key Up'
+xte 'usleep 100000'
+xte 'key Return'
+xte 'usleep 100000'
+xte 'key Return'

ddpg.py

+4

@@ -2,6 +2,9 @@
 import numpy as np
 import random
 import argparse
+import torch
+import torchvision
+import torch.nn as nn
 
 from ReplayBuffer import ReplayBuffer
 from ActorNetwork import ActorNetwork
@@ -37,6 +40,7 @@ def palygame(train_indicator = 0):
 
     #Torch GPU optimization
     #to do
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
     env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
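
The added device line selects a GPU when one is available and falls back to the CPU otherwise. A brief sketch of the intended follow-up, mirroring how test.py moves the networks and input tensors onto the same device (the 29-dimensional state size is taken from test.py):

    import torch
    from ActorNetwork import ActorNetwork

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    actor = ActorNetwork(29).to(device)          # move parameters to the selected device
    state = torch.zeros(1, 29, device=device)    # inputs must live on the same device
    action = actor(state)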

test.py

+150

@@ -0,0 +1,150 @@
+import torch
+from torch.autograd import Variable
+import numpy as np
+import random
+from gym_torcs import TorcsEnv
+import argparse
+import collections
+
+from ReplayBuffer import ReplayBuffer
+from ActorNetwork import ActorNetwork
+from CriticNetwork import CriticNetwork
+from OU import OU
+
+state_size = 29
+action_size = 3
+LRA = 0.0001
+LRC = 0.001
+BUFFER_SIZE = 1000  #to change
+BATCH_SIZE = 32
+GAMMA = 0.99
+EXPLORE = 10000
+epsilon = 1
+train_indicator = 1  # train or not
+TAU = 0.001
+
+VISION = False
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+OU = OU()
+
+actor = ActorNetwork(state_size).to(device)
+critic = CriticNetwork(state_size, action_size).to(device)
+buff = ReplayBuffer(BUFFER_SIZE)
+target_actor = ActorNetwork(state_size).to(device)
+target_critic = CriticNetwork(state_size, action_size).to(device)
+
+criterion_critic = torch.nn.MSELoss(reduction='sum')
+
+optimizer_actor = torch.optim.Adam(actor.parameters(), lr=LRA)
+optimizer_critic = torch.optim.Adam(critic.parameters(), lr=LRC)
+
+#env environment
+env = TorcsEnv(vision=VISION, throttle=True, gear_change=False)
+
+torch.set_default_tensor_type('torch.FloatTensor')
+
+for i in range(2000):
+
+    print(str(i) + "-th episode starts")
+
+    if np.mod(i, 3) == 0:
+        ob = env.reset(relaunch = True)
+    else:
+        ob = env.reset()
+
+    s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
+
+    for j in range(1000):
+        loss = 0
+        epsilon -= 1.0 / EXPLORE
+        a_t = np.zeros([1, action_size])
+        noise_t = np.zeros([1, action_size])
+
+        a_t_original = actor(torch.tensor(s_t.reshape(1, s_t.shape[0]), device=device).float())
+        a_t_original = a_t_original.data.numpy()
+        noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
+        noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
+        noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
+
+        a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
+        a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
+        a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
+
+        ob, r_t, done, info = env.step(a_t[0])
+
+        s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
+
+        #add to replay buffer
+        buff.add(s_t, a_t[0], r_t, s_t1, done)
+
+        batch = buff.getBatch(BATCH_SIZE)
+
+        states = torch.tensor(np.asarray([e[0] for e in batch]), device=device).float()  #torch.cat(batch[0])
+        actions = torch.tensor(np.asarray([e[1] for e in batch]), device=device).float()
+        rewards = torch.tensor(np.asarray([e[2] for e in batch]), device=device).float()
+        new_states = torch.tensor(np.asarray([e[3] for e in batch]), device=device).float()
+        dones = np.asarray([e[4] for e in batch])
+        y_t = torch.tensor(np.asarray([e[1] for e in batch]), device=device).float()
+
+        #use target network to calculate target_q_value
+        target_q_values = target_critic(new_states, target_actor(new_states))
+
+        for k in range(len(batch)):
+            if dones[k]:
+                y_t[k] = rewards[k]
+            else:
+                y_t[k] = rewards[k] + GAMMA * target_q_values[k]
+
+        if(train_indicator):
+
+            #training
+            q_values = critic(states, actions)
+            loss = criterion_critic(y_t, q_values)
+            optimizer_critic.zero_grad()
+            loss.backward()  ##for param in critic.parameters(): param.grad.data.clamp(-1, 1)
+            optimizer_critic.step()
+
+            a_for_grad = actor(states)
+            a_for_grad.requires_grad_()
+            q_values_for_grad = critic(states, a_for_grad)
+            critic.zero_grad()
+            q_values_for_grad.sum().backward()
+            grads = a_for_grad.grad  #a_for_grad is not a Variable, Variable input to varibale output?
+
+            act = actor(states)
+            actor.zero_grad()
+            act.sum().backward(grads)
+            optimizer_actor.step()
+
+            #soft update for target network
+            #actor_params = list(actor.parameters())
+            #critic_params = list(critic.parameters())
+            new_actor_state_dict = collections.OrderedDict()
+            new_critic_state_dict = collections.OrderedDict()
+            for var_name in target_actor.state_dict():
+                new_actor_state_dict[var_name] = TAU * actor.state_dict()[var_name] + (1-TAU) * target_actor.state_dict()[var_name]
+            target_actor.load_state_dict(new_actor_state_dict)
+
+            for var_name in target_critic.state_dict():
+                new_critic_state_dict[var_name] = TAU * critic.state_dict()[var_name] + (1-TAU) * target_critic.state_dict()[var_name]
+            target_critic.load_state_dict(new_critic_state_dict)
+
+        s_t = s_t1
+
+        if done:
+            break
+
+    if np.mod(1, 3) == 0:
+        if (train_indicator):
+            print("saving model")
+            torch.save(actor.state_dict(), 'actormodel.pth')
+            torch.save(critic.state_dict(), 'criticmodel.pth')
+
+
+env.end()
+print("Finish.")
+
+#for param in critic.parameters(): param.grad.data.clamp(-1, 1)
+
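
The two state_dict loops near the end of test.py implement the standard soft (Polyak) target update, theta_target <- tau * theta + (1 - tau) * theta_target, for both the actor and the critic. A sketch of the same step factored into a reusable helper, assuming (as holds here) that each target network shares its state_dict keys with its source network:

    import collections

    def soft_update(source, target, tau):
        # theta_target <- tau * theta_source + (1 - tau) * theta_target
        new_state = collections.OrderedDict()
        for name, target_param in target.state_dict().items():
            new_state[name] = tau * source.state_dict()[name] + (1 - tau) * target_param
        target.load_state_dict(new_state)

    # equivalent to the loops in test.py:
    # soft_update(actor, target_actor, TAU)
    # soft_update(critic, target_critic, TAU)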
