jastfkjg
diff --git a/‎ActorNetwork.py
+9-15 b/‎ActorNetwork.py
+9-15
diff --git a/‎CriticNetwork.py
+13-14 b/‎CriticNetwork.py
+13-14
diff --git a/‎ReplayBuffer.py
+1-1 b/‎ReplayBuffer.py
+1-1
diff --git a/‎__pycache__/ActorNetwork.cpython-36.pyc
1.16 KB b/‎__pycache__/ActorNetwork.cpython-36.pyc
1.16 KB
diff --git a/‎__pycache__/CriticNetwork.cpython-36.pyc
1.05 KB b/‎__pycache__/CriticNetwork.cpython-36.pyc
1.05 KB
diff --git a/‎__pycache__/OU.cpython-36.pyc
476 Bytes b/‎__pycache__/OU.cpython-36.pyc
476 Bytes
diff --git a/‎__pycache__/ReplayBuffer.cpython-36.pyc
1.36 KB b/‎__pycache__/ReplayBuffer.cpython-36.pyc
1.36 KB
diff --git a/‎__pycache__/gym_torcs.cpython-36.pyc
5.72 KB b/‎__pycache__/gym_torcs.cpython-36.pyc
5.72 KB
diff --git a/‎__pycache__/snakeoil3_gym.cpython-36.pyc
14.9 KB b/‎__pycache__/snakeoil3_gym.cpython-36.pyc
14.9 KB
diff --git a/‎autostart.sh
+12 b/‎autostart.sh
+12
diff --git a/‎ddpg.py
+4 b/‎ddpg.py
+4
diff --git a/‎test.py
+150 b/‎test.py
+150
@@ -10,28 +10,22 @@
 HIDDEN2_UNITS = 600
 
 class ActorNetwork(nn.Module):
-    def __init__(self, sess, state_size, action_size, BATCH_SIZE, TAU, LEARNING_RATE):
-        super(Net, self).__init__()
-        self.fc1 = nn.Linear(state_size, HIDEEN1_UNITS)
+    def __init__(self, state_size):  # only the input width is needed; the other constructor arguments were dropped
+        super(ActorNetwork, self).__init__()  # pass this class to super(); the removed line passed Net
+        self.fc1 = nn.Linear(state_size, HIDDEN1_UNITS)  # corrects the HIDEEN1_UNITS misspelling
         self.fc2 = nn.Linear(HIDDEN1_UNITS, HIDDEN2_UNITS)
         self.steering = nn.Linear(HIDDEN2_UNITS, 1)
-        self.acceleration = nn.Linear(HIDDEN1_UNITS, 1)
-        self.brake = nn.Linear(HIDDEN1_UNITS, 1)
-
-        self.sess = sess
-        self.BATCH_SIZE = BATCH_SIZE
-        self.TAU = TAU
-        self.LEARNING_RATE = LEARNING_RATE
+        self.acceleration = nn.Linear(HIDDEN2_UNITS, 1)  # fed from fc2, whose output is HIDDEN2_UNITS wide
+        self.brake = nn.Linear(HIDDEN2_UNITS, 1)  # fed from fc2, whose output is HIDDEN2_UNITS wide
 
 
     def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
-        out = F.tanh(self.steering(x))
-        out2 = F.sigmoid(self.acceleration(x))
-        out3 = F.sigmoid(self.brake(x))
-        out += out2
-        out += out3
+        out1 = t.tanh(self.steering(x))  # tanh keeps steering in (-1, 1); t is presumably "import torch as t" above this hunk - confirm
+        out2 = t.sigmoid(self.acceleration(x))  # sigmoid keeps acceleration in (0, 1)
+        out3 = t.sigmoid(self.brake(x))  # sigmoid keeps brake in (0, 1)
+        out = t.cat((out1, out2, out3), 1)  # join along dim 1: columns are steering, acceleration, brake (replaces the old in-place sum)
         return out
 
 
 
@@ -1,27 +1,26 @@
 import numpy as np
 import math
-import torch as t
+import torch 
 import torch.nn as nn
 import torch.nn.functional as F
 
 HIDDEN1_UNITS = 300
 HIDDEN2_UNITS = 600
 
 class CriticNetwork(nn.Module):
-    def __init__(self, sess, state_size, action_size, action_dim, BATCH_SIZE, TAU, LEARNING_RATE):
-        super(Net, self).__init__()
+    def __init__(self, state_size, action_size):  # input widths of the state branch and the action branch
+        super(CriticNetwork, self).__init__()  # pass this class to super(); the removed line passed Net
         self.w1 = nn.Linear(state_size, HIDDEN1_UNITS)
-        self.a1 = nn.Linear(action_dim, HIDDEN2_UNITS)
+        self.a1 = nn.Linear(action_size, HIDDEN2_UNITS)  # action branch, projected straight to HIDDEN2_UNITS
         self.h1 = nn.Linear(HIDDEN1_UNITS, HIDDEN2_UNITS)
         self.h3 = nn.Linear(HIDDEN2_UNITS, HIDDEN2_UNITS)
-        self.V = nn.Linear(HIDDEN2_UNITS, action_dim)
+        self.V = nn.Linear(HIDDEN2_UNITS, action_size)  # NOTE(review): emits action_size values per sample, not one scalar - confirm intended
 
-        self.sess = sess
-        self.BATCH_SIZE = BATCH_SIZE
-        self.TAU = TAU
-        self.LEARNING_RATE = LEARNING_RATE
-        self.action_size = action_size
-
-    def forward(self, x):
-        x = F.relu(self.w1(x))
-        x = 
+    def forward(self, s, a):  # s feeds the state_size-wide layer, a the action_size-wide layer
+        w1 = F.relu(self.w1(s))  # state branch, first layer with ReLU
+        a1 = self.a1(a)  # action branch, linear (no activation)
+        h1 = self.h1(w1)  # state branch lifted to HIDDEN2_UNITS, no activation
+        h2 = h1 + a1  # merge the two branches by element-wise addition
+        h3 = F.relu(self.h3(h2))  # shared hidden layer with ReLU
+        out = self.V(h3)  # linear output layer
+        return out
@@ -18,7 +18,7 @@ def size(self):
         return self.buffer_size
 
     def add(self, state, action, reward, new_state, done):
-        experience = (state, action, rewward, new_state, done)
+        experience = (state, action, reward, new_state, done)
         if self.num_experiences < self.buffer_size:
             self.buffer.append(experience)
             self.num_experiences += 1
 
@@ -0,0 +1,12 @@
+#!/bin/bash
+xte 'key Return'  # xte sends a synthetic key press to the X display; presumably this sequence steps through the TORCS start menus - confirm
+xte 'usleep 100000'  # pause 100000 microseconds (0.1 s) before the next key
+xte 'key Return'  # second Return press
+xte 'usleep 100000'  # 0.1 s pause
+xte 'key Up'  # move the selection up once
+xte 'usleep 100000'  # 0.1 s pause
+xte 'key Up'  # move the selection up a second time
+xte 'usleep 100000'  # 0.1 s pause
+xte 'key Return'  # confirm the selected entry
+xte 'usleep 100000'  # 0.1 s pause
+xte 'key Return'  # final Return press
@@ -2,6 +2,9 @@
 import numpy as np 
 import random 
 import argparse
+import torch
+import torchvision
+import torch.nn as nn
 
 from ReplayBuffer import ReplayBuffer
 from ActorNetwork import ActorNetwork
@@ -37,6 +40,7 @@ def palygame(train_indicator = 0):
 
     #Torch GPU optimization
     #to do
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
     env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
 
@@ -0,0 +1,150 @@
+"""Train a DDPG agent on the TORCS driving environment.
+
+Builds an actor, a critic and a target copy of each, then runs up to 2000
+episodes of at most 1000 steps. Every step is stored in a replay buffer and,
+when train_indicator is set, a sampled batch updates all four networks.
+"""
+
+import torch
+from torch.autograd import Variable
+import numpy as np
+import random
+from gym_torcs import TorcsEnv
+import argparse
+import collections
+
+from ReplayBuffer import ReplayBuffer
+from ActorNetwork import ActorNetwork
+from CriticNetwork import CriticNetwork
+from OU import OU
+
+state_size = 29  # must equal the length of the stacked observation vector
+action_size = 3  # width of the action vector passed to env.step
+LRA = 0.0001  # Adam learning rate for the actor
+LRC = 0.001  # Adam learning rate for the critic
+BUFFER_SIZE = 1000  #to change
+BATCH_SIZE = 32
+GAMMA = 0.99  # discount applied to the target critic's value
+EXPLORE = 10000  # epsilon shrinks by 1/EXPLORE per step
+epsilon = 1
+train_indicator = 1    # train or not
+TAU = 0.001  # weight of the online parameters in each soft target update
+
+VISION = False
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+OU = OU()
+
+actor = ActorNetwork(state_size).to(device)
+critic = CriticNetwork(state_size, action_size).to(device)
+buff = ReplayBuffer(BUFFER_SIZE)
+target_actor = ActorNetwork(state_size).to(device)
+target_critic = CriticNetwork(state_size, action_size).to(device)
+# Start both targets from the online weights (standard DDPG initialisation).
+target_actor.load_state_dict(actor.state_dict())
+target_critic.load_state_dict(critic.state_dict())
+
+criterion_critic = torch.nn.MSELoss(reduction='sum')
+
+optimizer_actor = torch.optim.Adam(actor.parameters(), lr=LRA)
+optimizer_critic = torch.optim.Adam(critic.parameters(), lr=LRC)
+
+#env environment
+env = TorcsEnv(vision=VISION, throttle=True, gear_change=False)
+
+torch.set_default_tensor_type('torch.FloatTensor')
+
+for i in range(2000):
+
+    print(str(i) + "-th episode starts")
+
+    if np.mod(i, 3) == 0:  # every third episode, reset with relaunch=True
+        ob = env.reset(relaunch = True)
+    else:
+        ob = env.reset()
+
+    s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
+
+    for j in range(1000):
+        loss = 0
+        epsilon -= 1.0 / EXPLORE
+        a_t = np.zeros([1, action_size])
+        noise_t = np.zeros([1, action_size])
+
+        a_t_original = actor(torch.tensor(s_t.reshape(1, s_t.shape[0]), device=device).float())
+        a_t_original = a_t_original.detach().cpu().numpy()  # .numpy() requires a CPU tensor without grad
+        noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
+        noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
+        noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
+
+        a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
+        a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
+        a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
+
+        ob, r_t, done, info = env.step(a_t[0])
+
+        s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
+
+        #add to replay buffer
+        buff.add(s_t, a_t[0], r_t, s_t1, done)
+
+        batch = buff.getBatch(BATCH_SIZE)
+
+        states = torch.tensor(np.asarray([e[0] for e in batch]), device=device).float()
+        actions = torch.tensor(np.asarray([e[1] for e in batch]), device=device).float()
+        rewards = torch.tensor(np.asarray([e[2] for e in batch]), device=device).float()
+        new_states = torch.tensor(np.asarray([e[3] for e in batch]), device=device).float()
+        dones = np.asarray([e[4] for e in batch])
+        y_t = torch.tensor(np.asarray([e[1] for e in batch]), device=device).float()  # action-shaped buffer; every row is overwritten below
+
+        # Bootstrap from the target networks; detach so no gradient reaches them.
+        target_q_values = target_critic(new_states, target_actor(new_states)).detach()
+
+        for k in range(len(batch)):
+            if dones[k]:
+                y_t[k] = rewards[k]
+            else:
+                y_t[k] = rewards[k] + GAMMA * target_q_values[k]
+
+        if(train_indicator):
+
+            # Critic update: regress Q(s, a) onto the bootstrapped targets.
+            q_values = critic(states, actions)
+            loss = criterion_critic(q_values, y_t)  # (input, target) order
+            optimizer_critic.zero_grad()
+            loss.backward()  # TODO: optionally clip the critic gradients to [-1, 1] here
+            optimizer_critic.step()
+
+            # Actor update: raise the critic's value of the actor's own actions.
+            # The earlier version read .grad from the non-leaf actor output (which
+            # is None) and so stepped on the gradient of the summed actions instead.
+            actor_loss = -critic(states, actor(states)).sum()
+            optimizer_actor.zero_grad()
+            actor_loss.backward()  # also writes critic grads; optimizer_critic.zero_grad() clears them next step
+            optimizer_actor.step()
+
+            # soft update: target <- TAU * online + (1 - TAU) * target
+            new_actor_state_dict = collections.OrderedDict()
+            new_critic_state_dict = collections.OrderedDict()
+            for var_name in target_actor.state_dict():
+                new_actor_state_dict[var_name] = TAU * actor.state_dict()[var_name] + (1-TAU) * target_actor.state_dict()[var_name]
+            target_actor.load_state_dict(new_actor_state_dict)
+
+            for var_name in target_critic.state_dict():
+                new_critic_state_dict[var_name] = TAU * critic.state_dict()[var_name] + (1-TAU) * target_critic.state_dict()[var_name]
+            target_critic.load_state_dict(new_critic_state_dict)
+
+        s_t = s_t1
+
+        if done:
+            break
+
+    if np.mod(i, 3) == 0:  # checkpoint every third episode (np.mod(1, 3) was never 0)
+        if (train_indicator):
+            print("saving model")
+            torch.save(actor.state_dict(), 'actormodel.pth')
+            torch.save(critic.state_dict(), 'criticmodel.pth')
+
+env.end()
+print("Finish.")