DDPG_analyse.py
import matplotlib
matplotlib.use('TkAgg')
import tensorflow as tf
import numpy as np
import gym
import time
import csv
import matplotlib.pyplot as plt
from cartpole_env import CartPoleEnv_adv
import scipy.io as scio
##################### hyper parameters ####################
MAX_EPISODES = 2000
MAX_EP_STEPS = 2000
LR_A = 0.001 # learning rate for actor
LR_C = 0.002 # learning rate for critic
GAMMA = 0.9 # reward discount
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
RENDER = True
# ENV_NAME = 'CartPole-v2'
env = CartPoleEnv_adv()
# env = gym.make(ENV_NAME)
env = env.unwrapped
############################### DDPG ####################################
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound

        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        self.a = self._build_a(self.S)      # online actor: parameters updated every step
        q = self._build_c(self.S, self.a)   # online critic: parameters updated every step
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))

        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft update operation
        # Target actor: parameters are not updated directly; it predicts the action
        # used inside the critic's Q_target
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)  # replaced target parameters
        # Target critic: parameters are not updated directly; it supplies the
        # gradient-ascent signal used when updating the actor
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)

        a_loss = - tf.reduce_mean(q)  # maximize q
        # Adam minimizes the (negative) Q with learning rate LR_A, over the actor parameters only
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
        self.q = tf.reshape(q, [-1])

        with tf.control_dependencies(target_update):  # soft replacement happens here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, "Model/cartpole_g10_M1_m0.1_l0.5_tau_0.02_final.ckpt")  # 1 0.1 0.5 0.001
        # self.saver.restore(self.sess, "Model/cartpole_plus.ckpt")  # 1 0.1 0.5 0.001

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        # Sample a random minibatch of stored transitions (s, a, r, s_)
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    # Action-selection (actor) network
    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 128, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    # Critic network
    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = 128
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)

    def show_q(self, s, a, r):
        # Evaluate Q(s, a) for a single state-action pair and print it next to the reward
        q_val = self.sess.run(self.q, {self.S: s[np.newaxis, :], self.a: a[np.newaxis, :]})
        print(q_val, r)
        return q_val, r
############################### analysis ####################################
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

EWMA_p = 0.95
EWMA = np.zeros((1, MAX_EPISODES + 1))
iteration = np.zeros((1, MAX_EPISODES + 1))
t1 = time.time()
Q = np.zeros(2000)
R = np.zeros(2000)

for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, 1), -a_bound, a_bound)  # add exploration noise
        s_, r, done, hit = env.step(a, i)
        # print(r)
        Q[j], R[j] = ddpg.show_q(s, a, r)  # record the critic's Q estimate and the reward
        s = s_
        ep_reward += r
print("Saved")
scio.savemat('QR',
{'Q': Q,
'R': R,})
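
# --- offline check (a minimal sketch, not part of the original script) ---------
# Assuming QR.mat has been written by the loop above, reload it and plot the
# recorded critic estimates Q(s, a) against the instantaneous rewards r,
# reusing the matplotlib / scipy.io imports at the top of the file.
data = scio.loadmat('QR')
plt.figure()
plt.plot(data['Q'].flatten(), label='Q(s, a)')
plt.plot(data['R'].flatten(), label='r')
plt.xlabel('step')
plt.legend()
plt.show()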