dqn2013.py
import time

import numpy as np
import tensorflow as tf
from tqdm import tqdm

import base
import enviroment
import history
import layers
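
# NOTE (illustrative, not part of this file): `layers.fc` is defined elsewhere in
# the repo. From the call sites below it is assumed to build a fully connected
# layer and return (output, weights, biases); a minimal sketch under that
# assumption:
#
#     def fc(x, n_neurons, activation=None):
#         n_in = int(x.get_shape()[-1])
#         w = tf.Variable(tf.truncated_normal([n_in, n_neurons], stddev=0.1))
#         b = tf.Variable(tf.zeros([n_neurons]))
#         y = tf.matmul(x, w) + b
#         return (activation(y) if activation is not None else y), w, b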

class DQN(base.AgentBase):
    # Modeling
    ENV_ID = 'CartPole-v0'
    STATE_SPACE = 4      # dimensionality of the CartPole-v0 observation
    HIDDEN_NEURONS = 20
    ACTION_SPACE = 2     # push cart left / push cart right

    # Training
    LR = 1e-3  # learning rate
    DF = 0.9   # discount factor (gamma)
    def __init__(self):
        super().__init__()
        # sub modules
        self._env = enviroment.make(self.ENV_ID)
        self._replay_buffer = history.ReplayBuffer()
        # build network
        self._sess = tf.Session()
        self._net = self._build_network()

    def __del__(self):
        self._sess.close()
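
    # NOTE (illustrative, not part of this file): `history.ReplayBuffer` is
    # defined elsewhere in the repo. From its use in `_perceive` below it is
    # assumed to expose roughly:
    #
    #     class ReplayBuffer:
    #         def insert(self, transition): ...          # [s, a, s', r, done]
    #         def is_full(self): ...                     # capacity reached?
    #         def get_batch(self, n_batch, n_lost): ...  # -> five column-wise batches
    #
    # The buffer capacity and the meaning of `n_lost` are not visible from this file.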

    def _build_network(self):
        with tf.name_scope('input'):
            state = tf.placeholder(dtype=tf.float32, shape=[None, self.STATE_SPACE])
        with tf.name_scope('hidden'):
            y1, _, _ = layers.fc(state, n_neurons=self.HIDDEN_NEURONS, activation=tf.nn.tanh)
        with tf.name_scope('q_value'):
            q_values, _, _ = layers.fc(y1, n_neurons=self.ACTION_SPACE)
        # loss: mean squared TD error on the Q-value of the action actually taken
        with tf.name_scope('loss'):
            action = tf.placeholder(tf.int32, [None])
            action_mask = tf.one_hot(action, depth=self.ACTION_SPACE, on_value=1.0, off_value=0.0, dtype=tf.float32)
            q_current = tf.reduce_sum(tf.multiply(q_values, action_mask), axis=1)
            q_target = tf.placeholder(tf.float32, [None])
            loss = tf.reduce_mean(tf.squared_difference(q_current, q_target))
            tf.summary.scalar('loss', loss)
        # train
        with tf.name_scope('train'):
            global_step = tf.Variable(0, trainable=False, name='global_step')
            train_step = tf.train.AdamOptimizer(learning_rate=self.LR).minimize(loss, global_step=global_step)
        # TensorBoard
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter('/tmp/tensorflow-drl/dqn/train', self._sess.graph)
        test_writer = tf.summary.FileWriter('/tmp/tensorflow-drl/dqn/test')
        # initialize all variables
        self._sess.run(tf.global_variables_initializer())
        # expose the tensors and writers needed elsewhere
        return {'state': state,
                'q_values': q_values,
                'action': action,
                'q_current': q_current,
                'q_target': q_target,
                'loss': loss,
                'train_step': train_step,
                'global_step': global_step,
                'merged': merged,
                'train_writer': train_writer,
                'test_writer': test_writer}
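
    # The graph above implements the DQN (2013) objective
    #     L = E[(y - Q(s, a))^2],  with  y = r + gamma * max_a' Q(s', a')
    # using a single network for both prediction and bootstrapping; the separate
    # target network only appears in the 2015 Nature variant.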

    def train(self, episodes=500, max_step=200):
        for episode in tqdm(range(episodes)):
            if episode % 50 == 0:
                # evaluation step (no rendering, no delay)
                total_reward = self._test_impl(max_step, delay=0, gui=False)
                tqdm.write('current reward: {total_reward}'.format(total_reward=total_reward))
            else:
                # train step
                self._train_impl(max_step)

    def test(self, episodes=1, max_step=200, delay=0.1, gui=True):
        for episode in range(episodes):
            total_reward = self._test_impl(max_step, delay, gui)
            print('current reward: {total_reward}'.format(total_reward=total_reward))

    def _random_action(self):
        # explore: sample uniformly from the action space
        return self._env.action_space.sample()

    def _optimal_action(self, state):
        # exploit: pick the action with the highest predicted Q-value
        q_values = self._sess.run(self._net['q_values'], feed_dict={self._net['state']: [state]})
        return np.argmax(q_values)
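
    # NOTE (illustrative, not part of this file): `base.AgentBase` presumably
    # combines the two action methods above into an epsilon-greedy policy during
    # training; a minimal sketch under that assumption:
    #
    #     def _egreedy_action(self, state, epsilon=0.1):
    #         if np.random.rand() < epsilon:
    #             return self._random_action()
    #         return self._optimal_action(state)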

    def _perceive(self, state, action, state_, reward, done):
        self._replay_buffer.insert([state, action, state_, reward, done])
        if self._replay_buffer.is_full():
            # sample a mini-batch of stored transitions
            state_batch, action_batch, next_state_batch, reward_batch, done_batch = \
                self._replay_buffer.get_batch(n_batch=128, n_lost=8)
            # CartPole rewards every step with +1, so mark failure explicitly
            reward_batch = [-1.0 if done else reward for reward, done in zip(reward_batch, done_batch)]
            # bootstrap from the current network's Q-values for the next states
            q_predict_batch = self._sess.run(self._net['q_values'], feed_dict={self._net['state']: next_state_batch})
            # q_target = reward if done else reward + df * max_a q_predict
            q_target_batch = reward_batch + np.multiply(np.subtract(1.0, done_batch),
                                                        self.DF * np.max(q_predict_batch, axis=1))
            # one gradient step, logging the loss summary for TensorBoard
            summary, _, loss = self._sess.run([self._net['merged'],
                                               self._net['train_step'],
                                               self._net['loss']],
                                              feed_dict={self._net['state']: state_batch,
                                                         self._net['action']: action_batch,
                                                         self._net['q_target']: q_target_batch})
            self._net['train_writer'].add_summary(summary,
                                                  tf.train.global_step(self._sess, self._net['global_step']))
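

# Usage sketch (assumed entry point; the file as shown only defines the class):
if __name__ == '__main__':
    agent = DQN()
    agent.train(episodes=500, max_step=200)  # evaluates every 50th episode
    agent.test(episodes=3)                   # render a few greedy episodes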