double_q_learning.py
from collections import defaultdict, Counter

import numpy as np
import plotly.graph_objs as go
import plotly.offline as py

from envs.DoubleQLearningEnv import DoubleQLearningEnv
from utils import epsilon_prob, randomargmax, Algorithm


class QLearning(Algorithm):
    """Tabular Q-learning with an epsilon-greedy behaviour policy."""

    def __init__(self, env, alpha=0.1, gamma=1, epsilon=0.1):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        # One action-value entry per (state, action) pair the environment offers.
        self.q = defaultdict(dict)
        for state in range(env.observation_space.n):
            actions = env.available_actions(state)
            for action in actions:
                self.q[state][action] = 0.0

    def action(self, state):
        # Epsilon-greedy: the greedy action receives the extra probability mass.
        greedy = self.greedy_action(state)
        actions = list(self.q[state].keys())
        probs = [epsilon_prob(greedy, action, len(actions), self.epsilon) for action in actions]
        return np.random.choice(actions, p=probs)

    def greedy_action(self, state):
        tmp = self.q[state]
        return randomargmax(tmp, key=tmp.get)

    def greedy_value(self, state):
        return self.q[state][self.greedy_action(state)]

    def on_new_state(self, state, action, reward, next_state, done):
        # TD(0) update towards reward + gamma * max_a Q(next_state, a).
        greedy_value = self.greedy_value(next_state) if not done else 0
        delta = self.gamma * greedy_value - self.q[state][action]
        self.q[state][action] += self.alpha * (reward + delta)

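# The update in QLearning.on_new_state corresponds to the standard tabular
# Q-learning target, shown here as a worked equation rather than code
# (notation S, A, R, S' for the current state, action, reward and next state):
#
#   Q(S, A) <- Q(S, A) + alpha * (R + gamma * max_a Q(S', a) - Q(S, A))
#
# with max_a Q(S', a) taken to be 0 when S' is terminal, matching the `done`
# branch above.
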
class DoubleQLearning(Algorithm):
    """Double Q-learning: two action-value tables updated in alternation."""

    def __init__(self, env, alpha=0.1, gamma=1, epsilon=0.1):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q1 = defaultdict(Counter)
        self.q2 = defaultdict(Counter)
        for state in range(env.observation_space.n):
            actions = env.available_actions(state)
            for action in actions:
                self.q1[state][action] = 0.0
                self.q2[state][action] = 0.0

    def action(self, state):
        # Behaviour policy is epsilon-greedy with respect to Q1 + Q2.
        greedy = self.greedy_action(state)
        actions = list(self.q1[state].keys())
        probs = [epsilon_prob(greedy, action, len(actions), self.epsilon) for action in actions]
        return np.random.choice(actions, p=probs)

    def greedy_action(self, state):
        # Counter.update adds the Q2 values to the Q1 values element-wise.
        tmp = Counter(self.q1[state])
        tmp.update(self.q2[state])
        return randomargmax(tmp, key=tmp.get)

    def greedy_action_q(self, q, state):
        return randomargmax(q[state], key=q[state].get)

    def on_new_state(self, state, action, reward, next_state, done):
        # With probability 0.5 update Q1 using Q2 to evaluate the greedy action,
        # and vice versa.
        q1, q2 = (self.q1, self.q2) if np.random.rand() < 0.5 else (self.q2, self.q1)
        q1_greedy_action = 0 if done else self.greedy_action_q(q1, next_state)
        q2_value = 0 if done else q2[next_state][q1_greedy_action]
        delta = self.gamma * q2_value - q1[state][action]
        q1[state][action] += self.alpha * (reward + delta)

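# The double Q-learning update above decouples action selection from action
# evaluation (van Hasselt, 2010). As a worked equation, a sketch of what
# on_new_state does for the branch that updates Q1 (the symmetric branch swaps
# Q1 and Q2, each chosen with probability 0.5):
#
#   Q1(S, A) <- Q1(S, A) + alpha * (R + gamma * Q2(S', argmax_a Q1(S', a)) - Q1(S, A))
#
# Evaluating the argmax of one table with the other reduces the maximisation
# bias that plain Q-learning exhibits on this environment.
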
def generate_episode(env: DoubleQLearningEnv, algorithm: Algorithm):
    """Run a single episode and return the terminal observation."""
    done = False
    obs = env.reset()
    while not done:
        prev_obs = obs
        action = algorithm.action(prev_obs)
        # env.render()
        obs, reward, done, _ = env.step(action)
        # print('Reward:', reward)
        algorithm.on_new_state(prev_obs, action, reward, obs, done)
    return obs


def perform_algorithm_eval(env, algorithm_supplier, n_episodes=300, n_avg=10000):
    """Average, over n_avg independent runs, how often each episode ends in the left terminal state."""
    ret = np.zeros((n_episodes,))
    for i in range(n_avg):
        print('Averaging:', i)
        algorithm = algorithm_supplier(env)
        for ep in range(n_episodes):
            ret[ep] += generate_episode(env, algorithm) == DoubleQLearningEnv.POS_TERM_LEFT
    return ret / n_avg

if __name__ == '__main__':
    env = DoubleQLearningEnv()
    q = perform_algorithm_eval(env, QLearning)
    double_q = perform_algorithm_eval(env, DoubleQLearning)
    data = [go.Scatter(y=q, name='Q-Learning'), go.Scatter(y=double_q, name='Double Q-Learning')]
    py.plot(data)