REINFORCE.py
import numpy as np

class REINFORCE:
    def __init__(self, feature_space, action_space, alpha=0.0001, gamma=0.99):
        self.alpha = alpha
        self.gamma = gamma
        self.feature_space = feature_space
        self.action_space = action_space
        self.reset_weights()
    def step(self, obs):
        '''
        Picks an action for the given observation, sampling from the
        action probabilities produced by the current policy.
        '''
        # Get the probability of each action
        probs = self._policy(obs)
        # Sample an action according to those probabilities
        action = np.random.choice(self.action_space, p=probs)
        return action
    def _policy(self, obs):
        '''
        Probabilities are calculated using a softmax over the action
        preferences, so for each action a:
        -> e^{h(s,a,theta)} / sum_b{e^{h(s,b,theta)}}
        Returns the probability of each action being taken.
        '''
        prefs = np.zeros(self.action_space)
        for a in range(self.action_space):
            prefs[a] = self._h(obs, a)
        # Subtract the max preference before exponentiating for numerical stability
        probs = np.exp(prefs - np.max(prefs))
        return probs / np.sum(probs)
    def _gradient(self, obs, action):
        '''
        Gradient of ln π(a|s, theta) for the linear softmax policy,
        given the observation and the action taken:
        -> x(s,a) - sum_b{π(b|s, theta) * x(s,b)}
        '''
        grads = np.zeros_like(self.theta)
        probs = self._policy(obs)
        for b in range(self.action_space):
            # accumulate sum_b{x(s,b) * π(b|s, theta)}
            grads += self._x(obs, b) * probs[b]
        # x(s,a) minus the expected feature vector under the policy
        return self._x(obs, action) - grads
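    # Derivation note for the formula above: with the linear softmax
    # parameterisation used here,
    #   ln π(a|s, theta) = h(s, a, theta) - ln sum_b e^{h(s, b, theta)}
    # and h(s, a, theta) = theta^T x(s, a), so differentiating with respect
    # to theta gives
    #   ∇ ln π(a|s, theta) = x(s, a) - sum_b π(b|s, theta) x(s, b)
    # which is exactly the eligibility vector that _gradient returns.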
    def _h(self, obs, action):
        '''
        h(s, a, theta) from the book. Here it is just a linear
        function, so it's theta^T . x(s,a).
        '''
        return self.theta.T.dot(self._x(obs, action))
    def _x(self, obs, action):
        '''
        x(s,a). The state-action representation is created here. The same state
        representation is used for each action, but only the block belonging to
        the selected action is filled in; the other blocks stay zero (since we
        are using one-hot vectors for the representations).
        -> Returns the representation created, e.g. [0 0 0 0 0 1 0 0 0 0 0 0]
        '''
        one_hot = np.zeros_like(self.theta)
        # Copy the observation into the block reserved for this action
        start = action * self.feature_space
        one_hot[start:start + self.feature_space] = obs
        return one_hot
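    # Illustrative example (values assumed for this comment only): with
    # feature_space = 3, action_space = 2 and obs = [0, 1, 0],
    #   _x(obs, 0) -> [0, 1, 0, 0, 0, 0]   (first block of 3 entries)
    #   _x(obs, 1) -> [0, 0, 0, 0, 1, 0]   (second block of 3 entries)
    # i.e. the observation is copied into the block of the chosen action and
    # every other block stays zero.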
    def update(self, observations, actions, rewards):
        '''
        Since REINFORCE is a Monte Carlo method, no incremental update is
        made here; the weights are only updated once the terminal state
        is reached, in end().
        '''
        pass
    def end(self, observations, actions, rewards):
        '''
        Updates the weights given the observations, actions and rewards of a
        finished episode. For every step t of the episode the update is:
        -> theta = theta + alpha * G_t * ∇ln π(a_t|s_t, theta)
        where ∇ln π is the gradient computed in _gradient
        and G_t is the discounted return from step t onwards.
        '''
        for i in range(len(observations)):
            # G_t = sum_k gamma^k * r_{t+k}
            G = sum([r * (self.gamma ** t) for t, r in enumerate(rewards[i:])])
            self.theta += self.alpha * self._gradient(observations[i], actions[i]) * G
    def reset_weights(self):
        '''
        Resets the weights.
        theta is a flat vector of length d * a, where a is the action space
        and d is the feature space
        - think of it as one block of d weights per action.
        '''
        self.theta = np.random.rand(self.feature_space * self.action_space)
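# --- Usage sketch ---
# A minimal, assumed example of how the agent above is meant to be driven:
# collect an episode with step(), call update() every step (a no-op here) and
# call end() once the episode terminates. _DummyEnv is a hypothetical stand-in
# with a Gym-style reset()/step() interface, used only so the sketch is
# self-contained; it is not part of the original agent, and the sizes, horizon
# and episode count below are placeholders.
class _DummyEnv:
    def __init__(self, feature_space, horizon=20):
        self.feature_space = feature_space
        self.horizon = horizon
        self.t = 0

    def reset(self):
        self.t = 0
        return np.random.rand(self.feature_space)

    def step(self, action):
        self.t += 1
        obs = np.random.rand(self.feature_space)
        reward = 1.0                     # placeholder reward
        done = self.t >= self.horizon
        return obs, reward, done, {}


if __name__ == "__main__":
    feature_space, action_space = 4, 2   # assumed sizes
    env = _DummyEnv(feature_space)
    agent = REINFORCE(feature_space, action_space)

    for episode in range(10):
        observations, actions, rewards = [], [], []
        obs = env.reset()
        done = False
        while not done:
            action = agent.step(obs)
            next_obs, reward, done, _ = env.step(action)
            observations.append(obs)
            actions.append(action)
            rewards.append(reward)
            agent.update(observations, actions, rewards)   # no-op until episode end
            obs = next_obs
        agent.end(observations, actions, rewards)          # Monte Carlo update
        print("episode", episode, "return", sum(rewards))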