AbstractUCRL.py
import sys
import numpy as np


class AbstractUCRL:
    def __init__( self, MDP, RCB, TCB, EVI ):
        self.MDP = MDP  # The MDP to be solved
        self.RCB = RCB  # List of confidence bounds on the reward (objects exposing a confidencebound method)
        self.TCB = TCB  # List of confidence bounds on the transition kernel (same interface)
        self.EVI = EVI  # The version of Extended Value Iteration

    # compute upper confidence bounds on reward
    def rewardbound( self, S, r_hat, Nplus, N, delta, t ):
        # intersect the upper bounds produced by every reward confidence bound
        r_upper = sys.maxsize * np.ones( r_hat.shape )
        for CB in self.RCB:
            LB, UB = CB.confidencebound( S, r_hat, Nplus, N, delta, t )
            r_upper = np.minimum( r_upper, UB )
        return r_upper

    # compute lower and upper confidence bounds on transition kernel
    def transitionbound( self, S, p_hat, Nplus, N, delta, t ):
        # intersect the confidence regions produced by every transition bound
        p_lower = np.zeros( p_hat.shape )
        p_upper = np.ones ( p_hat.shape )
        for CB in self.TCB:
            LB, UB = CB.confidencebound( S, p_hat, Nplus, N, delta, t )
            p_lower = np.maximum( p_lower, LB )
            p_upper = np.minimum( p_upper, UB )
        return p_lower, p_upper

    # recompute the optimistic policy (implemented by concrete subclasses)
    def updatepolicy( self, delta, t ):
        pass

    # update empirical estimates after one transition and return True when the
    # policy should be recomputed (implemented by concrete subclasses)
    def updateparams( self, s, a, r, sp ):
        pass

    # run the agent for time_horizon steps and record cumulative regret every ival steps
    def runUCRL( self, delta, time_horizon, ival, g_opt ):
        compute_pi = True
        regret = [0] * ( time_horizon // ival )
        tmp_regret = 0
        s_t = self.MDP.resetstate()
        for t in range( time_horizon ):
            if compute_pi:
                pi = self.updatepolicy( delta, t + 1 )
                #print('{}: policy {}'.format(t, pi))
            s_next, r_t = self.MDP.act ( s_t, pi[s_t] )
            compute_pi = self.updateparams( s_t, pi[s_t], r_t, s_next )
            s_t = s_next
            # accumulate per-step regret against the optimal average reward g_opt
            tmp_regret = tmp_regret + ( g_opt - np.mean( r_t ) )
            if t % ival == 0:
                regret[t // ival] = tmp_regret
        return regret
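

# ---------------------------------------------------------------------------
# Minimal usage sketch. AbstractUCRL assumes each entry of RCB / TCB is an
# object exposing confidencebound(S, estimate, Nplus, N, delta, t) and
# returning a (lower, upper) pair of arrays, while updatepolicy / updateparams
# are meant to be overridden by a concrete subclass. HoeffdingRewardBound
# below is a hypothetical illustration of that interface only; it is not a
# class defined elsewhere in this repository.

class HoeffdingRewardBound:
    """Hoeffding-style interval for mean rewards assumed to lie in [0, 1]."""
    def confidencebound( self, S, r_hat, Nplus, N, delta, t ):
        radius = np.sqrt( np.log( 2.0 * S * t / delta ) / ( 2.0 * Nplus ) )
        return np.maximum( r_hat - radius, 0.0 ), np.minimum( r_hat + radius, 1.0 )


if __name__ == '__main__':
    # Sketch of how rewardbound intersects the bounds in RCB; rewardbound does
    # not touch MDP, TCB or EVI, so placeholders are enough for this demo.
    agent = AbstractUCRL( MDP=None, RCB=[ HoeffdingRewardBound() ], TCB=[], EVI=None )
    r_hat = np.array( [0.4, 0.7] )    # empirical mean reward per state-action pair
    Nplus = np.array( [10.0, 25.0] )  # visit counts, clipped away from zero
    print( agent.rewardbound( S=2, r_hat=r_hat, Nplus=Nplus, N=None, delta=0.05, t=100 ) )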