# 3_2.py
import numpy as np
import matplotlib.pyplot as plt
import random
from GridWorld_env import *
from nSARSA import *
from Q_Learning import *
from n_step_bt import *
from On_MC import *
from Policy_Iteration import *

SEED = 184
# Trackers for the best agent found so far and its best (outlier-robust) mean reward.
ba_s = [None for _ in range(1)]
br_s = [0 for _ in range(1)]
def mean_without_outliers(data, k=1.5):
    """
    Calculate the mean of the data while ignoring outliers using Tukey's fences.

    Parameters:
    - data (numpy.ndarray or list): Input data.
    - k (float): Tukey's fences constant. Typically set to 1.5.

    Returns:
    - float: Mean without outliers.
    """
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    filtered_data = [x for x in data if lower_bound <= x <= upper_bound]
    return np.mean(filtered_data)
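# Sanity check with hypothetical values (not part of the experiment): for
# data = [1, 2, 3, 100], q1 = 1.75 and q3 = 27.25, so iqr = 25.5 and the
# Tukey fences are [-36.5, 65.5]. The outlier 100 is dropped, and the function
# returns mean([1, 2, 3]) = 2.0, whereas np.mean(data) would give 26.5.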
def get_optimal_policy(q_table):
    # Build a deterministic (one-hot) policy that is greedy w.r.t. the Q-table.
    new_q_table = np.zeros((q_table.shape[0], q_table.shape[1]))
    for i in range(len(q_table)):
        index = np.argmax(q_table[i])
        new_q_table[i][index] = 1
    return new_q_table
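# For example, the greedy policy for a Q-table row [0.1, 0.5, 0.2] is the
# one-hot row [0., 1., 0.]: action 1 is always chosen in that state.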
def draw_curves(values):
    cmap = plt.get_cmap('tab10')
    colors = [cmap(i) for i in np.linspace(0, 1, 10)]
    plt.figure()
    for idx, (vals, label) in enumerate(values):
        mean_value = np.mean(vals, axis=0)
        std_value = np.std(vals, axis=0)
        # 95% confidence interval; the standard error divides by the number of
        # runs (len(vals), the axis the std is taken over), not the number of
        # episodes per run.
        half_width = 1.96 * std_value / np.sqrt(len(vals))
        upper_bound = mean_value + half_width
        lower_bound = mean_value - half_width
        plt.plot(mean_value, label=label, color=colors[idx])
        plt.fill_between(
            np.arange(len(mean_value)),
            lower_bound,
            upper_bound,
            alpha=0.5,
            label=f"Confidence Interval for {label}",
            color=colors[idx]
        )
    plt.legend()
    plt.show()
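# Usage sketch with made-up data (not run here): calling
#     draw_curves([[np.random.rand(10, 50), 'demo']])
# would plot the mean over 10 runs of 50 episodes each, with a shaded 95%
# confidence band of half-width 1.96 * std / sqrt(10) around it.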
gwe = GridWorldEnv(size=6)
np.random.seed(SEED)
random.seed(SEED)
# One curve to plot: the per-episode rewards of each of the 10 agents below.
list1 = [[[], 'reward_per_episode']]
for i in range(10):
    print(f"Agent {i + 1}")
    gwe.reset()
    agent = Policy_Iteration(gwe, 0.9, 0.1)
    reward_in_each_episode, q_table, llen = agent.fit_episode_generating()
    print(reward_in_each_episode)
    list1[0][0].append(reward_in_each_episode)
    # Keep the agent with the best outlier-robust mean reward seen so far.
    if mean_without_outliers(reward_in_each_episode) > br_s[0]:
        br_s[0] = mean_without_outliers(reward_in_each_episode)
        ba_s[0] = agent
for z in range(1):
    reward = []
    ep_len = []
    for _ in range(10):
        gwe.reset()
        gwe.set_render()
        ep = ba_s[z].generate_episode()
        ep_len.append(len(ep))  # collect every episode length, not just the last one
        rew = 0
        for s, a, r in ep:
            rew += r
        reward.append(rew)
    print('I assumed that entering the final state is equal to getting a reward of 1000.')
    print(f'mean reward for agent {z} = {np.mean(reward)}')
    print(f'mean episode length for agent {z} = {np.mean(ep_len)}')
draw_curves(list1)