#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:CatZiyan
# @Time :2019/9/16 15:32
# Q-learning solution to the path-finding problem at http://mnemstudio.org/path-finding-q-learning-tutorial.htm,
# with some modifications to the book's native—Qlearning.py
import numpy as np
import random
import matplotlib.pyplot as plt
q_value = np.zeros((5, 6))  # Q-table: one row per non-terminal state 0-4, one column per action/room 0-5; terminal state 5 needs no row
alpha = 0.9   # learning rate: the larger it is, the more each new sample moves the estimate
gamma = 0.5   # discount factor: the larger it is, the more weight is given to future rewards
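# Q-learning update used below: Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))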
LAYOUT = 221      # first subplot position in a 2x2 grid of Q-table snapshots
Step = 10         # maximum number of steps per episode
Episode = 1000    # total number of episodes
epsilon = 0.1     # exploration rate for the epsilon-greedy policy
# action[s] lists the rooms reachable from room s (taking an action means moving to that room);
# a plain list of lists is used because the rows have different lengths
action = [[4],
          [3, 5],
          [3],
          [1, 2, 4],
          [0, 3, 5],
          [1, 4, 5]]
finish_state = 5  # room 5 is the goal (terminal) state
for ep in range(1, Episode+1):  # an episode ends when the terminal state is reached or the step count exceeds Step
    state = random.randint(0, 4)  # randomly choose the initial state
    state_list = []
    for step in range(1, Step+1):
        state_list.append(state)
        if random.random() < epsilon:  # epsilon-greedy action selection: explore with probability epsilon
            a = random.choice(action[state])
        else:  # otherwise act greedily, breaking ties between equal Q values at random
            next_state_list = [(q_value[state, action[state][i]], action[state][i]) for i in range(len(action[state]))]
            random.shuffle(next_state_list)
            next_state_list.sort(key=lambda x: x[0], reverse=True)
            a = next_state_list[0][1]
        next_state = a  # in this problem, taking action a means moving to room a
        if next_state != finish_state:
            reward = -1
            q_value[state, a] = q_value[state, a] + alpha * (reward + gamma * q_value[next_state].max() - q_value[state, a])
            state = next_state
        else:
            reward = 100
            q_value[state, a] = reward  # terminal transition: set the Q value directly to the terminal reward
            break
    # print(state_list)
    if ep % (Episode//4) == 0:  # show the Q-table after each quarter of training
        print(q_value)
        plt.subplot(LAYOUT)
        plt.imshow(q_value, cmap='gray_r')
        LAYOUT += 1
plt.show()
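
# A minimal sketch (not part of the original script) of how the learned Q-table could be used:
# starting from an assumed room, repeatedly move to the neighbouring room with the highest Q value
# until the terminal room 5 is reached. The names below (start, path) are illustrative only.
start = 2
path = [start]
state = start
while state != finish_state and len(path) <= Step:
    # among the legal moves from this room, pick the one with the largest learned Q value
    state = max(action[state], key=lambda a: q_value[state, a])
    path.append(state)
print('greedy path from room %d: %s' % (start, path))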