forked from tairov/QStarLearning.mojo
qlearn.py
import random


class QLearning:
    def __init__(self, num_states, num_actions, learning_rate, discount_factor):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        # Initialize the Q table with zeros: one row per state, one column per action
        self.Q = [[0 for _ in range(num_actions)] for _ in range(num_states)]

    def choose_action(self, state, epsilon):
        # Epsilon-greedy policy: explore with probability epsilon,
        # otherwise exploit the best-known action for this state.
        if random.uniform(0, 1) < epsilon:
            return random.randint(0, self.num_actions - 1)  # Explore
        else:
            return self.max_action(state)  # Exploit (greedy)

    def max_action(self, state):
        # Helper: return the action with the highest Q value for a state
        max_value = max(self.Q[state])
        return self.Q[state].index(max_value)

    def update(self, state, action, reward, next_state):
        # Q-learning update rule:
        # Q(s, a) += lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        max_next_q = max(self.Q[next_state])
        self.Q[state][action] += self.learning_rate * \
            (reward + self.discount_factor * max_next_q - self.Q[state][action])

    def train_astar(self, environment, num_episodes, epsilon_decay):
        for episode in range(num_episodes):
            state = environment.reset()
            done = False
            # Decay exploration as training progresses
            epsilon = 1.0 / (episode + 1) * epsilon_decay
            while not done:
                action = self.choose_action(state, epsilon)
                next_state, reward, done = environment.take_action(state, action)
                self.update(state, action, reward, next_state)
                state = next_state
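

# A minimal usage sketch, not part of the original file. It assumes a toy
# environment exposing the interface train_astar expects:
#   reset() -> initial state
#   take_action(state, action) -> (next_state, reward, done)
# The LineWorld class below is hypothetical and exists only for illustration.
class LineWorld:
    """Agent starts at cell 0 of a 1-D track and must reach the last cell."""

    def __init__(self, length=5):
        self.length = length

    def reset(self):
        return 0

    def take_action(self, state, action):
        # Action 0 moves left, action 1 moves right; reaching the end gives +1,
        # every other step costs a small penalty.
        step = 1 if action == 1 else -1
        next_state = max(0, min(self.length - 1, state + step))
        done = next_state == self.length - 1
        reward = 1.0 if done else -0.01
        return next_state, reward, done


if __name__ == "__main__":
    env = LineWorld(length=5)
    agent = QLearning(num_states=5, num_actions=2,
                      learning_rate=0.1, discount_factor=0.9)
    agent.train_astar(env, num_episodes=200, epsilon_decay=1.0)
    # After training, action 1 (move right) should have the higher Q value
    # in every non-terminal state.
    print(agent.Q)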