forked from tairov/QStarLearning.mojo
qlearn.py
import random


class QLearning:
    def __init__(self, num_states, num_actions, learning_rate, discount_factor):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        # Initialize the Q table with zeros: one row per state, one column per action
        self.Q = [[0 for _ in range(num_actions)] for _ in range(num_states)]

    def choose_action(self, state, epsilon):
        # Epsilon-greedy policy: explore with probability epsilon,
        # otherwise exploit the best-known action for this state.
        if random.uniform(0, 1) < epsilon:
            return random.randint(0, self.num_actions - 1)  # Explore
        else:
            return self.max_action(state)  # Exploit (greedy)

    def max_action(self, state):
        # Helper: return the action with the highest Q value for a state
        max_value = max(self.Q[state])
        return self.Q[state].index(max_value)

    def update(self, state, action, reward, next_state):
        # Q-learning update rule:
        # Q(s, a) += lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        max_next_q = max(self.Q[next_state])
        self.Q[state][action] += self.learning_rate * \
            (reward + self.discount_factor * max_next_q - self.Q[state][action])

    def train_astar(self, environment, num_episodes, epsilon_decay):
        for episode in range(num_episodes):
            state = environment.reset()
            done = False
            # Decay exploration as training progresses
            epsilon = 1.0 / (episode + 1) * epsilon_decay
            while not done:
                action = self.choose_action(state, epsilon)
                next_state, reward, done = environment.take_action(state, action)
                self.update(state, action, reward, next_state)
                state = next_state
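

# A minimal usage sketch, not part of the original file. It assumes a toy
# environment exposing the interface train_astar expects:
#   reset() -> initial state
#   take_action(state, action) -> (next_state, reward, done)
# The LineWorld class below is hypothetical and exists only for illustration.
class LineWorld:
    """Agent starts at cell 0 of a 1-D track and must reach the last cell."""

    def __init__(self, length=5):
        self.length = length

    def reset(self):
        return 0

    def take_action(self, state, action):
        # Action 0 moves left, action 1 moves right; reaching the end gives +1,
        # every other step costs a small penalty.
        step = 1 if action == 1 else -1
        next_state = max(0, min(self.length - 1, state + step))
        done = next_state == self.length - 1
        reward = 1.0 if done else -0.01
        return next_state, reward, done


if __name__ == "__main__":
    env = LineWorld(length=5)
    agent = QLearning(num_states=5, num_actions=2,
                      learning_rate=0.1, discount_factor=0.9)
    agent.train_astar(env, num_episodes=200, epsilon_decay=1.0)
    # After training, action 1 (move right) should have the higher Q value
    # in every non-terminal state.
    print(agent.Q)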