mcts.py

# -*- coding: utf-8 -*-
"""MCTS.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sv9UrEZ5DxtDtYpITJ7Hp4gvl7SAZmX7
"""


from collections import defaultdict
import rlcard
import numpy as np
import math

from tqdm import tqdm

# Import packages for overrides
from rlcard import models
from rlcard.agents import LeducholdemHumanAgent as HumanAgent
from rlcard.utils import print_card
from rlcard.games.leducholdem import Dealer
from rlcard.games.leducholdem import Player
from rlcard.games.leducholdem import Judger
from rlcard.games.leducholdem import Round
from rlcard.games.limitholdem import Game
from rlcard.games.base import Card
import random
from run import run

import types

from overrides import reset, init_game, run


trajectory = []
current_player = None


class TreeSearch:
    # Initialize environment, current player, trajectory, etc
    def __init__(self, env, state_nodes, player_id, first_player):
        self.env = env
        self.state_nodes = state_nodes
        self.player_id = player_id
        self.use_raw = False
        self.trajectory = []
        self.vector_to_string_S = {0: "SJ", 1: "SQ", 2: "SK"}
        self.vector_to_string = {0: "J", 1: "Q", 2: "K"}
        self.string_to_vector = {"J": (1, 0, 0), "Q": (0, 1, 0), "K": (0, 0, 1)}
        self.first_player = first_player

    # Run one step
    def eval_step(self, state):
        # Get full current info of game
        obs = self.env.get_perfect_information()

        # make sure that the hand card and number of chips has our agent's values first
        if self.player_id == 1:
            obs["chips"] = [obs["chips"][1], obs["chips"][0]]
            obs["hand_cards"] = (obs["hand_cards"][1][1], obs["hand_cards"][0][1])
        else:
            obs["chips"] = [obs["chips"][0], obs["chips"][1]]
            obs["hand_cards"] = (obs["hand_cards"][0][1], obs["hand_cards"][1][1])

        # Set public card
        if obs["public_card"] != None:
            obs["public_card"] = obs["public_card"][1]

        # Initialize necessary state structure
        state = (
            tuple(obs["chips"]),
            obs["public_card"],
            tuple(obs["hand_cards"]),
            obs["current_round"],
        )

        legal_actions = obs["legal_actions"]

        # Query card from state
        own_card = self.string_to_vector[obs["hand_cards"][0]]
        opponent_card = self.string_to_vector[obs["hand_cards"][1]]

        chips = obs["chips"]
        public_card = obs["public_card"]

        action_UCB = []
        new_round = None

        new_chips = [0, 0]
        current_round = obs["current_round"]

        # Iterate through actions to retrieve UCB scores
        for action in legal_actions:
            # Sample a public card
            weights = self.probs(own_card, opponent_card)
            new_public_card = np.random.choice(["J", "Q", "K"], p=weights)

            if action == "call":
                # Set new_round = True based on state conditions
                if chips[0] != 1 and obs["current_round"] == 0:
                    new_round = True

                new_chips[0] = chips[1]
                if new_round == True:
                    # Reset state for opponent
                    next_state = (
                        (chips[1], new_chips[0]),
                        new_public_card,
                        (obs["hand_cards"][1], obs["hand_cards"][0]),
                        current_round + 1,
                    )

                    # Append UCB for action
                    if self.state_nodes[next_state][2] != float("inf"):
                        action_UCB.append(-self.state_nodes[next_state][2])

                    else:
                        action_UCB.append(self.state_nodes[next_state][2])

                else:
                    # Reset state for opponent
                    next_state = (
                        (chips[1], new_chips[0]),
                        public_card,
                        (obs["hand_cards"][1], obs["hand_cards"][0]),
                        current_round,
                    )

                    # Append UCB for action
                    if self.state_nodes[next_state][2] != float("inf"):
                        action_UCB.append(-self.state_nodes[next_state][2])

                    else:
                        action_UCB.append(self.state_nodes[next_state][2])

            elif action == "check":
                if current_round == 0:
                    new_round = True
                # Reset state for opponent
                if new_round == True:
                    next_state = (
                        (chips[1], chips[0]),
                        new_public_card,
                        (obs["hand_cards"][1], obs["hand_cards"][0]),
                        current_round + 1,
                    )

                    # Append UCB for action
                    if self.state_nodes[next_state][2] != float("inf"):
                        action_UCB.append(-self.state_nodes[next_state][2])

                    else:
                        action_UCB.append(self.state_nodes[next_state][2])

                # Reset state for opponent
                else:
                    next_state = (
                        (chips[1], chips[0]),
                        public_card,
                        (obs["hand_cards"][1], obs["hand_cards"][0]),
                        current_round,
                    )

                    # Append UCB for action
                    if self.state_nodes[next_state][2] != float("inf"):
                        action_UCB.append(-self.state_nodes[next_state][2])

                    else:
                        action_UCB.append(self.state_nodes[next_state][2])

            elif action == "raise":
                # Reset chips for raise
                if current_round == 0:
                    new_chips[0] = (
                        max([chips[0], chips[1]]) + self.env.game.round.raise_amount
                    )
                else:
                    new_chips[0] = (
                        max([chips[0], chips[1]]) + self.env.game.round.raise_amount
                    )

                # Reset state for opponent
                next_state = (
                    (chips[1], new_chips[0]),
                    public_card,
                    (obs["hand_cards"][1], obs["hand_cards"][0]),
                    current_round,
                )

                # Append UCB for action
                if self.state_nodes[next_state][2] != float("inf"):
                    action_UCB.append(-self.state_nodes[next_state][2])

                else:
                    action_UCB.append(self.state_nodes[next_state][2])

            elif action == "fold":
                action_UCB.append(float("-inf"))

            else:
                raise Exception("Illegal action")

        # Retrieve best action from UCB scores
        take_action_index = np.argmax(action_UCB)
        take_action = legal_actions[take_action_index]

        reverse_action_mapping = {"call": 0, "raise": 1, "fold": 2, "check": 3}
        actual_action = reverse_action_mapping[take_action]

        # before running this, we need a global trajectory
        global trajectory

        # update trajectory
        trajectory.append([state, self.player_id])

        info = {take_action}

        # Set global current player
        global current_player
        current_player = self.player_id

        # Return best action
        return actual_action, info

    # same as eval_step
    def step(self, state):
        take_action_index, info = self.eval_step(self, state)
        return take_action_index

    def return_trajectory(self):
        return self.trajectory

    # Find opponent card probabilities
    def probs(self, state, opponent=[0, 0, 0]):
        probs = np.array([2, 2, 2])
        state = np.array(state)

        opponent = np.array(opponent)

        probs = probs - state[0:3] - opponent
        probs = probs / sum(probs)

        return list(probs)


#################################################################################
class MCTS:
    # Initialize environment, rollouts, current player, trajectory history
    def __init__(self, env, num_rollouts, player_id, model_path="./model"):
        self.env = env
        self.model_path = model_path
        self.num_rollouts = num_rollouts
        self.player_id = player_id
        self.use_raw = False

        self.state_nodes = defaultdict(lambda: (0, 0, float("inf")))
        self.vector_to_string_S = {0: "SJ", 1: "SQ", 2: "SK"}
        self.vector_to_string = {0: "J", 1: "Q", 2: "K"}
        self.string_to_vector = {"J": (1, 0, 0), "Q": (0, 1, 0), "K": (0, 0, 1)}

    # updates values at state_nodes from a trajectory
    def update_nodes(self, payoff):
        global trajectory

        for i in range(0, len(trajectory)):
            # Process trajectory
            state = trajectory[i][0]
            current_player = trajectory[i][1]

            total_return, total_visits, UCB = self.state_nodes[state]

            if current_player == self.player_id:
                total_return += payoff
            else:
                total_return += -payoff

            total_visits += 1

            prev_state = trajectory[i - 1][0]
            _, prev_visits, _ = self.state_nodes[prev_state]

            # if not the root node, update the UCB score
            if i != 0:
                # Calculate UCB score
                UCB = (
                    ((total_return + 7 * total_visits) / (14 * total_visits))
                    / total_visits
                ) + math.sqrt(np.log(prev_visits) / total_visits)

            # Update dictionairy
            self.state_nodes[state] = (total_return, total_visits, UCB)

    # given current state, take the next action (running through the game tree)
    def step(self, state):
        # get current state
        obs = self.env.get_state(self.player_id)
        obs = obs["obs"]

        # Set public card
        if max(obs[3:6]) == 0:
            public_card = None
        else:
            public_card = self.vector_to_string_S[np.argmax(obs[3:6])]

        if max(obs[0:3]) == 0:
            raise Exception("We don't have a card?")
        else:
            hand = self.vector_to_string_S[np.argmax(obs[0:3])]

        # Set chip counts
        chips = [0, 0]
        chips[0] = np.argmax(obs[6:21])
        chips[1] = np.argmax(obs[21:36])

        have_raised = self.env.game.round.have_raised
        not_raise_num = self.env.game.round.not_raise_num

        raise_amount = self.env.game.round.raise_amount
        allowed_raise_num = self.env.game.round.allowed_raise_num

        # Initialize state
        state = {
            "current_player": self.player_id,
            "public_card": public_card,
            "hand": hand,
            "all_chips": chips,
            "have_raised": have_raised,
            "not_raise_num": not_raise_num,
            "raise_amount": raise_amount,
            "allowed_raise_num": allowed_raise_num,
        }

        # Run rollouts to collect history
        for i in tqdm(range(self.num_rollouts)):
            # create env initialized to the given start state (randomize the card for the opponent player)
            initialized_env = rlcard.make("leduc-holdem")
            initialized_env.run = types.MethodType(run, initialized_env)
            initialized_env.reset = types.MethodType(reset, initialized_env)
            initialized_env.game.init_game = types.MethodType(
                init_game, initialized_env.game
            )

            # Set first player
            first_player = self.env.game.init_player == self.player_id

            my_player = TreeSearch(
                initialized_env, self.state_nodes, self.player_id, first_player
            )
            opponent = TreeSearch(
                initialized_env, self.state_nodes, 1 - self.player_id, not first_player
            )

            # create the environment such that we have the correct player_id
            if self.player_id == 0:
                initialized_env.set_agents([my_player, opponent])
            else:
                initialized_env.set_agents([opponent, my_player])

            # Run a single rollout
            trajectories, payoffs = initialized_env.run(is_training=False, state=state)

            temp = trajectories[self.player_id][-1]

            action_record = temp["action_record"]

            global trajectory
            global current_player

            final_state_obs = trajectories[1 - current_player][-1]["raw_obs"]

            my_card = final_state_obs["hand"][1]

            other_player_last_traj = trajectories[current_player][-1]["raw_obs"]
            other_player_card = other_player_last_traj["hand"][1]

            # Check if last action wasn't fold
            if action_record[-1][1] != "fold":
                my_tuple = (
                    tuple(final_state_obs["all_chips"]),
                    final_state_obs["public_card"][1],
                    (my_card, other_player_card),
                    1,
                )  # setting current round to 1 since it will always be 1 if the last action isn't fold

                trajectory.append([my_tuple, 1 - action_record[-1][0]])

            # Update history
            self.update_nodes(payoffs[self.player_id])

            # reset the trajectory after each rollout because we don't want to keep appending to the same trajectory
            trajectory = []

        # after finishing rollouts, decide what action to take (same as eval_step)
        final_action_index, info = self.eval_step(state)
        return final_action_index

    # take the next action, but do not do rollouts or update any nodes
    def eval_step(self, state):
        global current_player

        obs = self.env.get_state(self.player_id)
        temp = obs["legal_actions"]
        obs = obs["obs"]

        # temp2 = []
        legal_actions = []

        for key in temp:
            legal_actions.append(key)

        # Generate opponent card
        weights = self.probs(obs)
        opponent_card = np.random.choice(["J", "Q", "K"], p=weights)

        # figure out your card
        if max(obs[0:3]) == 0:
            raise Exception("We don't have a card?")
        else:
            own_card = self.vector_to_string[np.argmax(obs[0:3])]

        # figure out the public card
        if max(obs[3:6]) == 0:
            public_card = None
        else:
            public_card = self.vector_to_string[np.argmax(obs[3:6])]

        # figure out the current round
        if public_card == None:
            current_round = 0
        else:
            current_round = 1

        # figure out the number of chips in the pot
        chips = [0, 0]
        chips[0] = np.argmax(obs[6:21])
        chips[1] = np.argmax(obs[21:36])

        # maximize over average return of taking any action at root node
        new_round = False
        win_rates = []

        new_chips = [0, 0]

        # Cycle through legal actions to get average reward
        for action_number in legal_actions:
            action_mapping = {0: "call", 1: "raise", 2: "fold", 3: "check"}

            action = action_mapping[action_number]

            weights = self.probs(obs, self.string_to_vector[opponent_card])
            new_public_card = np.random.choice(["J", "Q", "K"], p=weights)

            if action == "call":
                if chips[0] != 1 and current_round == 0:
                    new_round = True

                new_chips[0] = chips[1]

                # Reset opponent state
                if new_round == True:
                    next_state = (
                        (chips[1], new_chips[0]),
                        new_public_card,
                        (opponent_card, own_card),
                        current_round + 1,
                    )

                    # Check if no history of state
                    try:
                        win_rates.append(
                            -self.state_nodes[next_state][0]
                            / self.state_nodes[next_state][1]
                        )
                    except ZeroDivisionError:
                        win_rates.append(float("-inf"))
                else:
                    next_state = (
                        (chips[1], new_chips[0]),
                        public_card,
                        (opponent_card, own_card),
                        current_round,
                    )

                    # Check if no history of state
                    try:
                        win_rates.append(
                            -self.state_nodes[next_state][0]
                            / self.state_nodes[next_state][1]
                        )
                    except ZeroDivisionError:
                        win_rates.append(float("-inf"))

            elif action == "check":
                if current_round == 0:
                    new_round = True
                # Reset opponent state
                if new_round == True:
                    next_state = (
                        (chips[1], chips[0]),
                        new_public_card,
                        (opponent_card, own_card),
                        current_round + 1,
                    )
                else:
                    next_state = (
                        (chips[1], chips[0]),
                        public_card,
                        (opponent_card, own_card),
                        current_round,
                    )

                # Check if no history of state
                try:
                    win_rates.append(
                        -self.state_nodes[next_state][0]
                        / self.state_nodes[next_state][1]
                    )
                except ZeroDivisionError:
                    win_rates.append(float("-inf"))

            elif action == "raise":
                # Reset chips due to raise
                if current_round == 0:
                    new_chips[0] = (
                        max([chips[0], chips[1]]) + self.env.game.round.raise_amount
                    )
                else:
                    new_chips[0] = (
                        max([chips[0], chips[1]]) + self.env.game.round.raise_amount
                    )

                # Reset opponent state
                next_state = (
                    (chips[1], new_chips[0]),
                    public_card,
                    (opponent_card, own_card),
                    current_round,
                )

                # Check if no history of state
                try:
                    win_rates.append(
                        -self.state_nodes[next_state][0]
                        / self.state_nodes[next_state][1]
                    )
                except ZeroDivisionError:
                    win_rates.append(float("-inf"))

            elif action == "fold":
                win_rates.append(-chips[0] / 2)

            else:
                raise Exception("Illegal action")

        final_action_index = np.argmax(win_rates)
        final_action = legal_actions[final_action_index]

        info = {}

        return final_action, info

    # Find probabilities of opponent card
    def probs(self, state, opponent=[0, 0, 0]):
        probs = np.array([2, 2, 2])
        probs = probs - state[0:3] - state[3:6] - opponent
        probs = probs / sum(probs)

        return probs