diff --git a/FSP.py b/FSP.py index bc22f55..a1e99c1 100644 --- a/FSP.py +++ b/FSP.py @@ -53,6 +53,7 @@ def run_algo(self): tic = time.perf_counter() exploit_learner = learners.actor_critic(learners.softmax, learners.value_advantage, \ self.game.num_actions[0], self.game.num_states[0]) + solver = learners.kuhn_exact_solver() for j in range(1,self.max_iters): # start from 1 or 2? eta_j = 1/j #eta_j = 1/2 @@ -77,13 +78,18 @@ def run_algo(self): #import pdb; pdb.set_trace() if j%self.est_exploit_freq == 0: - exploit_calced_br, br_pols, _, values = calc_exploitability(new_pi, self.game, exploit_learner) + true_exploit, true_br_pols, _, _ = calc_exploitability(new_pi, self.game, solver,\ + num_iters = -1, num_exploit_iters=-1) + exploit_calced_br, br_pols, _, values = calc_exploitability(new_pi, self.game, exploit_learner,\ + num_iters = -1, num_exploit_iters=-1) exploit = self.est_exploitability(new_pi, new_beta) - #import pdb; pdb.set_trace() # compare br_pols with beta - log.info("exploitability: " + str(exploit_calced_br)) + for p in range(self.num_players): + log.debug("p" + str(p+1) + " br: " + str(br_pols[p])) + log.info("exact exploitability: " + str(true_exploit)) + log.info("exploitability with learned br: " + str(exploit_calced_br)) log.info("exploitability using beta: " + str(exploit)) - exploitability.append(exploit) + exploitability.append(true_exploit) toc = time.perf_counter() if toc-tic > self.max_time: break diff --git a/agents/learners.py b/agents/learners.py index bd7d3d2..563aa18 100644 --- a/agents/learners.py +++ b/agents/learners.py @@ -104,11 +104,12 @@ def reset(self): class actor_critic(RL_base): - def __init__(self, pol_func, advantage_func, num_actions, num_states, init_adv = 0, extra_samples=10, init_lr=0.05, df=1.0): + def __init__(self, pol_func, advantage_func, num_actions, num_states, init_adv = 0, extra_samples=10, init_lr=0.05, df=1.0, tol=999): self.pol_func = pol_func(num_states, num_actions) self.advantage_func = advantage_func(init_adv, num_states, num_actions, df) self.opt_pol = self.pol_func.policy self.memory = [] + self.tol=tol super().__init__(extra_samples, init_lr, df) def reset(self): @@ -116,20 +117,24 @@ def reset(self): self.advantage_func.reset() def learn(self): + self.iteration += 1 lr = self.init_lr/(1+0.003*np.sqrt(self.iteration)) - RL_buff = random.sample(self.memory, min(self.extra_samples, len(self.memory))) - RL_buff += self.memory[-min(self.last_round, len(self.memory)):] - - for elem in RL_buff: - grad_log_theta = self.pol_func.grad_log(elem["s"], elem["a"]) - advantage = self.advantage_func.eval(elem["s"], elem["a"], elem["r"], elem["s'"]) - theta_update = grad_log_theta*advantage - a_prime = np.argmax(np.random.multinomial(1, pvals=self.opt_pol[elem["s'"]])) - delta = self.advantage_func.calc_delta(elem["s"], elem["a"], elem["r"], elem["s'"], a_prime) - self.advantage_func.update(lr*delta, elem["s"], elem["a"]) - self.pol_func.thetas += lr*theta_update - self.opt_pol = self.pol_func.update() + prev_pol = np.copy(self.opt_pol) - self.tol - 1 + while np.linalg.norm(prev_pol - self.opt_pol) > self.tol: + prev_pol = np.copy(self.opt_pol) + RL_buff = random.sample(self.memory, min(self.extra_samples, len(self.memory))) + RL_buff += self.memory[-min(self.last_round, len(self.memory)):] + + for elem in RL_buff: + grad_log_theta = self.pol_func.grad_log(elem["s"], elem["a"]) + advantage = self.advantage_func.eval(elem["s"], elem["a"], elem["r"], elem["s'"]) + theta_update = grad_log_theta*advantage + a_prime = 
np.argmax(np.random.multinomial(1, pvals=self.opt_pol[elem["s'"]])) + delta = self.advantage_func.calc_delta(elem["s"], elem["a"], elem["r"], elem["s'"], a_prime) + self.advantage_func.update(lr*delta, elem["s"], elem["a"]) + self.pol_func.thetas += lr*theta_update + self.opt_pol = self.pol_func.update() return self.opt_pol class pol_func_base: @@ -286,6 +291,7 @@ def update(self, update, s, _): def reset(self): self.V = np.ones_like(self.V) * self.init_adv + class complete_learner: def __init__(self, RL, SL, num_loops=1): @@ -305,3 +311,102 @@ def learn(self): self.beta = self.RL_learner.learn() self.pi = self.SL_learner.learn() return self.beta, self.pi + +class kuhn_exact_solver: + + def calc_opt(self, opp_pol, p_id): + opt_pol = np.zeros((6,2)) + if p_id == 1: + opt_pol[3,1] = 1 # always fold with a 1 if raised + opt_pol[5,0] = 1 # always call with a 3 if raised + + p_bet_given_card = opp_pol[0:3,0]/2 + p_bet = np.sum(p_bet_given_card) - p_bet_given_card[1] + p_cards_given_bet = p_bet_given_card/p_bet + p_cards_given_bet[1] = 0 + if p_cards_given_bet[0] > 0.25: + opt_pol[4,0] = 1 + elif p_cards_given_bet[0] < 0.25: + opt_pol[4,1] = 1 + else: + opt_pol[4,0] = 1/3 + opt_pol[4,1] = 2/3 + + for i in range(3): + bet_r = 0 + check_r = 0 + for j in range(3): + if i != j: + bet_r += 0.5*opp_pol[j+3,1] + if j < i: + bet_r += 0.5*opp_pol[j+3,0]*2 + check_r += 0.5*opp_pol[j,1]*1 + check_r += 0.5*opp_pol[j,0]*(opt_pol[i+3,0]*2-opt_pol[i+3,1]) + else: + bet_r += 0.5*opp_pol[j+3,0]*(-2) + check_r += 0.5*opp_pol[j,1]*(-1) + check_r += 0.5*opp_pol[j,0]*((-opt_pol[i+3,0]*2)-opt_pol[i+3,1]) + if bet_r > check_r: + opt_pol[i,0] = 1 + elif check_r > bet_r: + opt_pol[i,1] = 1 + else: + if i == 0: + opt_pol[i,0] = 1/3 + opt_pol[i,1] = 2/3 + if i == 1: + opt_pol[i,0] = 1 + if i == 2: + opt_pol[i,0] = 1 + else: + opt_pol[3,1] = 1 + opt_pol[5,0] = 1 + opt_pol[2,0] = 1 + + p_act_given_card_times_p_card = (opp_pol[0:3,:]/2) + belief = np.zeros((6,3)) + for i in range(3): + p_act = np.sum(p_act_given_card_times_p_card, axis = 0) - p_act_given_card_times_p_card[i,:] + p_cards_given_act = p_act_given_card_times_p_card/p_act + rem_state = p_cards_given_act + rem_state[i,:] = 0 + p_cards_given_state = rem_state.T + belief[3+i, :] = p_cards_given_state[0,:] + belief[i, :] = p_cards_given_state[1,:] + if belief[4,0] < 0.25: + opt_pol[4,1] = 1 + elif belief[4,0] > 0.25: + opt_pol[4,0] = 1 + else: + opt_pol[4,0] = 1/3 # doesn't actually matter what this is + opt_pol[4,1] = 2/3 # doesn't actually matter what this is + + belief = belief[0:3] # discard the 2nd half of belief since we're done with it + opp_pol = opp_pol[3:] # same for policy + check_rewards = [-1, belief[1,0]-belief[1,2], 1] + bet_rewards = [] + for i in range(2): + bet_r = 0 + for j in range(3): + if i != j: + bet_r += belief[i,j] * opp_pol[j,1] + if j < i: + bet_r += belief[i, j]*opp_pol[j, 0]*2 + else: + bet_r += belief[i, j]*opp_pol[j,0]*(-2) + bet_rewards.append(bet_r) + if bet_rewards[0] > check_rewards[0]: + opt_pol[0,0] = 1 + elif bet_rewards[0] < check_rewards[0]: + opt_pol[0, 1] = 1 + else: + opt_pol[0,0] = 1/3 + opt_pol[0,1] = 2/3 + if bet_rewards[1] > check_rewards[1]: + opt_pol[1,0] = 1 + elif bet_rewards[1] < check_rewards[1]: + opt_pol[1, 1] = 1 + else: + opt_pol[1,1] = 1 + + return opt_pol diff --git a/agents/players.py b/agents/players.py index d4a76a3..2386482 100644 --- a/agents/players.py +++ b/agents/players.py @@ -98,7 +98,7 @@ def __init__(self, opt_pol): def reset(self): pass - def observe(self, observation): + def observe(self, 
observation, fict=False): self.state = observation[0] self.r = observation[1] @@ -126,9 +126,15 @@ def observe(self, observation, fict=False): if not fict: if self.state != -1: belief_probs = self.belief[self.state, :] - belief_state = np.argmax(np.random.multinomial(1, pvals=belief_probs)) #Here we do OBL - self.fict_game.set_state(self.state, belief_state, self.id) + res = -1 + while res != 0: + belief_state = np.argmax(np.random.multinomial(1, pvals=belief_probs)) + res = self.fict_game.set_state(self.state, belief_state, self.id) + if res == -1: + false_prob = belief_probs[belief_state] + belief_probs[:] += false_prob/(belief_probs.size-1) + belief_probs[belief_state] = 0 # set prob to 0 if it was an impossible state act = self.action() #if self.state == 0: # import pdb; pdb.set_trace() diff --git a/functions.py b/functions.py index eadef80..10a30cd 100644 --- a/functions.py +++ b/functions.py @@ -1,4 +1,5 @@ from agents.players import RL, fixed_pol +from agents import learners import numpy as np def play_game(players, game): @@ -16,54 +17,90 @@ def play_game(players, game): player.wipe_mem() return reward -def calc_exploitability(pol, game, learner, num_iters=100000, num_exploit_iters = 1000): - players = [RL(learner,0), fixed_pol(pol[1])] - - reward_hist = [[],[]] - - change = [[], []] - p_avg_exploitability = [[],[]] - exploit_rewards = [[],[]] - - for i in range(num_iters): - old_pol = np.copy(players[0].opt_pol) - reward_hist[0].append(float(play_game(players, game))) - change[0].append(np.linalg.norm(players[0].opt_pol-old_pol)) - +def calc_exploitability(pol, game, learner, num_iters=100000, num_exploit_iters = 1000, tol=1e-10, exploit_tol = 1e-4): new_pols = [] - new_pols.append(players[0].opt_pol) + p_avg_exploitability = [0,0] + exploit_rewards = [[],[]] + if isinstance(learner, learners.kuhn_exact_solver): + new_pols.append(learner.calc_opt(pol[1],1)) + reward_hist = None + V_1 = None + else: + players = [RL(learner,0), fixed_pol(pol[1])] + + reward_hist = [[],[]] + + change = [[], []] + i = 0 + while True: + old_pol = np.copy(players[0].opt_pol) + reward_hist[0].append(float(play_game(players, game))) + change[0].append(np.linalg.norm(players[0].opt_pol-old_pol)) + i += 1 + if i == num_iters: + break + elif change[0][-1] <= tol: + break + + new_pols.append(players[0].opt_pol) + V_1 = learner.advantage_func.V players = [fixed_pol(new_pols[0]), fixed_pol(pol[1])] - - for i in range(num_exploit_iters): + i = 0 + while True: + old_exploitability = p_avg_exploitability[0] exploit_rewards[0].append(float(play_game(players, game))) + p_avg_exploitability[0] = sum(exploit_rewards[0])/len(exploit_rewards[0]) + i += 1 + if i == num_exploit_iters: + break + elif i>100 and np.abs(old_exploitability - p_avg_exploitability[0]) < exploit_tol: + break p_avg_exploitability[0] = sum(exploit_rewards[0])/len(exploit_rewards[0]) - V_1 = learner.advantage_func.V - learner.reset() - learner.wipe_memory() - - players = [fixed_pol(pol[0]), RL(learner,1)] + if isinstance(learner, learners.kuhn_exact_solver): + new_pols.append(learner.calc_opt(pol[0],2)) + V_2 = None + else: - for i in range(num_iters): - old_pol = np.copy(players[1].opt_pol) - reward_hist[1].append(-float(play_game(players, game))) - change[1].append(np.linalg.norm(players[1].opt_pol-old_pol)) - - new_pols.append(players[1].opt_pol) + learner.reset() + learner.wipe_memory() + + players = [fixed_pol(pol[0]), RL(learner,1)] + + i = 0 + while True: + old_pol = np.copy(players[1].opt_pol) + 
reward_hist[1].append(-float(play_game(players, game)))
+            change[1].append(np.linalg.norm(players[1].opt_pol-old_pol))
+            i += 1
+            if i == num_iters:
+                break
+            elif change[1][-1] <= tol:
+                break
+
+        V_2 = learner.advantage_func.V
+        new_pols.append(players[1].opt_pol)
+        learner.reset()
+        learner.wipe_memory()
     players = [fixed_pol(pol[0]), fixed_pol(new_pols[1])]
-    for i in range(num_exploit_iters):
+    i = 0
+    while True:
+        old_exploitability = p_avg_exploitability[1]
         exploit_rewards[1].append(-float(play_game(players, game)))
+        p_avg_exploitability[1] = sum(exploit_rewards[1])/len(exploit_rewards[1])
+        i+= 1
+        if i == num_exploit_iters:
+            break
+        elif i > 100 and np.abs(old_exploitability - p_avg_exploitability[1]) < exploit_tol:
+            break
     p_avg_exploitability[1] = sum(exploit_rewards[1])/len(exploit_rewards[1])
-    V_2 = learner.advantage_func.V
     avg_exploitability = sum(p_avg_exploitability)
-    learner.reset()
-    learner.wipe_memory()
     #import pdb; pdb.set_trace()
     return avg_exploitability, new_pols, reward_hist, (V_1, V_2)
diff --git a/games/kuhn.py b/games/kuhn.py
index 0c22ebc..5dc3c4d 100644
--- a/games/kuhn.py
+++ b/games/kuhn.py
@@ -107,7 +107,8 @@ def set_state(self, p_state, hidden_state, p_id):
         self.cards = list(self.poss_hidden[hidden_state])
         player_card = (p_state % (self.num_players+1))+1
         self.cards.insert(p_id, player_card)
-
+        if player_card in self.poss_hidden[hidden_state]:
+            return -1 # impossible state
         p_pot = (p_state // (self.num_players+1))
 
         self.curr_bets = list(self.poss_pots[p_pot])
@@ -128,6 +129,7 @@ def set_state(self, p_state, hidden_state, p_id):
             self.folded[p] = not self.betted[p]
         for p in range(first, self.num_players):
             self.folded[p] = not self.betted[p]
+        return 0
 
     def get_hidden(self, p_id):
         curr_cards = self.cards.copy()
diff --git a/games/leduc.py b/games/leduc.py
index 50f4abc..e3eb438 100644
--- a/games/leduc.py
+++ b/games/leduc.py
@@ -11,18 +11,19 @@ class leduc(base):
     ended = False
     pub_card_revealed = False
     num_players = 2
+    num_cards = 6
 
     def __init__(self, _num_players=2):
         random.seed(1)
         self.num_players = _num_players
-        self.num_states = [6*(3+6*3*3), 6*(3+6*3*3)]
+        self.num_states = [self.num_cards*(3+self.num_cards*3*3), self.num_cards*(3+self.num_cards*3*3)]
         # 6 possible cards in hand, 3 information states for each player in first round,
         # then 3 possible starting pots for second round & new card in public (one of 5 left)
         self.num_actions = [3 for i in range(self.num_players)]
         # call/check, raise, fold (sometimes some not allowed)
 
     def start_game(self):
         self.curr_bets = [1 for i in range(self.num_players)]
-        cards_available = list(range(6)) # 6 cards, % 2 for suit, // 2 for number
+        cards_available = list(range(self.num_cards)) # 6 cards, % 2 for suit, // 2 for number
         random.shuffle(cards_available)
         self.cards = cards_available[:self.num_players]
         self.pub_card = cards_available[self.num_players]
@@ -116,16 +117,20 @@ def observe(self):
         pot.pop(self.curr_player)
         if pub_card == -1:
             pot_ind = self.first_poss_pots.index(tuple(pot))
-            return pot_ind*(6)+card, reward
+            return pot_ind*(self.num_cards)+card, reward
         else:
             end_round1_ind = (self.end_first_bets-1)/2
             true_second_poss_pots = [tuple([elem+self.end_first_bets for elem in pot]) \
                 for pot in self.second_poss_pots]
             pot_ind = true_second_poss_pots.index(tuple(pot))
-            poss_pub_cards = list(range(6))
+            poss_pub_cards = list(range(self.num_cards))
             poss_pub_cards.pop(card)
-            pub_card_ind = poss_pub_cards.index(pub_card) + 1
-            return int((pub_card_ind)*3*3*6+end_round1_ind*3*6+pot_ind*6+card), reward
+            try:
+                
pub_card_ind = poss_pub_cards.index(pub_card) + 1 + except ValueError: + import pdb; pdb.set_trace() + return int((pub_card_ind)*3*3*self.num_cards\ + +end_round1_ind*3*self.num_cards+pot_ind*self.num_cards+card), reward else: return -1, reward @@ -136,3 +141,64 @@ def action(self, act): super().action("check") else: super().action("fold") + +class leduc_fict(leduc_int): + + def __init__(self): + super().__init__() + self.poss_hidden = list(product(list(range(0,self.num_cards)), \ + repeat=self.num_players-1)) + + + def set_state(self, p_state, hidden_state, p_id): + if self.num_players > 2: + raise NotImplementedError + else: + self.ended = False + self.curr_player = p_id + self.cards = list(self.poss_hidden[hidden_state]) + player_card = (p_state % (self.num_cards)) + self.cards.insert(p_id, player_card) + + p_pot = (p_state // self.num_cards) % 3 + + + if p_state > self.num_cards*3: + end_round1_ind = (p_state // (self.num_cards*3)) % 3 + end_round1_bets = (end_round1_ind*2)+1 + self.end_first_bets = end_round1_bets + pub_card_ind = ((p_state // (self.num_cards*3*3)) % 5)-1 + poss_pub_cards = list(range(self.num_cards)) + poss_pub_cards.pop(player_card) + self.pub_card = poss_pub_cards[pub_card_ind] + if self.pub_card in self.cards: + return -1 # impossible state + self.pub_card_revealed = True + bets = self.second_poss_pots[p_pot] + bets = [bet+end_round1_bets for bet in bets] + else: + self.pub_card_revealed = False + bets = self.first_poss_pots[p_pot] + + pub_card_sel = False + while not pub_card_sel: + pub_card = random.randint(0, self.num_cards-1) + if pub_card not in self.cards: + pub_card_sel = True + self.pub_card = pub_card + + + self.curr_bets = list(bets) + if self.pub_card_revealed: + self.curr_bets.insert(p_id, max(end_round1_bets, bets[0]-4)) + else: + self.curr_bets.insert(p_id, max(1, bets[0]-2)) + + self.folded = [False for folded in self.folded] + return 0 + + def get_hidden(self, p_id): + curr_cards = self.cards.copy() + curr_cards.pop(p_id) + hidden_state = self.poss_hidden.index(tuple(curr_cards)) + return hidden_state diff --git a/main.py b/main.py index 0a4b3e8..4076cd9 100644 --- a/main.py +++ b/main.py @@ -6,10 +6,10 @@ from functions import * import numpy as np import sys +import time +from tqdm import tqdm def main(): - game = Kuhn_Poker_int_io() - game = leduc_int() if len(sys.argv) > 1: if '--lvls' in sys.argv: level_ind = sys.argv.index('--lvls') @@ -24,26 +24,57 @@ def main(): return(-1) else: num_lvls = 10 + if '--game' in sys.argv: + game_ind = sys.argv.index('--game') + if len(sys.argv) > game_ind: + if sys.argv[game_ind+1] == "kuhn": + game = Kuhn_Poker_int_io() + fict_game = Fict_Kuhn_int() + elif sys.argv[game_ind+1] == "leduc": + game = leduc_int() + fict_game = leduc_fict() + else: + print("Please enter a game choice") + return -1 + else: + print("Please select a game") + return(-1) + else: + num_lvls = 10 else: num_lvls = 10 - averaged_bel ='--avg_bel' in sys.argv or '-ab' in sys.argv - averaged_pol ='--avg_pol' in sys.argv or '-ap' in sys.argv + game = Kuhn_Poker_int_io() + fict_game = Fict_Kuhn_int() + if '--all_avg' in sys.argv or '-a' in sys.argv: + averaged_bel = True + averaged_pol = True + learn_with_avg = True + else: + averaged_bel ='--avg_bel' in sys.argv or '-ab' in sys.argv + averaged_pol ='--avg_pol' in sys.argv or '-ap' in sys.argv + learn_with_avg = '--avg_learn' in sys.argv or '-al' in sys.argv games_per_lvl=100000 + exploit_freq= 1 num_players = 2 RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage,\ 
game.num_actions[p], game.num_states[p], extra_samples = 0)\ for p in range(num_players)] - fict_game = Fict_Kuhn_int() exploit_learner = learners.actor_critic(learners.softmax, learners.value_advantage, \ - game.num_actions[0], game.num_states[0]) - players = [RL(RL_learners[p],p) for p in range(num_players)] - #players = [OBL(RL_learners[p], p, fict_game) for p in range(num_players)] + game.num_actions[0], game.num_states[0], tol=9999) + #players = [RL(RL_learners[p],p) for p in range(num_players)] + players = [OBL(RL_learners[p], p, fict_game) for p in range(num_players)] + fixed_players = [fixed_pol(players[p].opt_pol) for p in range(num_players)] for p in range(num_players): curr_player = players.pop(p) + fixed_curr = fixed_players.pop(p) if curr_player.belief is not None: - curr_player.set_other_players(players) + if learn_with_avg: + curr_player.set_other_players(fixed_players) + else: + curr_player.set_other_players(players) + fixed_players.insert(p, fixed_curr) players.insert(p, curr_player) reward_hist = [[0 for i in range(games_per_lvl)] for lvl in range(num_lvls)] @@ -52,6 +83,8 @@ def main(): avg_pols = [] avg_bels = [] exploitability = [] + times = [] + tic = time.perf_counter() for lvl in range(num_lvls): pols = [] bels = [] @@ -74,7 +107,7 @@ def main(): p.belief = np.copy(avg_bel) new_avg_bels.append(avg_bel) avg_bels.append(new_avg_bels) - if averaged_pol: + if averaged_pol or learn_with_avg: new_avg_pols = [] for p_id, p in enumerate(players): total_pol = np.zeros_like(pol_hist[0][p_id]) @@ -83,15 +116,23 @@ def main(): avg_pol = total_pol / (lvl+1) new_avg_pols.append(avg_pol) avg_pols.append(new_avg_pols) - exploit, _, _ = calc_exploitability(new_avg_pols, game, exploit_learner) - else: - exploit, _, _ = calc_exploitability(pols, game, exploit_learner) - exploitability.append(exploit) - print(exploit) + if lvl % exploit_freq == 0: + if averaged_pol: + exploit, _, _, _ = calc_exploitability(new_avg_pols, game, exploit_learner) + else: + exploit, _, _, _ = calc_exploitability(pols, game, exploit_learner) + exploitability.append(exploit) + print(exploit) + if learn_with_avg: + for p_id, p in enumerate(players): + for other_p_id, other_pol in enumerate(new_avg_pols): + if other_p_id != p_id: + p.other_players[other_p_id].opt_pol = other_pol for p in players: p.reset() for i in range(games_per_lvl): reward_hist[lvl][i] = float(play_game(players, game)) + times.append(time.perf_counter()-tic) pols = [] bels = [] for p in players: @@ -122,9 +163,9 @@ def main(): avg_pol = total_pol / (lvl+1) new_avg_pols.append(avg_pol) avg_pols.append(new_avg_pols) - exploit, _, _ = calc_exploitability(new_avg_pols, game, exploit_learner) + exploit, _, _, _ = calc_exploitability(new_avg_pols, game, exploit_learner) else: - exploit, _, _ = calc_exploitability(pols, game, exploit_learner) + exploit, _, _, _ = calc_exploitability(pols, game, exploit_learner) exploitability.append(exploit) #pol_hist = pol_hist[-5:] #belief_hist = belief_hist[-5:] diff --git a/main_FSP.py b/main_FSP.py index 08a9735..b084c60 100644 --- a/main_FSP.py +++ b/main_FSP.py @@ -5,6 +5,7 @@ import agents.learners as learners from UI.plot_funcs import FSP_plots import logging +import numpy as np #sort of working logging.basicConfig(level=logging.INFO, format='%(relativeCreated)6d %(threadName)s %(message)s') @@ -18,18 +19,22 @@ extras = 0 num_BR = 30000 num_mixed = 20000 -iters = 1000000 -time = 3600 +iters = 100000000 +time = 36000 RL_iters = 1 -check_freq = 10 +check_freq = 1 #new test #extras = 0 -#num_BR = 30 
-#num_mixed = 0
-#iters = 10000000
-#time = 100
-
+#Num_BR = 3000
+#Num_mixed = 2000
+#Iters = 10000000
+#RL_iters = 1000
+#Time = 300
+#pol = np.array([[1/3,2/3],[0,1],[1,0],[0,1],[1/3,2/3],[1,0]])
+#pol = np.ones((6,2))/2
+#exact = learners.kuhn_exact_solver().calc_opt(pol, 1)
+#import pdb; pdb.set_trace()
 game_obj = leduc.leduc_int()
 game_obj = KP.Kuhn_Poker_int_io()