Implemented Kuhn solver to check exploitability properly
lukearcus committed Aug 25, 2022
1 parent ef02593 commit f738621
Showing 8 changed files with 352 additions and 84 deletions.
14 changes: 10 additions & 4 deletions FSP.py
@@ -53,6 +53,7 @@ def run_algo(self):
tic = time.perf_counter()
exploit_learner = learners.actor_critic(learners.softmax, learners.value_advantage, \
self.game.num_actions[0], self.game.num_states[0])
solver = learners.kuhn_exact_solver()
for j in range(1,self.max_iters): # start from 1 or 2?
eta_j = 1/j
#eta_j = 1/2
@@ -77,13 +78,18 @@ def run_algo(self):
#import pdb; pdb.set_trace()
if j%self.est_exploit_freq == 0:

exploit_calced_br, br_pols, _, values = calc_exploitability(new_pi, self.game, exploit_learner)
true_exploit, true_br_pols, _, _ = calc_exploitability(new_pi, self.game, solver,\
num_iters = -1, num_exploit_iters=-1)
exploit_calced_br, br_pols, _, values = calc_exploitability(new_pi, self.game, exploit_learner,\
num_iters = -1, num_exploit_iters=-1)
exploit = self.est_exploitability(new_pi, new_beta)
#import pdb; pdb.set_trace()
# compare br_pols with beta
log.info("exploitability: " + str(exploit_calced_br))
for p in range(self.num_players):
log.debug("p" + str(p+1) + " br: " + str(br_pols[p]))
log.info("exact exploitability: " + str(true_exploit))
log.info("exploitability with learned br: " + str(exploit_calced_br))
log.info("exploitability using beta: " + str(exploit))
exploitability.append(exploit)
exploitability.append(true_exploit)
toc = time.perf_counter()
if toc-tic > self.max_time:
break
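The new logging compares three measures: the exact exploitability from the Kuhn solver, the exploitability of a learned best response, and the estimate based on `beta`. Since `new_pi` is the FSP average policy, mixed toward the latest best response with weight `eta_j = 1/j` each iteration, the exact figure should trend toward zero as the average policy approaches a Nash equilibrium. A minimal sketch of the mixing step that presumably produces `new_pi` (assumed form, the actual update lives outside this hunk):

```python
import numpy as np

def fsp_average_update(pi, beta, eta):
    # Fictitious-self-play style mixing: move the average policy `pi`
    # a fraction `eta` toward the current best-response policy `beta`.
    return (1.0 - eta) * np.asarray(pi) + eta * np.asarray(beta)

# With eta_j = 1/j as in the loop above, something like:
# new_pi = [fsp_average_update(pi_p, beta_p, 1.0 / j) for pi_p, beta_p in zip(pi, beta)]
```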
131 changes: 118 additions & 13 deletions agents/learners.py
@@ -104,32 +104,37 @@ def reset(self):

class actor_critic(RL_base):

def __init__(self, pol_func, advantage_func, num_actions, num_states, init_adv = 0, extra_samples=10, init_lr=0.05, df=1.0):
def __init__(self, pol_func, advantage_func, num_actions, num_states, init_adv = 0, extra_samples=10, init_lr=0.05, df=1.0, tol=999):
self.pol_func = pol_func(num_states, num_actions)
self.advantage_func = advantage_func(init_adv, num_states, num_actions, df)
self.opt_pol = self.pol_func.policy
self.memory = []
self.tol=tol
super().__init__(extra_samples, init_lr, df)

def reset(self):
self.pol_func.reset()
self.advantage_func.reset()

def learn(self):

self.iteration += 1
lr = self.init_lr/(1+0.003*np.sqrt(self.iteration))
RL_buff = random.sample(self.memory, min(self.extra_samples, len(self.memory)))
RL_buff += self.memory[-min(self.last_round, len(self.memory)):]

for elem in RL_buff:
grad_log_theta = self.pol_func.grad_log(elem["s"], elem["a"])
advantage = self.advantage_func.eval(elem["s"], elem["a"], elem["r"], elem["s'"])
theta_update = grad_log_theta*advantage
a_prime = np.argmax(np.random.multinomial(1, pvals=self.opt_pol[elem["s'"]]))
delta = self.advantage_func.calc_delta(elem["s"], elem["a"], elem["r"], elem["s'"], a_prime)
self.advantage_func.update(lr*delta, elem["s"], elem["a"])
self.pol_func.thetas += lr*theta_update
self.opt_pol = self.pol_func.update()
prev_pol = np.copy(self.opt_pol) - self.tol - 1
while np.linalg.norm(prev_pol - self.opt_pol) > self.tol:
prev_pol = np.copy(self.opt_pol)
RL_buff = random.sample(self.memory, min(self.extra_samples, len(self.memory)))
RL_buff += self.memory[-min(self.last_round, len(self.memory)):]

for elem in RL_buff:
grad_log_theta = self.pol_func.grad_log(elem["s"], elem["a"])
advantage = self.advantage_func.eval(elem["s"], elem["a"], elem["r"], elem["s'"])
theta_update = grad_log_theta*advantage
a_prime = np.argmax(np.random.multinomial(1, pvals=self.opt_pol[elem["s'"]]))
delta = self.advantage_func.calc_delta(elem["s"], elem["a"], elem["r"], elem["s'"], a_prime)
self.advantage_func.update(lr*delta, elem["s"], elem["a"])
self.pol_func.thetas += lr*theta_update
self.opt_pol = self.pol_func.update()
return self.opt_pol
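With the new `tol` argument, `learn()` repeats its sampled replay sweep until the policy moves by less than `tol` between passes; the default of 999 keeps the old single-pass behaviour, since any policy change stays below that bound. A hedged usage sketch, following the constructor call already used in FSP.py (the state and action counts shown are Kuhn's, purely for illustration):

```python
from agents import learners

# Old behaviour: one sweep per learn() call (tol defaults to 999).
single_pass = learners.actor_critic(learners.softmax, learners.value_advantage,
                                    num_actions=2, num_states=6)

# New behaviour: keep sweeping inside learn() until successive policies
# differ by less than 1e-3 in norm.
to_convergence = learners.actor_critic(learners.softmax, learners.value_advantage,
                                       num_actions=2, num_states=6, tol=1e-3)
```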

class pol_func_base:
@@ -286,6 +291,7 @@ def update(self, update, s, _):
def reset(self):
self.V = np.ones_like(self.V) * self.init_adv


class complete_learner:

def __init__(self, RL, SL, num_loops=1):
@@ -305,3 +311,102 @@ def learn(self):
self.beta = self.RL_learner.learn()
self.pi = self.SL_learner.learn()
return self.beta, self.pi

class kuhn_exact_solver:

def calc_opt(self, opp_pol, p_id):
opt_pol = np.zeros((6,2))
if p_id == 1:
opt_pol[3,1] = 1 # always fold with a 1 if raised
opt_pol[5,0] = 1 # always call with a 3 if raised

p_bet_given_card = opp_pol[0:3,0]/2
p_bet = np.sum(p_bet_given_card) - p_bet_given_card[1]
p_cards_given_bet = p_bet_given_card/p_bet
p_cards_given_bet[1] = 0
if p_cards_given_bet[0] > 0.25:
opt_pol[4,0] = 1
elif p_cards_given_bet[0] < 0.25:
opt_pol[4,1] = 1
else:
opt_pol[4,0] = 1/3
opt_pol[4,1] = 2/3

for i in range(3):
bet_r = 0
check_r = 0
for j in range(3):
if i != j:
bet_r += 0.5*opp_pol[j+3,1]
if j < i:
bet_r += 0.5*opp_pol[j+3,0]*2
check_r += 0.5*opp_pol[j,1]*1
check_r += 0.5*opp_pol[j,0]*(opt_pol[i+3,0]*2-opt_pol[i+3,1])
else:
bet_r += 0.5*opp_pol[j+3,0]*(-2)
check_r += 0.5*opp_pol[j,1]*(-1)
check_r += 0.5*opp_pol[j,0]*((-opt_pol[i+3,0]*2)-opt_pol[i+3,1])
if bet_r > check_r:
opt_pol[i,0] = 1
elif check_r > bet_r:
opt_pol[i,1] = 1
else:
if i == 0:
opt_pol[i,0] = 1/3
opt_pol[i,1] = 2/3
if i == 1:
opt_pol[i,0] = 1
if i == 2:
opt_pol[i,0] = 1
else:
opt_pol[3,1] = 1
opt_pol[5,0] = 1
opt_pol[2,0] = 1

p_act_given_card_times_p_card = (opp_pol[0:3,:]/2)
belief = np.zeros((6,3))
for i in range(3):
p_act = np.sum(p_act_given_card_times_p_card, axis = 0) - p_act_given_card_times_p_card[i,:]
p_cards_given_act = p_act_given_card_times_p_card/p_act
rem_state = p_cards_given_act
rem_state[i,:] = 0
p_cards_given_state = rem_state.T
belief[3+i, :] = p_cards_given_state[0,:]
belief[i, :] = p_cards_given_state[1,:]
if belief[4,0] < 0.25:
opt_pol[4,1] = 1
elif belief[4,0] > 0.25:
opt_pol[4,0] = 1
else:
opt_pol[4,0] = 1/3 # doesn't actually matter what this is
opt_pol[4,1] = 2/3 # doesn't actually matter what this is

belief = belief[0:3] # discard the 2nd half of belief since we're done with it
opp_pol = opp_pol[3:] # same for policy
check_rewards = [-1, belief[1,0]-belief[1,2], 1]
bet_rewards = []
for i in range(2):
bet_r = 0
for j in range(3):
if i != j:
bet_r += belief[i,j] * opp_pol[j,1]
if j < i:
bet_r += belief[i, j]*opp_pol[j, 0]*2
else:
bet_r += belief[i, j]*opp_pol[j,0]*(-2)
bet_rewards.append(bet_r)
if bet_rewards[0] > check_rewards[0]:
opt_pol[0,0] = 1
elif bet_rewards[0] < check_rewards[0]:
opt_pol[0, 1] = 1
else:
opt_pol[0,0] = 1/3
opt_pol[0,1] = 2/3
if bet_rewards[1] > check_rewards[1]:
opt_pol[1,0] = 1
elif bet_rewards[1] < check_rewards[1]:
opt_pol[1, 1] = 1
else:
opt_pol[1,1] = 1

return opt_pol
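The solver's 6x2 policy layout, read off from the comments above: rows 0-2 are holding card 1, 2, 3 with no bet outstanding, rows 3-5 the same cards facing a bet, column 0 is the bet/call probability and column 1 the check/fold probability. A hedged usage sketch, computing player 1's best response to a hand-written player 2 policy in that encoding:

```python
import numpy as np
from agents import learners

solver = learners.kuhn_exact_solver()

# Illustrative player-2 policy: bet only with the 3 after a check,
# call a bet only with the 3, otherwise check/fold.
p2_pol = np.zeros((6, 2))
p2_pol[0, 1] = p2_pol[1, 1] = 1.0   # check with 1 and 2
p2_pol[2, 0] = 1.0                  # bet with 3
p2_pol[3, 1] = p2_pol[4, 1] = 1.0   # fold 1 and 2 to a bet
p2_pol[5, 0] = 1.0                  # call a bet with 3

br_p1 = solver.calc_opt(p2_pol, p_id=1)   # 6x2 best-response policy for player 1
```

As a rough check, the returned policy should fold a 2 to that opponent's bet (they only bet the 3) and bluff-bet the 1; the 0.25 threshold above is the pot-odds cutoff for calling a bet while holding the 2.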
12 changes: 9 additions & 3 deletions agents/players.py
@@ -98,7 +98,7 @@ def __init__(self, opt_pol):
def reset(self):
pass

def observe(self, observation):
def observe(self, observation, fict=False):
self.state = observation[0]
self.r = observation[1]

@@ -126,9 +126,15 @@ def observe(self, observation, fict=False):
if not fict:
if self.state != -1:
belief_probs = self.belief[self.state, :]
belief_state = np.argmax(np.random.multinomial(1, pvals=belief_probs))
#Here we do OBL
self.fict_game.set_state(self.state, belief_state, self.id)
res = -1
while res != 0:
belief_state = np.argmax(np.random.multinomial(1, pvals=belief_probs))
res = self.fict_game.set_state(self.state, belief_state, self.id)
if res == -1:
false_prob = belief_probs[belief_state]
belief_probs[:] += false_prob/(belief_probs.size-1)
belief_probs[belief_state] = 0 # set prob to 0 if it was an impossible state
act = self.action()
#if self.state == 0:
# import pdb; pdb.set_trace()
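The resampling loop above keeps drawing hidden states from the belief until `set_state` accepts one; each rejected draw has its probability zeroed and its mass spread evenly over the remaining entries. A minimal standalone sketch of that redistribution step (the function name is illustrative, not part of the repo):

```python
import numpy as np

def drop_impossible(belief_probs, impossible_idx):
    """Zero out an impossible hidden state and spread its probability mass
    evenly over the remaining entries, keeping the vector normalised."""
    probs = np.array(belief_probs, dtype=float)
    mass = probs[impossible_idx]
    probs += mass / (probs.size - 1)   # give every entry an equal share
    probs[impossible_idx] = 0.0        # then remove the impossible state
    return probs

# e.g. drop_impossible([0.5, 0.25, 0.25], 0) -> [0. , 0.5, 0.5]
```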
101 changes: 69 additions & 32 deletions functions.py
@@ -1,4 +1,5 @@
from agents.players import RL, fixed_pol
from agents import learners
import numpy as np

def play_game(players, game):
@@ -16,54 +17,90 @@ def play_game(players, game):
player.wipe_mem()
return reward

def calc_exploitability(pol, game, learner, num_iters=100000, num_exploit_iters = 1000):
players = [RL(learner,0), fixed_pol(pol[1])]

reward_hist = [[],[]]

change = [[], []]
p_avg_exploitability = [[],[]]
exploit_rewards = [[],[]]

for i in range(num_iters):
old_pol = np.copy(players[0].opt_pol)
reward_hist[0].append(float(play_game(players, game)))
change[0].append(np.linalg.norm(players[0].opt_pol-old_pol))

def calc_exploitability(pol, game, learner, num_iters=100000, num_exploit_iters = 1000, tol=1e-10, exploit_tol = 1e-4):
new_pols = []
new_pols.append(players[0].opt_pol)
p_avg_exploitability = [0,0]
exploit_rewards = [[],[]]
if isinstance(learner, learners.kuhn_exact_solver):
new_pols.append(learner.calc_opt(pol[1],1))
reward_hist = None
V_1 = None
else:
players = [RL(learner,0), fixed_pol(pol[1])]

reward_hist = [[],[]]

change = [[], []]
i = 0
while True:
old_pol = np.copy(players[0].opt_pol)
reward_hist[0].append(float(play_game(players, game)))
change[0].append(np.linalg.norm(players[0].opt_pol-old_pol))
i += 1
if i == num_iters:
break
elif change[0][-1] <= tol:
break

new_pols.append(players[0].opt_pol)
V_1 = learner.advantage_func.V

players = [fixed_pol(new_pols[0]), fixed_pol(pol[1])]

for i in range(num_exploit_iters):
i = 0
while True:
old_exploitability = p_avg_exploitability[0]
exploit_rewards[0].append(float(play_game(players, game)))
p_avg_exploitability[0] = sum(exploit_rewards[0])/len(exploit_rewards[0])
i += 1
if i == num_exploit_iters:
break
elif i>100 and np.abs(old_exploitability - p_avg_exploitability[0]) < exploit_tol:
break

p_avg_exploitability[0] = sum(exploit_rewards[0])/len(exploit_rewards[0])
V_1 = learner.advantage_func.V

learner.reset()
learner.wipe_memory()

players = [fixed_pol(pol[0]), RL(learner,1)]
if isinstance(learner, learners.kuhn_exact_solver):
new_pols.append(learner.calc_opt(pol[0],2))
V_2 = None
else:

for i in range(num_iters):
old_pol = np.copy(players[1].opt_pol)
reward_hist[1].append(-float(play_game(players, game)))
change[1].append(np.linalg.norm(players[1].opt_pol-old_pol))

new_pols.append(players[1].opt_pol)
learner.reset()
learner.wipe_memory()

players = [fixed_pol(pol[0]), RL(learner,1)]

i = 0
while True:
old_pol = np.copy(players[1].opt_pol)
reward_hist[1].append(-float(play_game(players, game)))
change[1].append(np.linalg.norm(players[1].opt_pol-old_pol))
i += 1
if i == num_iters:
break
elif change[1][-1] <= tol:
break

V_2 = learner.advantage_func.V
new_pols.append(players[1].opt_pol)
learner.reset()
learner.wipe_memory()
players = [fixed_pol(pol[0]), fixed_pol(new_pols[1])]

for i in range(num_exploit_iters):
i = 0
while True:
old_exploitability = p_avg_exploitability[1]
exploit_rewards[1].append(-float(play_game(players, game)))
p_avg_exploitability[1] = sum(exploit_rewards[1])/len(exploit_rewards[1])
i+= 1
if i == num_exploit_iters:
break
elif i > 100 and np.abs(old_exploitability - p_avg_exploitability[1]) < exploit_tol:
break

p_avg_exploitability[1] = sum(exploit_rewards[1])/len(exploit_rewards[1])

V_2 = learner.advantage_func.V

avg_exploitability = sum(p_avg_exploitability)
learner.reset()
learner.wipe_memory()

#import pdb; pdb.set_trace()
return avg_exploitability, new_pols, reward_hist, (V_1, V_2)
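`calc_exploitability` now takes two paths: with a `kuhn_exact_solver` it computes each best response analytically and skips the RL training loops, while with an RL learner it trains until the per-step policy change drops below `tol`. In both cases the best response's average payoff is then estimated by repeated play until it changes by less than `exploit_tol` once more than 100 games have been played. Passing `num_iters=-1` and `num_exploit_iters=-1`, as FSP.py now does, means the iteration counters never hit their cap, so only the tolerance checks stop the loops. A hedged usage sketch (`pol` and `game` are assumed to exist already; the Kuhn game class in games/kuhn.py is not shown in this diff):

```python
from agents import learners
from functions import calc_exploitability

solver = learners.kuhn_exact_solver()

# pol: [player-1 policy, player-2 policy], each a 6x2 array in the encoding above.
# game: an instance of the Kuhn game from games/kuhn.py.
exploit, br_pols, _, _ = calc_exploitability(pol, game, solver,
                                             num_iters=-1, num_exploit_iters=-1)
print("exploitability against exact best responses:", exploit)
```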
4 changes: 3 additions & 1 deletion games/kuhn.py
@@ -107,7 +107,8 @@ def set_state(self, p_state, hidden_state, p_id):
self.cards = list(self.poss_hidden[hidden_state])
player_card = (p_state % (self.num_players+1))+1
self.cards.insert(p_id, player_card)

if player_card in self.poss_hidden[hidden_state]:
return -1 # impossible state
p_pot = (p_state // (self.num_players+1))

self.curr_bets = list(self.poss_pots[p_pot])
@@ -128,6 +129,7 @@ def set_state(self, p_state, hidden_state, p_id):
self.folded[p] = not self.betted[p]
for p in range(first, self.num_players):
self.folded[p] = not self.betted[p]
return 0

def get_hidden(self, p_id):
curr_cards = self.cards.copy()
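`set_state` now reports whether the requested (public state, hidden state) pair is consistent: it returns -1 when the player's own card also appears among the hidden cards, and 0 otherwise, which is what the OBL resampling loop in agents/players.py checks. A minimal illustration of the consistency rule (the helper name is hypothetical; `poss_hidden` entries are assumed to be tuples of the other players' cards):

```python
def deal_is_possible(player_card, hidden_cards):
    # Each card exists exactly once in Kuhn poker, so a sampled hidden state
    # is only consistent if it does not contain the player's own card.
    return player_card not in hidden_cards

# e.g. deal_is_possible(2, (3,)) -> True, deal_is_possible(2, (2,)) -> False;
# set_state mirrors this by returning -1 for the impossible case and 0 otherwise.
```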
