Commit

added logs for FSP, created BR tester

lukearcus committed Aug 10, 2022
1 parent 6a50143 commit 8fa28cf
Showing 7 changed files with 113 additions and 46 deletions.
28 changes: 20 additions & 8 deletions FSP.py
@@ -1,6 +1,8 @@
import numpy as np
import random
import time
+import logging
+log = logging.getLogger(__name__)
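# module-level logger; the application is expected to configure handlers (main_FSP.py calls logging.basicConfig)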

class FSP:

@@ -19,7 +21,6 @@ def gen_data(self, pi, beta, eta):
sigma = []
for p in range(self.num_players):
sigma.append((1-eta)*pi[p]+eta*beta[p])
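# sigma is the strategy actually played: the best response beta with weight eta, the average policy pi otherwise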
-#import pdb; pdb.set_trace()
D = [[] for i in range(self.num_players)]
for i in range(self.n):
res = self.play_game(sigma)
@@ -42,8 +43,8 @@ def run_algo(self):
for p in range(self.num_players):
pi_1.append(self.agents[p].pi)

-pi.append(pi_1)
-beta.append(pi_1)
+pi.append(pi_1) # pi_1
+beta.append(pi_1) # beta_2

exploitability = []
tic = time.perf_counter()
@@ -54,16 +55,23 @@ def run_algo(self):
#exploitability.append(curr_exploitability)
new_beta = []
new_pi = []
+diff = 0
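# diff accumulates, across players, how far the new average policy has drifted from sigma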
for p in range(self.num_players):
self.agents[p].update_memory(D[p])
new_b, new_p = self.agents[p].learn()
-new_beta.append(new_b)
-new_pi.append(new_p)
+new_beta.append(new_b) # beta_(j+1)
+new_pi.append(new_p) # pi_j
+log.debug("p" + str(p+1) + " sigma: " + str(sigma[p]))
+log.debug("p" + str(p+1) + " new_pi: " + str(new_pi[p]))
+log.debug("p" + str(p+1) + " new_beta: " + str(new_beta[p]))
+#import pdb; pdb.set_trace()
+diff += np.linalg.norm(new_pi[p]-sigma[p])
+log.info("norm difference between new_pi and sigma: " +str(diff))
pi.append(new_pi)
beta.append(new_beta)
#import pdb; pdb.set_trace()
if j%self.est_exploit_freq == 0:
-exploitability.append(self.est_exploitability(new_pi, new_beta))
+exploitability.append(self.est_exploitability(sigma, new_beta))
toc = time.perf_counter()
if toc-tic > self.max_time:
break
@@ -91,7 +99,10 @@ def play_game(self, strat):
buffer[player][-1]["r"] = r
return buffer

-#def calc_BRs(self, pol):
+#def calc_true_BRs(self, pol):

+#for each information state
+#calc next state probs (given fixed opponent)
# if self.num_players != 2:
# raise NotImplementedError
# else:
@@ -111,5 +122,6 @@ def est_exploitability(self, pi, beta):

for p in range(self.num_players):
R[p] /= self.exploitability_iters
#import pdb; pdb.set_trace()
+log.info("Exploitability: " + str(sum(R)))
return sum(R)
7 changes: 3 additions & 4 deletions agents/learners.py
@@ -110,11 +110,9 @@ def learn(self):
RL_buff += self.memory[-min(self.last_round, len(self.memory)):]

for elem in RL_buff:
-theta_update = np.zeros_like(self.pol_func.thetas)
grad_log_theta = self.pol_func.grad_log(elem["s"], elem["a"])
advantage = self.advantage_func.eval(elem["s"], elem["a"], elem["r"], elem["s'"])
-theta_update += grad_log_theta*advantage
+theta_update = grad_log_theta*advantage
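# a' below is sampled from the current policy so calc_delta can form a SARSA-style TD error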
a_prime = np.argmax(np.random.multinomial(1, pvals=self.opt_pol[elem["s'"]]))
delta = self.advantage_func.calc_delta(elem["s"], elem["a"], elem["r"], elem["s'"], a_prime)
self.advantage_func.update(lr*delta, elem["s"], elem["a"])
@@ -171,6 +169,7 @@ def grad_log(self, s, a):
return grad

def update(self):
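# clip thetas to [-100, 100] so np.exp in the softmax below cannot overflow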
+self.thetas = np.minimum(10**2,np.maximum(-10**2,self.thetas))
self.policy = np.exp(self.thetas)/np.sum(np.exp(self.thetas),axis=1)[:,np.newaxis]
return self.policy

@@ -265,7 +264,7 @@ def update_memory(self, data):

def learn(self):
self.RL_learner.reset()
-for i in range(100):
+for i in range(50):
self.beta = self.RL_learner.learn()
self.pi = self.SL_learner.learn()
return self.beta, self.pi
9 changes: 8 additions & 1 deletion agents/players.py
@@ -18,6 +18,9 @@ def action(self):
def reset(self):
pass

+def wipe_mem(self):
+pass

class human(player):
Q_mat = 0
state_hist=[]
@@ -58,10 +61,14 @@ def __init__(self, learner, player_id):
self.id = player_id
self.buffer = []

-def reset(self):
+def wipe_mem(self):
self.buffer = []
+self.learner.wipe_memory()

+def reset(self):
+self.learner.reset()
+self.opt_pol = self.learner.opt_pol

def observe(self, observation, fict=False):
self.state = observation[0]
if not fict:
16 changes: 16 additions & 0 deletions functions.py
@@ -0,0 +1,16 @@

def play_game(players, game):
game.start_game()
while not game.ended:
player = players[game.curr_player]
player.observe(game.observe())
game.action(player.action())
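# runs once per player: each observes the terminal state, and game.action(None) appears to only advance curr_player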
for i in players:
player = players[game.curr_player]
player.observe(game.observe())
game.action(None)
reward = players[0].r
for player in players:
player.wipe_mem()
return reward

33 changes: 9 additions & 24 deletions main.py
@@ -3,39 +3,19 @@
import agents.learners as learners
from scipy.ndimage.filters import gaussian_filter1d
from UI.plot_funcs import plot_everything

-def play_game(players, game):
-game.start_game()
-while not game.ended:
-player = players[game.curr_player]
-player.observe(game.observe())
-game.action(player.action())
-for i in players:
-player = players[game.curr_player]
-player.observe(game.observe())
-game.action(None)
-reward = players[0].r
-for player in players:
-player.reset()
-return reward
+from functions import play_game

game = Kuhn_Poker_int_io()
-num_lvls = 4
+num_lvls = 1
games_per_lvl=100000
num_players = 2
RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = 0)\
for p in range(num_players)]
fict_game = Fict_Kuhn_int()
#players = [RL(RL_learners[p],p) for p in range(num_players)]
pol = [np.ones((6,2))/2 for i in range(2)]
#pol = [np.array([[0.2045, 0.795],[0.7105,0.2895],[0.6680,0.3320],[0.7231,0.2769],[0.5,0.5],[0.5,0.5]]),\
# np.array([[0.6861, 0.3139],[0.4518,0.5482],[0.4385,0.5615],[0.1512,0.8488],[0.7143,0.2857],[0.5833,0.4167]])]
#pol = [np.array([[0.75,0.25],[0.75,0.25],[0.75,0.25],[0.5,0.5],[0.5,0.5],[0.5,0.5]]),\
# np.array([[0.67,0.33],[0.69,0.31],[0.71,0.29],[0.19,0.81],[0.77,0.23],[0.79,0.21]])]

#players = [fixed_pol(pol[0]), RL(RL_learners[1],1)]
#players = [RL(RL_learners[0],0), fixed_pol(pol[1])]
#players = [RL(RL_learners[p],p) for p in range(num_players)]
players = [OBL(RL_learners[p], p, fict_game) for p in range(num_players)]

for p in range(num_players):
curr_player = players.pop(p)
if curr_player.belief is not None:
@@ -57,6 +37,8 @@ def play_game(players, game):
bels.append(np.zeros((1,1)))
pol_hist.append(pols)
belief_hist.append(bels)
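# reset each player's learner so the new level starts from a fresh policy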
+for p in players:
+p.reset()
for i in range(games_per_lvl):
reward_hist[lvl][i] = float(play_game(players, game))
pols = []
@@ -71,6 +53,9 @@ def play_game(players, game):
pol_hist.append(pols)
belief_hist.append(bels)

+#pol_hist = pol_hist[-5:]
+#belief_hist = belief_hist[-5:]

plot_everything(pol_hist, belief_hist, "kuhn")

import pdb; pdb.set_trace()
19 changes: 10 additions & 9 deletions main_FSP.py
@@ -3,31 +3,32 @@
import matplotlib.pyplot as plt
import agents.learners as learners
from UI.plot_funcs import FSP_plots

+import logging

#sort of working
+logging.basicConfig(level=logging.DEBUG, format='%(relativeCreated)6d %(threadName)s %(message)s')
extras = 20
num_BR = 30
num_mixed = 10
iters= 200000
-time = 300
+time = 30

#test
#extras = 2
#num_BR = 4
#num_mixed = 0
#iters = 100000
#time = 60
extras = 0
num_BR = 10000
num_mixed = 1000
iters = 100000
time = 60

KP_game = game.Kuhn_Poker_int_io()

-RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, init_adv=-2, extra_samples = extras)\
+RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, init_adv=0, extra_samples = extras)\
for p in range(2)]
SL_learners = [learners.count_based_SL((6,2)) for p in range(2)]

agents = [learners.complete_learner(RL_learners[p], SL_learners[p]) for p in range(2)]

-worker = FSP.FSP(KP_game, agents, max_iters=iters, max_time=time, m=num_BR, n=num_mixed)
+worker = FSP.FSP(KP_game, agents, max_iters=iters, max_time=time, m=num_BR, n=num_mixed, exploit_freq=1)
pi, exploitability, data = worker.run_algo()

FSP_plots(exploitability, worker.est_exploit_freq, [pi], 'kuhn')
47 changes: 47 additions & 0 deletions test_BR.py
@@ -0,0 +1,47 @@
from Kuhn_poker.game import *
from agents.players import *
import agents.learners as learners
from UI.plot_funcs import plot_everything
from functions import play_game

game = Kuhn_Poker_int_io()
games_per_lvl=100000
num_players = 2
RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = 0)\
for p in range(num_players)]
fict_game = Fict_Kuhn_int()
pol = [np.ones((6,2))/2 for i in range(2)]
#pol = [np.array([[0.2045, 0.795],[0.7105,0.2895],[0.6680,0.3320],[0.7231,0.2769],[0.5,0.5],[0.5,0.5]]),\
# np.array([[0.6861, 0.3139],[0.4518,0.5482],[0.4385,0.5615],[0.1512,0.8488],[0.7143,0.2857],[0.5833,0.4167]])]
#pol = [np.array([[0.75,0.25],[0.75,0.25],[0.75,0.25],[0.5,0.5],[0.5,0.5],[0.5,0.5]]),\
# np.array([[0.67,0.33],[0.69,0.31],[0.71,0.29],[0.19,0.81],[0.77,0.23],[0.79,0.21]])]
pol = [np.array([[2/3, 1/3],[2/3,1/3],[2/3,1/3],[1/3,2/3],[2/3,1/3],[2/3,1/3]]) for i in range(2)]
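# hand-set fixed policy; the RL learner below should converge to a best response against it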

players = [RL(RL_learners[0],0), fixed_pol(pol[1])]

reward_hist = []

for i in range(games_per_lvl):
reward_hist.append(float(play_game(players, game)))

R = reward_hist[-100:]
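# keep the last 100 rewards as a rough estimate of the learned best response's value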
pols = []
pols.append(players[0].opt_pol)
V_1 = players[0].learner.advantage_func.V

players = [fixed_pol(pol[0]), RL(RL_learners[1],1)]

for i in range(games_per_lvl):
reward_hist.append(-float(play_game(players, game)))

R += reward_hist[-100:]
pols.append(players[1].opt_pol)
V_2 = players[1].learner.advantage_func.V

print(sum(R)/200)
print(pols[0])
print(pols[1])
print(V_1)
print(V_2)

import pdb; pdb.set_trace()
