From 8fa28cf74e0c82708b0a88bf1b90e02c3532b0fd Mon Sep 17 00:00:00 2001 From: lukearcus Date: Wed, 10 Aug 2022 16:51:48 +0100 Subject: [PATCH] added logs for FSP, created BR tester --- FSP.py | 28 +++++++++++++++++++-------- agents/learners.py | 7 +++---- agents/players.py | 9 ++++++++- functions.py | 16 ++++++++++++++++ main.py | 33 +++++++++----------------------- main_FSP.py | 19 ++++++++++--------- test_BR.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 113 insertions(+), 46 deletions(-) create mode 100644 functions.py create mode 100644 test_BR.py diff --git a/FSP.py b/FSP.py index 3d091c8..591cb5e 100644 --- a/FSP.py +++ b/FSP.py @@ -1,6 +1,8 @@ import numpy as np import random import time +import logging +log = logging.getLogger(__name__) class FSP: @@ -19,7 +21,6 @@ def gen_data(self, pi, beta, eta): sigma = [] for p in range(self.num_players): sigma.append((1-eta)*pi[p]+eta*beta[p]) - #import pdb; pdb.set_trace() D = [[] for i in range(self.num_players)] for i in range(self.n): res = self.play_game(sigma) @@ -42,8 +43,8 @@ def run_algo(self): for p in range(self.num_players): pi_1.append(self.agents[p].pi) - pi.append(pi_1) - beta.append(pi_1) + pi.append(pi_1) # pi_1 + beta.append(pi_1) # beta_2 exploitability = [] tic = time.perf_counter() @@ -54,16 +55,23 @@ def run_algo(self): #exploitability.append(curr_exploitability) new_beta = [] new_pi = [] + diff = 0 for p in range(self.num_players): self.agents[p].update_memory(D[p]) new_b, new_p = self.agents[p].learn() - new_beta.append(new_b) - new_pi.append(new_p) + new_beta.append(new_b) # beta_(j+1) + new_pi.append(new_p) # pi_j + log.debug("p" + str(p+1) + " sigma: " + str(sigma[p])) + log.debug("p" + str(p+1) + " new_pi: " + str(new_pi[p])) + log.debug("p" + str(p+1) + " new_beta: " + str(new_beta[p])) + #import pdb; pdb.set_trace() + diff += np.linalg.norm(new_pi[p]-sigma[p]) + log.info("norm difference between new_pi and sigma: " +str(diff)) pi.append(new_pi) beta.append(new_beta) #import pdb; pdb.set_trace() if j%self.est_exploit_freq == 0: - exploitability.append(self.est_exploitability(new_pi, new_beta)) + exploitability.append(self.est_exploitability(sigma, new_beta)) toc = time.perf_counter() if toc-tic > self.max_time: break @@ -91,7 +99,10 @@ def play_game(self, strat): buffer[player][-1]["r"] = r return buffer - #def calc_BRs(self, pol): + #def calc_true_BRs(self, pol): + + #for each information state + #calc next state probs (given fixed opponent) # if self.num_players != 2: # raise NotImplementedError # else: @@ -111,5 +122,6 @@ def est_exploitability(self, pi, beta): for p in range(self.num_players): R[p] /= self.exploitability_iters - #import pdb; pdb.set_trace() + #import pdb; pdb.set_trace() + log.info("Exploitability: " + str(sum(R))) return sum(R) diff --git a/agents/learners.py b/agents/learners.py index 49081ec..540ce84 100644 --- a/agents/learners.py +++ b/agents/learners.py @@ -110,11 +110,9 @@ def learn(self): RL_buff += self.memory[-min(self.last_round, len(self.memory)):] for elem in RL_buff: - theta_update = np.zeros_like(self.pol_func.thetas) grad_log_theta = self.pol_func.grad_log(elem["s"], elem["a"]) advantage = self.advantage_func.eval(elem["s"], elem["a"], elem["r"], elem["s'"]) - theta_update += grad_log_theta*advantage - + theta_update = grad_log_theta*advantage a_prime = np.argmax(np.random.multinomial(1, pvals=self.opt_pol[elem["s'"]])) delta = self.advantage_func.calc_delta(elem["s"], elem["a"], elem["r"], elem["s'"], a_prime) self.advantage_func.update(lr*delta, 
elem["s"], elem["a"]) @@ -171,6 +169,7 @@ def grad_log(self, s, a): return grad def update(self): + self.thetas = np.minimum(10**2,np.maximum(-10**2,self.thetas)) self.policy = np.exp(self.thetas)/np.sum(np.exp(self.thetas),axis=1)[:,np.newaxis] return self.policy @@ -265,7 +264,7 @@ def update_memory(self, data): def learn(self): self.RL_learner.reset() - for i in range(100): + for i in range(50): self.beta = self.RL_learner.learn() self.pi = self.SL_learner.learn() return self.beta, self.pi diff --git a/agents/players.py b/agents/players.py index 6b717e9..d4a76a3 100644 --- a/agents/players.py +++ b/agents/players.py @@ -18,6 +18,9 @@ def action(self): def reset(self): pass + def wipe_mem(self): + pass + class human(player): Q_mat = 0 state_hist=[] @@ -58,10 +61,14 @@ def __init__(self, learner, player_id): self.id = player_id self.buffer = [] - def reset(self): + def wipe_mem(self): self.buffer = [] self.learner.wipe_memory() + def reset(self): + self.learner.reset() + self.opt_pol = self.learner.opt_pol + def observe(self, observation, fict=False): self.state = observation[0] if not fict: diff --git a/functions.py b/functions.py new file mode 100644 index 0000000..2d3fdb8 --- /dev/null +++ b/functions.py @@ -0,0 +1,16 @@ + +def play_game(players, game): + game.start_game() + while not game.ended: + player = players[game.curr_player] + player.observe(game.observe()) + game.action(player.action()) + for i in players: + player = players[game.curr_player] + player.observe(game.observe()) + game.action(None) + reward = players[0].r + for player in players: + player.wipe_mem() + return reward + diff --git a/main.py b/main.py index 3b71830..0421999 100644 --- a/main.py +++ b/main.py @@ -3,39 +3,19 @@ import agents.learners as learners from scipy.ndimage.filters import gaussian_filter1d from UI.plot_funcs import plot_everything - -def play_game(players, game): - game.start_game() - while not game.ended: - player = players[game.curr_player] - player.observe(game.observe()) - game.action(player.action()) - for i in players: - player = players[game.curr_player] - player.observe(game.observe()) - game.action(None) - reward = players[0].r - for player in players: - player.reset() - return reward +from functions import play_game game = Kuhn_Poker_int_io() -num_lvls = 4 +num_lvls = 1 games_per_lvl=100000 num_players = 2 RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = 0)\ for p in range(num_players)] fict_game = Fict_Kuhn_int() -#players = [RL(RL_learners[p],p) for p in range(num_players)] -pol = [np.ones((6,2))/2 for i in range(2)] -#pol = [np.array([[0.2045, 0.795],[0.7105,0.2895],[0.6680,0.3320],[0.7231,0.2769],[0.5,0.5],[0.5,0.5]]),\ -# np.array([[0.6861, 0.3139],[0.4518,0.5482],[0.4385,0.5615],[0.1512,0.8488],[0.7143,0.2857],[0.5833,0.4167]])] -#pol = [np.array([[0.75,0.25],[0.75,0.25],[0.75,0.25],[0.5,0.5],[0.5,0.5],[0.5,0.5]]),\ -# np.array([[0.67,0.33],[0.69,0.31],[0.71,0.29],[0.19,0.81],[0.77,0.23],[0.79,0.21]])] -#players = [fixed_pol(pol[0]), RL(RL_learners[1],1)] -#players = [RL(RL_learners[0],0), fixed_pol(pol[1])] +#players = [RL(RL_learners[p],p) for p in range(num_players)] players = [OBL(RL_learners[p], p, fict_game) for p in range(num_players)] + for p in range(num_players): curr_player = players.pop(p) if curr_player.belief is not None: @@ -57,6 +37,8 @@ def play_game(players, game): bels.append(np.zeros((1,1))) pol_hist.append(pols) belief_hist.append(bels) + for p in players: + p.reset() for i in range(games_per_lvl): 
reward_hist[lvl][i] = float(play_game(players, game)) pols = [] @@ -71,6 +53,9 @@ def play_game(players, game): pol_hist.append(pols) belief_hist.append(bels) +#pol_hist = pol_hist[-5:] +#belief_hist = belief_hist[-5:] + plot_everything(pol_hist, belief_hist, "kuhn") import pdb; pdb.set_trace() diff --git a/main_FSP.py b/main_FSP.py index b55e395..287e623 100644 --- a/main_FSP.py +++ b/main_FSP.py @@ -3,31 +3,32 @@ import matplotlib.pyplot as plt import agents.learners as learners from UI.plot_funcs import FSP_plots - +import logging #sort of working +logging.basicConfig(level=logging.DEBUG, format='%(relativeCreated)6d %(threadName)s %(message)s') extras = 20 num_BR = 30 num_mixed = 10 iters= 200000 -time = 300 +time = 30 #test -#extras = 2 -#num_BR = 4 -#num_mixed = 0 -#iters = 100000 -#time = 60 +extras = 0 +num_BR = 10000 +num_mixed = 1000 +iters = 100000 +time = 60 KP_game = game.Kuhn_Poker_int_io() -RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, init_adv=-2, extra_samples = extras)\ +RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, init_adv=0, extra_samples = extras)\ for p in range(2)] SL_learners = [learners.count_based_SL((6,2)) for p in range(2)] agents = [learners.complete_learner(RL_learners[p], SL_learners[p]) for p in range(2)] -worker = FSP.FSP(KP_game, agents, max_iters=iters, max_time=time, m=num_BR, n=num_mixed) +worker = FSP.FSP(KP_game, agents, max_iters=iters, max_time=time, m=num_BR, n=num_mixed, exploit_freq=1) pi, exploitability, data = worker.run_algo() FSP_plots(exploitability, worker.est_exploit_freq, [pi], 'kuhn') diff --git a/test_BR.py b/test_BR.py new file mode 100644 index 0000000..de7d3b8 --- /dev/null +++ b/test_BR.py @@ -0,0 +1,47 @@ +from Kuhn_poker.game import * +from agents.players import * +import agents.learners as learners +from UI.plot_funcs import plot_everything +from functions import play_game + +game = Kuhn_Poker_int_io() +games_per_lvl=100000 +num_players = 2 +RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = 0)\ + for p in range(num_players)] +fict_game = Fict_Kuhn_int() +pol = [np.ones((6,2))/2 for i in range(2)] +#pol = [np.array([[0.2045, 0.795],[0.7105,0.2895],[0.6680,0.3320],[0.7231,0.2769],[0.5,0.5],[0.5,0.5]]),\ +# np.array([[0.6861, 0.3139],[0.4518,0.5482],[0.4385,0.5615],[0.1512,0.8488],[0.7143,0.2857],[0.5833,0.4167]])] +#pol = [np.array([[0.75,0.25],[0.75,0.25],[0.75,0.25],[0.5,0.5],[0.5,0.5],[0.5,0.5]]),\ +# np.array([[0.67,0.33],[0.69,0.31],[0.71,0.29],[0.19,0.81],[0.77,0.23],[0.79,0.21]])] +pol = [np.array([[2/3, 1/3],[2/3,1/3],[2/3,1/3],[1/3,2/3],[2/3,1/3],[2/3,1/3]]) for i in range(2)] + +players = [RL(RL_learners[0],0), fixed_pol(pol[1])] + +reward_hist = [] + +for i in range(games_per_lvl): + reward_hist.append(float(play_game(players, game))) + +R = reward_hist[-100:] +pols = [] +pols.append(players[0].opt_pol) +V_1 = players[0].learner.advantage_func.V + +players = [fixed_pol(pol[0]), RL(RL_learners[1],1)] + +for i in range(games_per_lvl): + reward_hist.append(-float(play_game(players, game))) + +R += reward_hist[-100:] +pols.append(players[1].opt_pol) +V_2 = players[1].learner.advantage_func.V + +print(sum(R)/200) +print(pols[0]) +print(pols[1]) +print(V_1) +print(V_2) + +import pdb; pdb.set_trace()
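
Note on the exploitability changes above: run_algo now passes the mixed strategy sigma (rather than new_pi) to est_exploitability and logs the result. The quantity being estimated is the sum, over players, of the average reward each player's best response earns while the other players keep the average strategy; it approaches zero as the average profile approaches a Nash equilibrium. Below is a minimal reference sketch of that estimate, kept independent of the repository's classes -- the names play_episode, avg_strategy, and best_response are hypothetical, not part of this codebase.

    import numpy as np

    def estimate_exploitability(play_episode, avg_strategy, best_response,
                                num_players=2, iters=10_000):
        """Monte-Carlo estimate of exploitability.

        play_episode(strats) is assumed to play one game with one policy per
        player and return a list of per-player rewards. For each player p we
        let only p deviate to its best response and average p's reward; the
        sum of these deviation values is the exploitability estimate.
        """
        total = 0.0
        for p in range(num_players):
            strats = list(avg_strategy)
            strats[p] = best_response[p]   # only player p deviates
            value_p = 0.0
            for _ in range(iters):
                value_p += play_episode(strats)[p]
            total += value_p / iters
        return total

The new test_BR.py probes the same quantity one side at a time: it trains an RL best response against a fixed opponent policy and averages the last rewards of each run, so the sum(R)/200 it prints is roughly the mean of the two deviation values, i.e. about half of the sum computed above (and noisier, since the rewards come from a still-training stochastic policy).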
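
Note on the new convergence logging: the "norm difference between new_pi and sigma" line measures how far the updated average policy has drifted from the mixture sigma = (1-eta)*pi + eta*beta that generated the last batch of data; it should shrink as the supervised average policy catches up with the data-generating mixture. A small numpy sketch of the same diagnostic, using hypothetical 6-state, 2-action policy tables shaped like the repository's Kuhn-poker policies (the concrete values below are illustrative only):

    import numpy as np

    # Hypothetical average policy (pi) and best response (beta) for one player:
    # 6 information states x 2 actions, each row a probability distribution.
    pi = np.full((6, 2), 0.5)
    beta = np.tile([0.8, 0.2], (6, 1))
    eta = 0.1                                # mixing parameter

    # FSP-style mixture used to generate the next batch of data.
    sigma = (1 - eta) * pi + eta * beta

    # Stand-in for the updated average policy returned by the SL step.
    new_pi = 0.95 * pi + 0.05 * beta

    # The logged diagnostic: distance between the new average policy and
    # the mixture it was trained to imitate.
    diff = np.linalg.norm(new_pi - sigma)
    print(f"norm difference between new_pi and sigma: {diff:.4f}")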