added timer to FSP
lukearcus committed Aug 8, 2022
1 parent 1c37dcb commit fe6e762
Showing 5 changed files with 99 additions and 20 deletions.
20 changes: 14 additions & 6 deletions Kuhn_poker/FSP.py
@@ -1,22 +1,25 @@
import numpy as np
import random
import time

class FSP:

def __init__(self, _game, _agents, max_iters=100, m=50, n=50):
def __init__(self, _game, _agents, max_iters=100, max_time=300, m=50, n=50, exploit_iters=100, exploit_freq=10):
self.game = _game
self.agents = _agents
self.num_players = self.game.num_players
self.m = m
self.n = n
self.max_iters = max_iters
self.exploitability_iters = 100
self.est_exploit_freq = 1
self.max_time = max_time
self.exploitability_iters = exploit_iters
self.est_exploit_freq = exploit_freq

def gen_data(self, pi, beta, eta):
sigma = []
for p in range(self.num_players):
sigma.append((1-eta)*pi[p]+eta*beta[p])
#import pdb; pdb.set_trace()
D = [[] for i in range(self.num_players)]
for i in range(self.n):
res = self.play_game(sigma)
@@ -30,7 +33,7 @@ def gen_data(self, pi, beta, eta):
result = self.play_game(strat)
exploitability += result[p][-1]['r']/(self.m)
D[p].append((result[p],strat[p],True))
return D, exploitability
return D, exploitability, sigma

def run_algo(self):
pi = []
@@ -43,10 +46,11 @@ def run_algo(self):
beta.append(pi_1)

exploitability = []
tic = time.perf_counter()
for j in range(2,self.max_iters):
eta_j = 1/j
#eta_j = 1/2
D, curr_exploitability = self.gen_data(pi[-1],beta[-1], eta_j)
D, curr_exploitability, sigma = self.gen_data(pi[-1],beta[-1], eta_j)
#exploitability.append(curr_exploitability)
new_beta = []
new_pi = []
@@ -57,9 +61,12 @@ def run_algo(self):
new_pi.append(new_p)
pi.append(new_pi)
beta.append(new_beta)

#import pdb; pdb.set_trace()
if j%self.est_exploit_freq == 0:
exploitability.append(self.est_exploitability(new_pi, new_beta))
toc = time.perf_counter()
if toc-tic > self.max_time:
break
#import pdb; pdb.set_trace()
return pi[-1], exploitability, (pi, beta, D)

@@ -104,4 +111,5 @@ def est_exploitability(self, pi, beta):

for p in range(self.num_players):
R[p] /= self.exploitability_iters
#import pdb; pdb.set_trace()
return sum(R)
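
The FSP.py changes above make the stopping and evaluation behaviour configurable: run_algo still loops for at most max_iters iterations, but a time.perf_counter() check now breaks out once more than max_time seconds have elapsed, exploitability is estimated every exploit_freq iterations (previously hard-coded to every iteration) over exploit_iters sampled games, and gen_data additionally returns the mixed strategy sigma. A minimal usage sketch, not part of the commit; the import statements are assumed, while the constructor names and values come from main_FSP.py below:

# Illustrative only: constructing FSP with the new timer options.
import game
import learners
import FSP

KP_game = game.Kuhn_Poker_int_io()
RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6,
                                     extra_samples=20) for p in range(2)]
SL_learners = [learners.count_based_SL((6, 2)) for p in range(2)]
agents = [learners.complete_learner(RL_learners[p], SL_learners[p]) for p in range(2)]

worker = FSP.FSP(KP_game, agents,
                 max_iters=200000,   # hard cap on FSP iterations
                 max_time=60,        # wall-clock budget in seconds, checked each iteration
                 m=30,               # best-response games generated per iteration
                 n=10,               # mixed-strategy games generated per iteration
                 exploit_iters=100,  # games per exploitability estimate (the default)
                 exploit_freq=10)    # estimate exploitability every 10 iterations (the default)
pi, exploitability, data = worker.run_algo()

With max_time=60 the run stops after roughly a minute even though max_iters is far larger, which is the point of the new timer.
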
21 changes: 20 additions & 1 deletion Kuhn_poker/learners.py
@@ -92,13 +92,17 @@ def learn(self):

class actor_critic(RL_base):

def __init__(self, pol_func, advantage_func, num_actions, num_states, init_adv = 1, extra_samples=10, init_lr=0.05, df=1.0):
def __init__(self, pol_func, advantage_func, num_actions, num_states, init_adv = 0, extra_samples=10, init_lr=0.05, df=1.0):
self.pol_func = pol_func(num_states, num_actions)
self.advantage_func = advantage_func(init_adv, num_states, num_actions, df)
self.opt_pol = self.pol_func.policy
self.memory = []
super().__init__(extra_samples, init_lr, df)

def reset(self):
self.pol_func.reset()
self.advantage_func.reset()

def learn(self):
self.iteration += 1
lr = self.init_lr/(1+0.003*np.sqrt(self.iteration))
@@ -131,6 +135,9 @@ def grad_log(self, s, a):
def update(self):
raise NotImplementedError

def reset(self):
raise NotImplementedError

class advantage_func_base:

def __init__(self, init_adv, num_states, num_actions):
@@ -141,6 +148,9 @@ def eval(self, s, a, r, s_prime):

def update(self, update, s, a):
raise NotImplementedError

def reset(self):
raise NotImplementedError

class softmax(pol_func_base):

@@ -160,11 +170,16 @@ def grad_log(self, s, a):
def update(self):
self.policy = np.exp(self.thetas)/np.sum(np.exp(self.thetas),axis=1)[:,np.newaxis]
return self.policy

def reset(self):
self.thetas = np.ones_like(self.thetas)
self.update()

class value_advantage(advantage_func_base):

def __init__(self, init_adv, num_states, _, df):
self.V = np.ones(num_states)*init_adv
self.init_adv = init_adv
self.gamma = df

def eval(self, s, a, r, s_prime):
@@ -177,6 +192,9 @@ def eval(self, s, a, r, s_prime):
def update(self, update, s, _):
self.V[s] += update

def reset(self):
self.V = np.ones_like(self.V) * self.init_adv

class complete_learner:

def __init__(self, RL, SL):
@@ -190,6 +208,7 @@ def update_memory(self, data):
self.SL_learner.update_memory(data)

def learn(self):
self.RL_learner.reset()
for i in range(100):
self.beta = self.RL_learner.learn()
self.pi = self.SL_learner.learn()
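
The learners.py additions give each policy and advantage function a reset() hook: softmax.reset() re-initialises its logits to ones (so update() recovers a uniform policy), value_advantage.reset() restores its value table to init_adv (whose default drops from 1 to 0 here), and complete_learner.learn() now calls RL_learner.reset() first, so each best-response computation starts from scratch rather than warm-starting from the previous FSP iteration. A rough illustration of the effect, not part of the commit, assuming it is run from the Kuhn_poker directory:

# Illustrative only: what the new reset() does to an actor_critic learner.
import numpy as np
import learners

ac = learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples=0)
# ...imagine some learning here that moves the logits and the value table...
ac.reset()
# logits are back to ones, so the softmax policy is uniform over the two actions
assert np.allclose(ac.pol_func.policy, 0.5)
# the value table is back at init_adv, which now defaults to 0
assert np.allclose(ac.advantage_func.V, 0.0)
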
32 changes: 23 additions & 9 deletions Kuhn_poker/main.py
@@ -15,24 +15,32 @@ def play_game(players, game):
player = players[game.curr_player]
player.observe(game.observe())
game.action(None)
reward = players[0].buffer[-1]["r"]
reward = players[0].r
for player in players:
player.reset()
return reward

game = Kuhn_Poker_int_io()
num_lvls = 4
games_per_lvl=1000
num_lvls = 1
games_per_lvl=100000
num_players = 2
RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = 0)\
for p in range(num_players)]
fict_game = Fict_Kuhn_int()
#players = [RL(RL_learners[p],p) for p in range(num_players)]
pol = [np.ones((6,2))/2 for i in range(2)]
#pol = [np.array([[0.2045, 0.795],[0.7105,0.2895],[0.6680,0.3320],[0.7231,0.2769],[0.5,0.5],[0.5,0.5]]),\
# np.array([[0.6861, 0.3139],[0.4518,0.5482],[0.4385,0.5615],[0.1512,0.8488],[0.7143,0.2857],[0.5833,0.4167]])]
pol = [np.array([[0.75,0.25],[0.75,0.25],[0.75,0.25],[0.5,0.5],[0.5,0.5],[0.5,0.5]]),\
np.array([[0.67,0.33],[0.69,0.31],[0.71,0.29],[0.19,0.81],[0.77,0.23],[0.79,0.21]])]

players = [OBL(RL_learners[p], p, fict_game) for p in range(num_players)]
#players = [fixed_pol(pol[0]), RL(RL_learners[1],1)]
players = [RL(RL_learners[0],0), fixed_pol(pol[1])]
#players = [OBL(RL_learners[p], p, fict_game) for p in range(num_players)]
for p in range(num_players):
curr_player = players.pop(p)
curr_player.set_other_players(players)
if curr_player.belief is not None:
curr_player.set_other_players(players)
players.insert(p, curr_player)

reward_hist = [[0 for i in range(games_per_lvl)] for lvl in range(num_lvls)]
@@ -43,8 +51,11 @@ def play_game(players, game):
bels = []
for p in players:
pols.append(p.opt_pol)
p.update_belief()
bels.append(p.belief)
if p.belief is not None:
p.update_belief()
bels.append(p.belief)
else:
bels.append(np.zeros((1,1)))
pol_hist.append(pols)
belief_hist.append(bels)
for i in range(games_per_lvl):
@@ -53,8 +64,11 @@ def play_game(players, game):
bels = []
for p in players:
pols.append(p.opt_pol)
p.update_belief()
bels.append(p.belief)
if p.belief is not None:
p.update_belief()
bels.append(p.belief)
else:
bels.append(np.zeros((1,1)))
pol_hist.append(pols)
belief_hist.append(bels)

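main.py now mixes player types that may or may not track a belief: the active configuration pits an RL learner against a fixed_pol opponent, so the set_other_players wiring and the per-game belief logging are guarded with `is not None` checks, and a np.zeros((1,1)) placeholder is stored for players without a belief so the history stays aligned. A condensed sketch of that pattern, not part of the commit (the import of RL and fixed_pol is assumed to match whatever main.py uses; the fixed policy values are arbitrary):

# Illustrative only: guarded belief bookkeeping with mixed player types.
import numpy as np
import learners
from players import RL, fixed_pol

opp_pol = np.ones((6, 2)) / 2   # arbitrary fixed policy over 6 states and 2 actions
learner = learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples=0)
players = [RL(learner, 0), fixed_pol(opp_pol)]

bels = []
for p in players:
    if getattr(p, "belief", None) is not None:   # only belief-tracking players (e.g. OBL) update
        p.update_belief()
        bels.append(p.belief)
    else:
        bels.append(np.zeros((1, 1)))            # placeholder entry for belief-free players
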
18 changes: 16 additions & 2 deletions Kuhn_poker/main_FSP.py
@@ -3,9 +3,23 @@
import matplotlib.pyplot as plt
import learners

#sort of working
extras = 20
num_BR = 30
num_mixed = 10
iters= 200000
time = 60

#test
#extras = 2
#num_BR = 4
#num_mixed = 0
#iters = 100000
#time = 60

KP_game = game.Kuhn_Poker_int_io()

RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = 0)\
RL_learners = [learners.actor_critic(learners.softmax, learners.value_advantage, 2, 6, extra_samples = extras)\
for p in range(2)]
SL_learners = [learners.count_based_SL((6,2)) for p in range(2)]

@@ -40,7 +54,7 @@

agents = [learners.complete_learner(RL_learners[p], SL_learners[p]) for p in range(2)]

worker = FSP.FSP(KP_game, agents, max_iters=20, m=30,n=0)
worker = FSP.FSP(KP_game, agents, max_iters=iters, max_time=time, m=num_BR, n=num_mixed)
pi, exploitability, data = worker.run_algo()
plt.plot(exploitability)
plt.show()
28 changes: 26 additions & 2 deletions Kuhn_poker/players.py
@@ -46,6 +49,9 @@ def action(self):
def get_reward(self, reward):
print("You got " + str(reward) + " coins")




class RL(player):
opt_pol = None

@@ -57,9 +60,9 @@ def __init__(self, learner, player_id):

def reset(self):
self.buffer = []
#self.learner.wipe_memory()
self.learner.wipe_memory()

def observe(self, observation):
def observe(self, observation, fict=False):
self.state = observation[0]
if not fict:
reward = observation[1]
@@ -71,13 +74,33 @@ def observe(self, observation):
else:
self.learner.update_memory([(self.buffer, None)])
self.opt_pol = self.learner.learn()
self.r = observation[1]

def action(self):
probs = self.opt_pol[self.state, :]
act = np.argmax(np.random.multinomial(1, pvals=probs))
self.buffer[-1]["a"] = act
return act

class fixed_pol(player):
opt_pol = None

def __init__(self, opt_pol):
self.opt_pol = opt_pol

def reset(self):
pass

def observe(self, observation):
self.state = observation[0]
self.r = observation[1]

def action(self):
probs = self.opt_pol[self.state, :]
act = np.argmax(np.random.multinomial(1, pvals=probs))
return act


class OBL(RL):
belief = 0

@@ -92,6 +115,7 @@ def set_other_players(self, other_players):

def observe(self, observation, fict=False):
self.state = observation[0]
self.r = observation[1]
if not fict:
if self.state != -1:
belief_probs = self.belief[self.state, :]
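players.py gains a fixed_pol player that simply samples from a supplied policy table, and both RL and OBL now record the latest observed reward in self.r, which is what main.py's play_game reads instead of digging into the buffer. A minimal illustration of fixed_pol on its own, not part of the commit (observations are assumed to be (state, reward) pairs, as in RL.observe):

# Illustrative only: the new fixed_pol player sampling from a fixed table.
import numpy as np
from players import fixed_pol

pol = np.array([[0.75, 0.25]] * 3 + [[0.5, 0.5]] * 3)   # arbitrary 6-state, 2-action policy
p = fixed_pol(pol)
p.observe((0, 0))   # state 0, reward 0
a = p.action()      # samples action 0 with prob 0.75, action 1 with prob 0.25
print(a, p.r)       # p.r holds the last observed reward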
