From 37a0d2da0fe63373e8aab4ee0350969b1015730e Mon Sep 17 00:00:00 2001 From: Cam Allen Date: Fri, 18 Jun 2021 16:42:52 -0400 Subject: [PATCH] Clean up archived code for release --- archive/factops/.gitignore | 17 -- archive/factops/factops/__init__.py | 0 archive/factops/factops/cgrid.py | 94 ------- archive/factops/factops/qlearningagent.py | 61 ---- archive/factops/factops/subgoalgrid.py | 124 -------- archive/factops/notebooks/__init__.py | 0 archive/factops/notebooks/icf.py | 265 ------------------ archive/factops/notebooks/train_skill.py | 171 ----------- archive/factops/requirements.txt | 36 --- archive/learners/__init__.py | 0 archive/learners/actorcritic.py | 82 ------ archive/learners/classifier.py | 45 --- archive/learners/common.py | 12 - archive/learners/diayn.py | 50 ---- archive/learners/test_actorcritic.py | 117 -------- archive/learners/test_classifier.py | 63 ----- archive/learners/test_vic.py | 104 ------- archive/learners/vic.py | 139 --------- archive/simple_rl_env.py | 52 ---- experiments/cube/archive/apply_steps.py | 74 ----- .../archive/cube_investigate_dead_ends.py | 46 --- experiments/cube/archive/generate_skills.py | 133 --------- .../cube/archive/option_vs_action_effects.py | 71 ----- .../cube/archive/solve_with_actions.py | 75 ----- .../cube/archive/solve_with_options.py | 84 ------ 25 files changed, 1915 deletions(-) delete mode 100644 archive/factops/.gitignore delete mode 100644 archive/factops/factops/__init__.py delete mode 100644 archive/factops/factops/cgrid.py delete mode 100644 archive/factops/factops/qlearningagent.py delete mode 100644 archive/factops/factops/subgoalgrid.py delete mode 100644 archive/factops/notebooks/__init__.py delete mode 100644 archive/factops/notebooks/icf.py delete mode 100644 archive/factops/notebooks/train_skill.py delete mode 100644 archive/factops/requirements.txt delete mode 100644 archive/learners/__init__.py delete mode 100644 archive/learners/actorcritic.py delete mode 100644 archive/learners/classifier.py delete mode 100644 archive/learners/common.py delete mode 100644 archive/learners/diayn.py delete mode 100644 archive/learners/test_actorcritic.py delete mode 100644 archive/learners/test_classifier.py delete mode 100644 archive/learners/test_vic.py delete mode 100644 archive/learners/vic.py delete mode 100644 archive/simple_rl_env.py delete mode 100644 experiments/cube/archive/apply_steps.py delete mode 100644 experiments/cube/archive/cube_investigate_dead_ends.py delete mode 100644 experiments/cube/archive/generate_skills.py delete mode 100644 experiments/cube/archive/option_vs_action_effects.py delete mode 100644 experiments/cube/archive/solve_with_actions.py delete mode 100644 experiments/cube/archive/solve_with_options.py diff --git a/archive/factops/.gitignore b/archive/factops/.gitignore deleted file mode 100644 index c4b5b43..0000000 --- a/archive/factops/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -**.idea/ -**.ipynb_checkpoints/ -data/ -env/ -images/ -cluster/logs/ -cluster/scripts/ -logs/ -models/ -notebooks/unversioned/* -videos/ -results/ -scores/ -**__pycache__ -**.pyc -**.DS_Store - diff --git a/archive/factops/factops/__init__.py b/archive/factops/factops/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/archive/factops/factops/cgrid.py b/archive/factops/factops/cgrid.py deleted file mode 100644 index 08fa706..0000000 --- a/archive/factops/factops/cgrid.py +++ /dev/null @@ -1,94 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns -import time - -class CGridWorld: - '''Continuous gridworld domain - ''' - def __init__(self, n_dims=2, discrete_actions=False): - if discrete_actions: - self.n_actions = 9 - else: - self.n_actions = n_dims - self.discrete_actions = discrete_actions - self.n_states = n_dims - self.state = self.random_state() - - def reset(self): - self.state = self.random_state() - - def random_state(self): - return np.random.uniform(0, 1, size=self.n_states) - - def random_action(self): - if self.discrete_actions: - a = np.random.randint(self.n_actions) - else: - a = np.random.uniform(-0.1,0.1, size=self.n_actions) - return a - - def discrete2continuous(self, a): - assert np.all(a >= 0) and np.all(a < self.n_actions) - ax = a % 3 - 1 - ay = -1*(a // 3 - 1) - return 0.1*np.stack([ax, ay]).transpose() - - def step(self, action): - if self.discrete_actions: - action = self.discrete2continuous(action) - assert len(action)==self.n_states - self.state += action + np.random.normal(0, 0.01, size=self.n_states) - self.state = np.clip(self.state,0,1) - - s = self.get_state() - r = 0 - done = False - return s, r, done - - def get_state(self): - return np.copy(self.state) - - def plot(self, ax=None): - n_subplots = self.n_states//2 + 3 - if ax is None: - _, ax = plt.subplots(nrows=1, ncols=(self.n_states//2), figsize=(4,4)) - for i in range(self.n_states//2): - ax.set_xlim([0,1]) - ax.set_ylim([0,1]) - ax.scatter(self.state[2*i],self.state[2*i+1]) - ax.set_xticks([]) - ax.set_yticks([]) - # ax.set_xlabel('XY({})'.format(i)) - return ax - -def run_agent(env, n_trials=1, n_samples=100, video=False): - if video: - ax = env.plot() - fig = plt.gcf() - fig.show() - states = [env.get_state()] - actions = [] - for trial in range(n_trials): - for sample in range(n_samples): - a = env.random_action() - _, _, done = env.step(a) - actions.append(a) - states.append(env.get_state()) - - if video: - ax.clear() - env.plot(ax) - fig.canvas.draw() - fig.canvas.flush_events() - - if done: - time.sleep(1) - env.reset() - break - return np.stack(states,axis=0), np.stack(actions,axis=0) - -#%% -if __name__ == '__main__': - env = CGridWorld() - run_agent(env, n_samples=100, video=True) diff --git a/archive/factops/factops/qlearningagent.py b/archive/factops/factops/qlearningagent.py deleted file mode 100644 index 4badb88..0000000 --- a/archive/factops/factops/qlearningagent.py +++ /dev/null @@ -1,61 +0,0 @@ -from collections import defaultdict -import numpy as np - -class QLearningAgent(): - def __init__(self, n_actions, lr=0.01, epsilon=0.1, gamma=0.99): - self.n_actions = n_actions - self.lr = lr - self.epsilon = epsilon - self.n_steps_init = 2000 - self.decay_period = 8000 - self.gamma = gamma - self.reset() - - def reset(self): - self.n_training_steps = 0 - self.default_q = 0.0 - self.q_table = defaultdict(lambda : defaultdict(lambda: self.default_q)) - - def get_epsilon(self): - alpha = (self.n_training_steps - self.n_steps_init)/self.decay_period - alpha = np.clip(alpha, 0, 1) - return self.epsilon*alpha + 1*(1-alpha) - - def act(self, state): - # Epsilon-greedy selection w.r.t. valid actions/skills - if (self.n_training_steps < self.n_steps_init - or np.random.uniform() < self.get_epsilon()): - action = np.random.randint(0, self.n_actions) - else: - action = self.greedy_policy(state) - return action - - def greedy_policy(self, state): - if type(state) is np.ndarray and state.ndim > 1: - result = np.asarray([self.greedy_policy(s) for s in state]) - else: - result = max(range(self.n_actions), key=lambda a: self.Q(state, a)) - return result - - def Q(self, state, action): - if type(state) is np.ndarray and state.ndim > 1: - result = np.asarray([self.Q(s,a) for s in state]) - else: - result = self.q_table[tuple(state)][action] - return result - - def v(self, state): - if type(state) is np.ndarray and state.ndim > 1: - result = np.asarray([self.v(s) for s in state]) - else: - result = max([self.Q(state, a) for a in range(self.n_actions)]) - return result - - def train(self, s, a, r, sp, done): - self.n_training_steps += 1 - s = tuple(s) - sp = tuple(sp) - max_q_next = self.v(sp) - q_sa = self.Q(s, a) - bootstrap = 0 if done else self.gamma * max_q_next - self.q_table[s][a] = (1-self.lr) * q_sa + self.lr * (r + bootstrap) diff --git a/archive/factops/factops/subgoalgrid.py b/archive/factops/factops/subgoalgrid.py deleted file mode 100644 index 8dfa503..0000000 --- a/archive/factops/factops/subgoalgrid.py +++ /dev/null @@ -1,124 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt - -from gridworlds.domain.gridworld.gridworld import GridWorld, DiagGridWorld -from gridworlds.domain.gridworld.objects.depot import Depot - -from . import cgrid - -class SubgoalGridWorld(DiagGridWorld): - def __init__(self, rows=10, cols=10, goal=None, idx=None, penalty=None, obstacles=0): - super().__init__(rows, cols) - if goal is None or idx is None: - self.set_goal(*self.random_goal()) - else: - self.set_goal(idx, goal) - self.penalty = penalty - for o in range(obstacles): - self.add_wall() - self.reset() - - def random_goal(self): - idx = np.random.randint(2) - N = self._rows if idx == 0 else self._cols - goal = np.random.randint(0, N) - return idx, goal - - def random_action(self): - return np.random.randint(self.n_actions) - - def discrete2continuous(self, a): - assert np.all(a >= 0) and np.all(a < self.n_actions) - ax = np.zeros(len(a)) - ay = np.zeros(len(a)) - ax += (a % 2 - 0.5) * (a < 2) - ay += -1*(a % 2 - 0.5) * (a >= 2) - return 0.1*np.stack([ax, ay]).transpose() - - def set_goal(self, idx, goal): - if self.goals: - self.goals = [] - goal_size = self._cols if idx == 0 else self._rows - for i in range(goal_size): - position = i*np.ones(2) - position[idx] = goal - d = Depot(position, color='red') - self.goals.append(d) - self.idx, self.goal = idx, goal - - def reset(self): - self.reset_agent() - self.s0 = self.get_state() - - def step(self, a): - s = self.get_state() - sp, _, done = super().step(a) - mask = np.arange(2) != self.idx - r = -1 - if self.penalty == 'stepwise': - r -= np.sum(np.abs(sp[mask] - s[mask])) - elif self.penalty == 'start': - r -= np.sum(np.abs(sp[mask] - self.s0[mask])) - return sp, r, done - -class SubgoalCGridWorld(cgrid.CGridWorld): - def __init__(self, goal=None, idx=None, tol=0.1, discrete_actions=False, penalty=None): - super().__init__(discrete_actions=discrete_actions) - if goal is None or idx is None: - self.set_goal(*self.random_goal()) - self.tol = tol - self.reset() - - def random_goal(self): - idx = np.random.randint(0, self.n_states) - goal = np.random.uniform(0, 1) - return idx, goal - - def set_goal(self, idx, goal): - self.idx, self.goal = idx, goal - - def reset(self): - self.state = self.random_state() - at = lambda x, y: np.abs(x-y) < self.tol - while (self.goal is not None) and at(self.state[self.idx], self.goal): - self.state = self.random_state() - self.s0 = self.get_state() - assert not at(self.state[self.idx], self.goal) - - def step(self, a): - s = self.get_state() - sp, _, _ = super().step(a) - mask = np.arange(len(sp)) != self.idx - r = -1 - if self.penalty == 'stepwise': - r -= np.sum(np.abs(sp[mask] - s[mask])) - elif self.penalty == 'start': - r -= np.sum(np.abs(sp[mask] - self.s0[mask])) - done = ( np.abs(sp[self.idx] - self.goal) < self.tol ) - return sp, r, done - - def plot(self, ax=None): - ax = super().plot(ax) - line_positions = [self.goal-self.tol, self.goal+self.tol] - if self.idx % 2 == 0: - ax.vlines(line_positions, 0, 1, 'r') - else: - ax.hlines(line_positions, 0, 1, 'r') - return ax - -def main(): - env = SubgoalGridWorld() - env.plot() -#%% - a = 2 - env.step(a) - env.discrete2continuous(a) -#%% - #0 LEFT - #1 RIGHT - #2 UP - #3 DOWN - cgrid.run_agent(env, n_trials=5, n_samples=100, video=True) - -if __name__ == '__main__': - main() diff --git a/archive/factops/notebooks/__init__.py b/archive/factops/notebooks/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/archive/factops/notebooks/icf.py b/archive/factops/notebooks/icf.py deleted file mode 100644 index a7cc91d..0000000 --- a/archive/factops/notebooks/icf.py +++ /dev/null @@ -1,265 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import os -import seaborn as sns -import sys -import torch -import torch.nn -import torch.nn.functional as F -from tqdm import tqdm - -from gridworlds.nn.nnutils import Network, Reshape, extract -from gridworlds.utils import reset_seeds -import gridworlds.sensors as sensors - -#%% -# seed = 10# good -seed = 1 -reset_seeds(seed) - -n_zdim = 4 - -class Squares: - def __init__(self, nsquares=1, size=12, side=2): - self.nsquares = nsquares - self.size = size # size of the observation - self.side = side # size of squares - self.reset() - - def reset(self): - self.state = [np.random.randint(0,self.size-self.side,2) for i in range(self.nsquares)] - - @property - def nactions(self): - # 0=right, 1=left, 2=up, 3=down - return 4 - - def get_state(self): - return np.copy(self.state) - - def observe(self, s): - x = np.zeros([self.size]*2, 'float32') - for i in range(self.nsquares): - x[s[i][0]:s[i][0]+self.side, s[i][1]:s[i][1]+self.side] = 1 - return x - - def step(self, action): - delta = [(1,0),(-1,0),(0,1),(0,-1)] - self.state = [p+delta[action[i]] for i,p in enumerate(self.state)] - self.state = [np.minimum(np.maximum(p,0),self.size-self.side) for p in self.state] - r = 0 - done = False - return (self.state, r, done) - - def genRandomSample(self): - """ - get a random (s,a,s') transition from the environment (assuming a uniform policy) - - returns (state, action, next state) - """ - self.reset() - - s0 = self.state - x0 = self.observe(s0) - - action = np.random.randint(0,self.nactions,self.nsquares) - self.step(action) - - s1 = self.state - x1 = self.observe(s1) - - return (x0.flatten(), action, x1.flatten(), s0, s1) - -class ICFNet(Network): - def __init__(self, input_shape=(12,12), n_hidden=32, n_zdim=4, n_actions=4): - super().__init__() - n_inputs = np.prod(input_shape) - self.conv1 = torch.nn.Conv2d( 1,16,kernel_size=3,stride=2,padding=1) - self.conv2 = torch.nn.Conv2d(16,16,kernel_size=3,stride=2,padding=1) - self.flatten = Reshape(-1, 16*3*3) - self.fc1 = torch.nn.Linear(16*3*3, n_hidden) - self.fc2 = torch.nn.Linear(n_hidden, n_zdim) - self.fc_pi = torch.nn.Linear(n_hidden, n_zdim*n_actions) - self.regroup = Reshape(-1, n_zdim, 4) - - self.dec_fc2 = torch.nn.Linear(n_zdim, n_hidden) - self.dec_fc1 = torch.nn.Linear(n_hidden, 16*3*3) - self.unflatten = Reshape(-1, 16, 3, 3) - self.deconv2 = torch.nn.ConvTranspose2d(16,16,kernel_size=3,stride=2,padding=1,output_padding=1) - self.deconv1 = torch.nn.ConvTranspose2d(16, 1,kernel_size=3,stride=2,padding=1,output_padding=1) - - def encode(self, x): - tmp = x - tmp = self.conv1(tmp) - tmp = F.relu(tmp) - tmp = self.conv2(tmp) - tmp = F.relu(tmp) - tmp = self.flatten(tmp) - tmp = self.fc1(tmp) - tmp = F.relu(tmp) - prev = tmp - tmp = self.fc2(tmp) - z = torch.tanh(tmp) - - tmp = self.fc_pi(prev) - tmp = self.regroup(tmp) - pi = F.softmax(tmp, dim=-1) - return z, pi - - def decode(self, z): - tmp = z - tmp = self.dec_fc2(tmp) - tmp = F.relu(tmp) - tmp = self.dec_fc1(tmp) - tmp = F.relu(tmp) - tmp = self.unflatten(tmp) - tmp = self.deconv2(tmp) - tmp = F.relu(tmp) - tmp = self.deconv1(tmp) - x_hat = torch.tanh(tmp) - return x_hat - - def forward(self, x): - z, pi = self.encode(x) - x = self.decode(z) - return x, pi, z - pass - -class ICFTrainer(): - def __init__(self, model, beta=0.1, lr=0.0005): - self.model = model - self.beta = beta - self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr) - - def reconstruction_loss(self, x, x_hat): - return 0.5 * F.mse_loss(input=x_hat, target=x) - - def selectivity(self, z, zp): - return (zp - z) / (1e-4 + torch.sum(F.relu(zp-z), dim=-1, keepdim=True)) - - def train_batch(self, batch_s, batch_a, batch_sp): - self.model.train() - self.optimizer.zero_grad() - - s_hat, pi, z = self.model(batch_s) - sp_hat, pi_p, zp = self.model(batch_sp) - - L_ae = self.reconstruction_loss(batch_s, s_hat) - - Pr_a_acted = extract(pi, idx=batch_a, idx_dim=-1) - sel_acted = self.selectivity(z, zp) - L_sel = torch.mean(torch.sum(Pr_a_acted * sel_acted, dim=1), dim=0) - loss = L_ae - self.beta * L_sel - - loss.backward() - self.optimizer.step() - return loss, L_ae, L_sel - pass - -#%% -env = Squares() -sensor = sensors.SensorChain([ - env, - sensors.TorchSensor(), - sensors.UnsqueezeSensor(0), -]) - -plt.imshow(sensor.observe(env.get_state())[0]) -plt.show() - -#%% -replay_s = [] -replay_a = [] -replay_sp = [] -replay_x = [] -replay_y = [] -for t in tqdm(range(40000)): - env.reset() - s = sensor.observe(env.get_state()) - a = np.random.randint(4, size=env.nsquares) - env.step(a) - x,y = env.get_state()[0] - sp = sensor.observe(env.get_state()) - replay_x.append(x) - replay_y.append(y) - replay_s.append(s) - replay_a.append(a) - replay_sp.append(sp) -replay_x = torch.as_tensor(replay_x) -replay_y = torch.as_tensor(replay_y) -replay_s = torch.stack(replay_s) -replay_a = torch.as_tensor(replay_a) -replay_sp = torch.stack(replay_sp) - -#%% -net = ICFNet(n_zdim=n_zdim) -trainer = ICFTrainer(net) - -batch_size = 64 -losses = [] -L_aes = [] -L_sels = [] -for i in tqdm(range(500*20)): - batch_idx = np.random.choice(np.arange(len(replay_a)), batch_size) - batch_s = replay_s[batch_idx] - batch_a = replay_a[batch_idx] - batch_sp = replay_sp[batch_idx] - loss, L_ae, L_sel = trainer.train_batch(batch_s, batch_a, batch_sp) - losses.append(loss) - L_aes.append(L_ae) - L_sels.append(L_sel) - -#%% -os.makedirs('results/seed_{}'.format(seed), exist_ok=True) - -s_recons, pi, z = net(batch_s) -fig, ax = plt.subplots(3,2, figsize=(4,6)) -ax = ax.flatten() -[a.axis('off') for a in ax] -ax[0].set_title('observed state') -ax[1].set_title('reconstruction') -ax[0].imshow(batch_s[0][0].detach().numpy(),vmin=0,vmax=1) -ax[1].imshow(s_recons[0][0].detach().numpy(),vmin=0,vmax=1) -ax[2].imshow(batch_s[15][0].detach().numpy(),vmin=0,vmax=1) -ax[3].imshow(s_recons[15][0].detach().numpy(),vmin=0,vmax=1) -ax[4].imshow(batch_s[31][0].detach().numpy(),vmin=0,vmax=1) -ax[5].imshow(s_recons[31][0].detach().numpy(),vmin=0,vmax=1) -plt.savefig('results/seed_{}/reconstruction.png'.format(seed)) -plt.show() - -#%% -plt.plot(losses) -plt.title('Loss vs Time') -plt.xlabel('Minibatches') -plt.savefig('results/seed_{}/loss.png'.format(seed)) -plt.show() - -#%% - -pi = net(replay_s)[1].detach().numpy() -plt.imshow(np.mean(pi,0), vmin=0, vmax=1) -plt.xticks(range(4),['right','left','up','down']) -plt.yticks(range(n_zdim)) -plt.xlabel(r'$a$') -plt.ylabel(r'$k$') -plt.colorbar() -plt.title(r'$E_s[\pi_k(a|s)]$') -plt.savefig('results/seed_{}/policy.png'.format(seed)) -plt.show() - -z = net.encode(replay_s)[0].detach().numpy() -zp = net.encode(replay_sp)[0].detach().numpy() -a = replay_a.detach().numpy() - -r = np.zeros((n_zdim,2)) -for k in range(n_zdim): - r[k,0] = np.corrcoef(x=replay_x,y=z[:,k])[1][0] - r[k,1] = np.corrcoef(x=replay_y,y=z[:,k])[1][0] -plt.imshow(r,cmap='coolwarm', vmin=-1, vmax=1) -plt.xticks([0,1],['x','y']) -plt.yticks(range(n_zdim)) -plt.ylabel(r'$f_k$') -plt.title('Correlation with true state') -plt.colorbar() -plt.savefig('results/seed_{}/factors.png'.format(seed)) -plt.show() diff --git a/archive/factops/notebooks/train_skill.py b/archive/factops/notebooks/train_skill.py deleted file mode 100644 index 3769ad5..0000000 --- a/archive/factops/notebooks/train_skill.py +++ /dev/null @@ -1,171 +0,0 @@ -import json -import matplotlib.pyplot as plt -import numpy as np -import os -import seaborn as sns -import sys -from tqdm import tqdm - -from gridworlds.domain.gridworld.grid import action_meanings -from gridworlds.domain.gridworld.gridworld import GridWorld, DiagGridWorld -from factops.subgoalgrid import SubgoalGridWorld, SubgoalCGridWorld -from gridworlds.utils import reset_seeds -from gridworlds.agents.dqnagent import DQNAgent -from factops.qlearningagent import QLearningAgent -from gridworlds.nn.nullabstraction import NullAbstraction -from gridworlds.sensors import OffsetSensor - -#%% -class args: pass -args.seed = 0 -args.n_trials = 1 -args.n_episodes = 10000 -args.max_steps = 100 -args.video = True -args.subgoal = True -args.continuous = False -# args.penalty = None -# args.penalty = 'stepwise' -args.penalty = 'start' -args.obstacles = 5 - -reset_seeds(args.seed) - -if args.continuous: - env = SubgoalCGridWorld(discrete_actions=True) - xy_offset = 0 - flipxy = False - xy_lim = 1 -else: - if args.subgoal: - env = SubgoalGridWorld(10,10, penalty=args.penalty, obstacles=args.obstacles) - else: - env = DiagGridWorld(10, 10) - env.discrete2continuous = lambda a: np.asarray(list(map(lambda i: env.action_map[i], a.tolist())))*np.asarray([-1,1]) - env.reset_goals(1) - flipxy = True - xy_offset = 0.5 - xy_lim = 9 - - -# agent = DQNAgent(2, env.n_actions, NullAbstraction(-1, 2), n_hidden_layers=2, lr=0.001) -agent = QLearningAgent(env.n_actions, lr=0.1) - -if args.video: - fig, ax = plt.subplots(2,2) - ax = ax.flatten() - fig.show() - - def plot_value_function(ax): - n_bins = 10 - x, y = np.meshgrid(np.linspace(0,xy_lim,n_bins), np.linspace(0,xy_lim,n_bins)) - if flipxy: - x, y = y, x - s = np.stack([np.asarray(x), np.asarray(y)],axis=-1) - v = agent.v(s).reshape(n_bins,n_bins) - v.shape - if flipxy: - x, y = y, x - ax.contourf(x+xy_offset, y+xy_offset, v, vmin=-10, vmax=0) - - def plot_policy(ax): - n_bins = 10 - x, y = np.meshgrid(np.linspace(0,xy_lim,n_bins), np.linspace(0,xy_lim,n_bins)) - if flipxy: - x, y = y, x - s = np.stack([np.asarray(x), np.asarray(y)],axis=-1) - # s = np.concatenate([x, y], axis=-1) - a = agent.greedy_policy(s).reshape(-1) - dir = env.discrete2continuous(a) - # dir = list(map(lambda x: action_meanings[tuple(x)], dir.tolist())) - dir = np.asarray(dir).reshape(n_bins, n_bins, 2) - dir_x = dir[:,:,1] - dir_y = dir[:,:,0] - - if flipxy: - x, y = y, x - ax.quiver(x+xy_offset, y+xy_offset, dir_x, dir_y) - - def plot_states(ax): - data = pd.DataFrame(agent.replay.memory) - data[['x.r','x.c']] = pd.DataFrame(data['x'].tolist(), index=data.index) - data[['xp.r','xp.c']] = pd.DataFrame(data['xp'].tolist(), index=data.index) - sns.scatterplot(data=data, x='x.c',y='x.r', hue='done', style='done', markers=True, size='done', size_order=[1,0], ax=ax, alpha=0.3, legend=False) - ax.invert_yaxis() - -for trial in tqdm(range(args.n_trials), desc='trials'): - # env.set_goal(*env.random_goal()) - agent.reset() - total_reward = 0 - total_steps = 0 - losses = [] - rewards = [] - value_fn = [] - for episode in tqdm(range(args.n_episodes), desc='episodes'): - env.reset() - ep_rewards = [] - for step in range(args.max_steps): - s = env.get_state() - a = agent.act(s) - sp, r, done = env.step(a) - ep_rewards.append(r) - if args.video: - value_fn.append(agent.v(s)) - total_reward += r - - loss = agent.train(s, a, r, sp, done) - losses.append(loss) - - if done: - break - rewards.append(sum(ep_rewards)) - - if args.video and episode % 500 == 0: - [a.clear() for a in ax] - plot_value_function(ax[0]) - env.plot(ax[0]) - ax[0].set_title('V(s)') - env.plot(ax[1]) - plot_policy(ax[1]) - ax[1].set_title('Policy') - ax[2].plot(rewards, c='C3') - ax[2].set_title('Rewards') - ax[3].plot(value_fn) - ax[3].set_title('V(s) vs time') - # plot_states(ax[3]) - # ax[1].set_ylim([-10,0]) - fig.canvas.draw() - fig.canvas.flush_events() - - total_steps += step - # score_info = { - # 'trial': trial, - # 'episode': episode, - # 'reward': sum(ep_rewards), - # 'total_reward': total_reward, - # 'total_steps': total_steps, - # 'steps': step - # } - # json_str = json.dumps(score_info) - # log.write(json_str+'\n') - # log.flush() -print('\n\n') - -fig, ax = plt.subplots(2,2) -ax = ax.flatten() -[a.clear() for a in ax] -plot_value_function(ax[0]) -env.plot(ax[0]) -ax[0].set_title('V(s)') -env.plot(ax[1]) -plot_policy(ax[1]) -ax[1].set_title('Policy') -ax[2].plot(rewards, c='C3') -ax[2].set_title('Rewards') -ax[3].plot(value_fn) -ax[3].set_title('V(s) vs time') - -# mode_str = '_penalty_' if args.continuous else '' -results_dir = 'results/discrete-obstacles/tabular-q' -os.makedirs(results_dir, exist_ok=True) -plt.savefig(results_dir+'/train_{}_penalty_{}.png'.format(args.seed, str(args.penalty).lower())) diff --git a/archive/factops/requirements.txt b/archive/factops/requirements.txt deleted file mode 100644 index fd7c8d6..0000000 --- a/archive/factops/requirements.txt +++ /dev/null @@ -1,36 +0,0 @@ -appnope==0.1.0 -backcall==0.1.0 -cycler==0.10.0 -decorator==4.3.0 -imageio==2.4.1 -ipykernel==5.1.0 -ipython==7.2.0 -ipython-genutils==0.2.0 -jedi==0.13.2 -jupyter-client==5.2.4 -jupyter-core==4.4.0 -kiwisolver==1.0.1 -matplotlib==3.0.2 -numpy==1.15.4 -pandas==0.24.1 -parso==0.3.1 -pexpect==4.6.0 -pickleshare==0.7.5 -Pillow==5.4.1 -prompt-toolkit==2.0.7 -ptyprocess==0.6.0 -Pygments==2.3.1 -pyparsing==2.3.0 -python-dateutil==2.7.5 -pytz==2018.9 -pyzmq==17.1.2 -scikit-learn==0.20.3 -scipy==1.2.0 -seaborn==0.9.0 -six==1.12.0 -sklearn==0.0 -torch==1.0.0 -tornado==5.1.1 -tqdm==4.29.1 -traitlets==4.3.2 -wcwidth==0.1.7 diff --git a/archive/learners/__init__.py b/archive/learners/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/archive/learners/actorcritic.py b/archive/learners/actorcritic.py deleted file mode 100644 index 06d815c..0000000 --- a/archive/learners/actorcritic.py +++ /dev/null @@ -1,82 +0,0 @@ -from collections import namedtuple -import torch -import torch.nn as nn - -Experience = namedtuple('Experience',['s','a','r','done','sp']) - -class Network(nn.Module): - def __init__(self, ndim_s, n_actions, n_units=32): - super().__init__() - self.ndim_s = ndim_s - self.n_actions = n_actions - self.phi = nn.Sequential(*[ - nn.Linear(ndim_s, n_units), nn.ReLU(), - nn.Linear(n_units, n_units), nn.ReLU(), - ]) - self.policy_head = nn.Linear(n_units, n_actions) - self.q_head = nn.Linear(n_units, n_actions) - - def forward(self, s): - features = self.phi(s) - pi = torch.distributions.Categorical(logits=self.policy_head(features)) - q = self.q_head(features) - v = torch.sum(pi.probs*q, dim=-1) - return v, q, pi - - def action_distr(self, s): - with torch.no_grad(): - pi = self(s)[2] - return pi - -class Trainer: - def __init__(self, net, discount, lr=1e-3): - self.net = net - self.discount = torch.as_tensor(discount) - self.actor_optimizer = torch.optim.Adam(net.parameters(), lr=lr) - self.critic_optimizer = torch.optim.Adam(net.parameters(), lr=lr) - - def critic_loss(self, batch, mode='td'): - a_onehot = nn.functional.one_hot(batch.a, self.net.n_actions) - - v_sp = self.net(batch.sp)[0] - q_s = self.net(batch.s)[1] - q_s_acted = q_s * a_onehot.float() - - with torch.no_grad(): - if mode in ['mc','montecarlo']: - v_s_target = batch.r - elif mode == 'td': - v_s_target = (batch.r + (1-batch.done).float() * self.discount * v_sp) - else: - assert mode in ['mc','montecarlo', 'td'], 'Invalid mode' - q_s_target = q_s_acted.clone() - idx = torch.arange(q_s_target.shape[0], dtype=torch.int64) - q_s_target[idx, batch.a] = v_s_target - - return nn.functional.mse_loss(input=q_s_acted, target=q_s_target) - - def actor_loss(self, batch): - v, q, pi = self.net(batch.s) - - idx = torch.arange(q.shape[0], dtype=torch.int64) - q_acted = q[idx, batch.a] - adv = q_acted - v - - log_p = pi.log_prob(batch.a) - return torch.mean(-log_p * adv, dim=0) - - def train_model(self, batch, optimizer, loss_fn): - optimizer.zero_grad() - loss = loss_fn(batch) - loss.backward() - optimizer.step() - return loss.detach() - - def train(self, batch, critic_mode='td'): - critic_loss_fn = lambda x: self.critic_loss(x, mode=critic_mode) - critic_loss = 0 - for _ in range(10): - critic_loss += self.train_model(batch, optimizer=self.critic_optimizer, loss_fn=critic_loss_fn) - critic_loss /= 10 - actor_loss = self.train_model(batch, optimizer=self.actor_optimizer, loss_fn=self.actor_loss) - return critic_loss, actor_loss diff --git a/archive/learners/classifier.py b/archive/learners/classifier.py deleted file mode 100644 index c5afee4..0000000 --- a/archive/learners/classifier.py +++ /dev/null @@ -1,45 +0,0 @@ -from collections import namedtuple -import torch -import torch.nn as nn - -Sample = namedtuple('Sample',['x','y']) - -class Network(nn.Module): - def __init__(self, ndim_x, ndim_y, n_units=32): - super().__init__() - self.ndim_x = ndim_x - self.ndim_y = ndim_y - self.layers = nn.Sequential(*[ - nn.Linear(ndim_x, n_units), nn.ReLU(), - nn.Linear(n_units, n_units), nn.ReLU(), - nn.Linear(n_units, ndim_y) - ]) - - def forward(self, x): - logits = self.layers(x) - predictions = torch.distributions.Categorical(logits=logits) - return predictions - -class Trainer: - def __init__(self, net, lr=1e-3): - self.net = net - self.optimizer = torch.optim.Adam(net.parameters(), lr=lr) - - def classification_loss(self, batch): - logits = self.net(batch.x).logits - loss = nn.functional.cross_entropy(logits, batch.y) - return loss - - def accuracy(self, batch): - with torch.no_grad(): - logits = self.net(batch.x).logits - pred = logits.argmax(dim=1, keepdim=True) # get the index of the max log-probability - correct = pred.eq(batch.y.view_as(pred)).sum().item() - return correct / len(pred) - - def train(self, batch): - self.optimizer.zero_grad() - loss = self.classification_loss(batch) - loss.backward() - self.optimizer.step() - return loss.detach() diff --git a/archive/learners/common.py b/archive/learners/common.py deleted file mode 100644 index 5b2ba0b..0000000 --- a/archive/learners/common.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch -import numpy -import random - -def reset_seeds(seed=0): - random.seed(seed) - numpy.random.seed(seed) - torch.manual_seed(seed) - -def get_batch(replay): - batch = type(replay[0])(*map(lambda x: torch.stack(x, dim=0), zip(*replay))) - return batch diff --git a/archive/learners/diayn.py b/archive/learners/diayn.py deleted file mode 100644 index 0fd1670..0000000 --- a/archive/learners/diayn.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -from tqdm import tqdm - -from learners.vic import VIC -from learners.common import get_batch -from learners import classifier - -class DIAYN(VIC): - def train(self, n_episodes): - discrim_losses = [] - critic_losses = [] - actor_losses = [] - for episode in tqdm(range(n_episodes)): - self.env.reset() - s0 = self.get_env_state() - - # Sample Ω ~ p^C(Ω|s0) - skill = self.skill_prior.sample() - - # Follow policy π(a|Ω,s) till termination state s_f - s_f, trajectory = self.run_skill(skill) - - # Regress q(Ω|s0,s_f) towards Ω - batch = get_batch([classifier.Sample((s-s0).detach(), skill) for s in trajectory.sp]) - loss = self.discriminator.train(batch) - discrim_losses.append(loss.item()) - - # Calculate intrinsic rewards r_I (and update trajectory experiences) - r_I = self.get_intrinsic_reward(s0, torch.stack(trajectory.sp), skill) - r_If = self.get_intrinsic_reward(s0, s_f, skill) - trajectory.r.clear() - trajectory.r.extend(r_I) - done = trajectory.done[-1] - if not done: - trajectory.s.append(s_f) - trajectory.a.append(torch.tensor(self.terminate_action)) - trajectory.r.append(r_If) - trajectory.done.append(torch.tensor(True)) - trajectory.sp.append(s_f) - experiences = self._get_experiences(trajectory, skill) - - # Use an RL algorithm update for π(a|Ω,s) to maximize r_I - critic_loss,actor_loss = self.skill_policies[skill].train(get_batch(experiences), critic_mode='montecarlo') - critic_losses.append(critic_loss.item()) - actor_losses.append(actor_loss.item()) - - # Reinforce option prior p^C(Ω|s0) based on r_I - # TODO - - return discrim_losses, critic_losses, actor_losses diff --git a/archive/learners/test_actorcritic.py b/archive/learners/test_actorcritic.py deleted file mode 100644 index b98b73e..0000000 --- a/archive/learners/test_actorcritic.py +++ /dev/null @@ -1,117 +0,0 @@ -import gym -import matplotlib.pyplot as plt -import torch -from tqdm import tqdm - -from learners.actorcritic import Network, Trainer, Experience -from learners.common import get_batch - -def test_network_single(): - ndim_s = 48 - n_actions = 12 - n = Network(ndim_s, n_actions) - state = torch.rand((ndim_s,),dtype=torch.float32) - action = n.action_distr(state).sample() - value, q_value, pi = n(state) - assert value.shape == torch.Size([]) - assert q_value.shape == torch.Size([n_actions]) - assert pi.param_shape == torch.Size([n_actions]) - -def test_network_batch(): - batch_size = 100 - ndim_s = 48 - n_actions = 12 - n = Network(ndim_s, n_actions) - states = torch.rand((batch_size,ndim_s),dtype=torch.float32) - values, q_values, pi = n(states) - assert values.shape == torch.Size([batch_size]) - assert q_values.shape == torch.Size([batch_size,n_actions]) - assert pi.param_shape == torch.Size([batch_size,n_actions]) - -def test_train_critic(): - batch_size = 100 - ndim_s = 48 - n_actions = 12 - n = Network(ndim_s, n_actions) - t = Trainer(n, discount=0.9) - - s = torch.rand((batch_size, ndim_s), dtype=torch.float32) - a = torch.randint(0, n_actions, size=(batch_size,)) - r = torch.rand((batch_size,), dtype=torch.float32) - done = torch.randint(0, 2, size=(batch_size,)) - sp = torch.rand((batch_size, ndim_s), dtype=torch.float32) - batch = Experience(s,a,r,done,sp) - - losses = [] - n_updates = 200 - for _ in range(n_updates): - losses.append(t.train_model(batch, t.critic_optimizer, t.critic_loss).item()) - assert losses[n_updates//2] < losses[0], '{} >= {}'.format(losses[n_updates//2], losses[0]) - assert losses[-1] < losses[n_updates//2], '{} >= {}'.format(losses[-1], losses[n_updates//2]) - -def test_train_actor(): - batch_size = 100 - ndim_s = 48 - n_actions = 12 - n = Network(ndim_s, n_actions) - t = Trainer(n, discount=0.9) - - s = torch.rand((batch_size, ndim_s), dtype=torch.float32) - a = torch.randint(0,n_actions,size=(batch_size,)) - r = torch.rand((batch_size,), dtype=torch.float32) - done = torch.randint(0,2,size=(batch_size,)) - sp = torch.rand((batch_size, ndim_s), dtype=torch.float32) - batch = Experience(s,a,r,done,sp) - - losses = [] - n_updates = 200 - for _ in range(n_updates): - losses.append(t.train_model(batch, t.actor_optimizer, t.actor_loss).item()) - assert losses[n_updates//2] < losses[0], '{} >= {}'.format(losses[n_updates//2], losses[0]) - assert losses[-1] < losses[n_updates//2], '{} >= {}'.format(losses[-1], losses[n_updates//2]) - -def test_rl(): - batch_size = 100 - env = gym.make('CartPole-v0') - ndim_s = 4 - n_actions = 2 - net = Network(ndim_s, n_actions) - trainer = Trainer(net, discount=0.99) - - ep_rewards = [] - replay = [] - for episode in tqdm(range(1000)): - state = torch.as_tensor(env.reset(), dtype=torch.float32) - ep_reward = 0 - - for t in range(1000): - action = net.action_distr(state).sample() - next_state, reward, done, _ = env.step(action.item()) - ep_reward += reward - next_state = torch.as_tensor(next_state, dtype=torch.float32) - reward = torch.as_tensor(reward, dtype=torch.float32) - done = torch.as_tensor(done, dtype=torch.int64) - - experience = Experience(state, action, reward, done, next_state) - replay.append(experience) - state = next_state - - if len(replay) >= batch_size: - batch = get_batch(replay) - trainer.train(batch) - replay = [] - - if done: - break - - ep_rewards.append(ep_reward) - fig, ax = plt.subplots() - ax.plot(ep_rewards) - plt.show() - -if __name__ == '__main__': - test_network_single() - test_network_batch() - test_train_critic() - test_train_actor() - test_rl() diff --git a/archive/learners/test_classifier.py b/archive/learners/test_classifier.py deleted file mode 100644 index 1c373fb..0000000 --- a/archive/learners/test_classifier.py +++ /dev/null @@ -1,63 +0,0 @@ -import matplotlib.pyplot as plt -import torch -import torchvision -from torchvision import datasets, transforms -from tqdm import tqdm - -from learners.classifier import Network, Trainer, Sample -from learners.common import get_batch - -def test_network_single(): - ndim_x = 28*28 - n_actions = 10 - net = Network(ndim_x, n_actions) - x = torch.rand((ndim_x,),dtype=torch.float32) - y_preds = net(x) - assert y_preds.param_shape == torch.Size([n_actions]) - -def test_network_batch(): - batch_size = 100 - ndim_x = 28*28 - n_actions = 10 - net = Network(ndim_x, n_actions) - x = torch.rand((batch_size,ndim_x),dtype=torch.float32) - y_preds = net(x) - assert y_preds.param_shape == torch.Size([batch_size, n_actions]) - -def get_batch(replay): - batch = Experience(*map(lambda x: torch.stack(x, dim=0), zip(*replay))) - return batch - -def test_train_mnist(): - batch_size = 100 - epochs = 1 - ndim_x = 28*28 - n_actions = 10 - - train_loader = torch.utils.data.DataLoader( - datasets.MNIST('./data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=batch_size, shuffle=True) - - net = Network(ndim_x, n_actions) - t = Trainer(net) - - accuracies = [] - for epoch in range(1, epochs + 1): - for batch_idx, (data, target) in enumerate(tqdm(train_loader)): - batch = Sample(x=torch.flatten(data, 1), y=target) - t.train(batch) - accuracy = t.accuracy(batch) - accuracies.append(accuracy) - - fig, ax = plt.subplots() - ax.plot(accuracies) - plt.show() - -if __name__ == '__main__': - test_network_single() - test_network_batch() - test_train_mnist() diff --git a/archive/learners/test_vic.py b/archive/learners/test_vic.py deleted file mode 100644 index 36f2a6f..0000000 --- a/archive/learners/test_vic.py +++ /dev/null @@ -1,104 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import torch - -from learners.vic import VIC -from learners.diayn import DIAYN -from learners.common import reset_seeds - -def get_policy(alg, mdp, skill): - policy = np.empty((mdp.width+1, mdp.height+1), dtype=np.object) - for x in range(1,mdp.width+1): - for y in range(1,mdp.height+1): - if not env.mdp.is_wall(x,y): - s = torch.tensor((x,y)).float() - action = torch.argmax(alg.action_distr(s, torch.tensor(skill)).logits).item() - action_word = dict(enumerate(alg.env.action_map)).get(action, 'term') - policy[x,y] = action_word - return policy - -plt.figure() - -#%% -reset_seeds(0) -from simple_rl.tasks import FourRoomMDP -from notebooks.simple_rl_env import SimpleGymEnv -env = SimpleGymEnv(FourRoomMDP(12,12,goal_locs=[(12,12)])) -env.render() -s0 = torch.as_tensor(env.reset(), dtype=torch.float32) -ndim_s = len(env.observation_space) -n_actions = env.action_space.n -n_skills = 40 -gamma = 0.99 -max_steps_per_skill = 10 -n_units = 32 -lr = 1e-3 -alg = VIC -# alg = DIAYN -alg = alg(env, ndim_s, n_actions, n_skills, gamma, max_steps_per_skill, n_units, lr) -start_policies = [get_policy(alg, env.mdp, s) for s in range(n_skills)] -#%% -discrim_losses, critic_losses, actor_losses = alg.train(20000) - -#%% -fig, ax = plt.subplots(2,2) -ax = ax.flatten() -ax[0].plot(discrim_losses) -ax[0].set_title('discrim_losses') -ax[1].plot(critic_losses) -ax[1].set_title('critic_losses') -ax[2].plot(actor_losses) -ax[2].set_title('actor_losses') -ax[3].axis('off') -fig.tight_layout() -fig.show() - -#%% -# final_policies = [get_policy(alg, env.mdp, s) for s in range(n_skills)] -# -# for skill in range(n_skills): -# # env.render(start_policies[skill]) -# env.render(final_policies[skill]) -# print() - -#%% -ncols = 3#int(np.floor(np.sqrt(n_skills))) -nrows = int(np.ceil(n_skills/ncols)) -fig, ax = plt.subplots(nrows, ncols, figsize=(3*ncols,3*nrows)) -ax = ax.flatten() -n_samples = 200 -for skill in range(n_skills): - final_states = np.zeros((env.mdp.width+1, env.mdp.height+1)) - for i in range(n_samples): - alg.env.reset() - (x,y), _ = alg.run_skill(torch.tensor(skill)) - final_states[x.int(),y.int()]+=1 - ax[skill].imshow(final_states, cmap='hot', interpolation='nearest', vmax=n_samples) - ax[skill].set_xlim([0.5, env.mdp.width+0.5]) - ax[skill].set_ylim([0.5, env.mdp.height+0.5]) - ax[skill].set_xticks(range(1,env.mdp.width+1,2)) - ax[skill].set_yticks(range(1,env.mdp.height+1,2)) - ax[skill].invert_yaxis() -[a.axis('off') for a in ax[skill+1:]] -fig.tight_layout() -plt.savefig('{}-40sk-40k.png'.format(alg.__class__.__name__)) -plt.show() - -#%% -fig, ax = plt.subplots( figsize=(9,9)) -n_samples = 200 -final_states = np.zeros((env.mdp.width+1, env.mdp.height+1)) -for skill in range(n_skills): - for i in range(n_samples): - alg.env.reset() - (x,y), _ = alg.run_skill(torch.tensor(skill)) - final_states[x.int(),y.int()]+=1 -ax.imshow(final_states, cmap='hot', interpolation='nearest', vmax=n_samples) -ax.set_xlim([0.5, env.mdp.width+0.5]) -ax.set_ylim([0.5, env.mdp.height+0.5]) -ax.set_xticks(range(1,env.mdp.width+1,2)) -ax.set_yticks(range(1,env.mdp.height+1,2)) -ax.invert_yaxis() -fig.tight_layout() -# plt.savefig('{}-40sk-40k-all.png'.format(alg.__class__.__name__)) -plt.show() diff --git a/archive/learners/vic.py b/archive/learners/vic.py deleted file mode 100644 index 4a435e7..0000000 --- a/archive/learners/vic.py +++ /dev/null @@ -1,139 +0,0 @@ -# Variational Intrinsic Control -# https://arxiv.org/pdf/1611.07507.pdf -from collections import namedtuple -import matplotlib.pyplot as plt -import numpy as np -import torch -import torch.nn as nn -from tqdm import tqdm - -from learners import actorcritic as a2c -from learners import classifier -from learners.common import get_batch - -class VIC: - def __init__(self, env, ndim_s, n_actions, n_skills, discount, max_timesteps=6, n_units=32, lr=1e-3): - self.env = env - self.n_actions = n_actions - self.terminate_action = n_actions - self.n_skills = n_skills - self.discount = discount - self.max_timesteps = max_timesteps - self.skill_policies = [None]*n_skills - for i in range(n_skills): - net = a2c.Network(ndim_s, n_actions+1, n_units) - self.skill_policies[i] = a2c.Trainer(net, discount, lr=lr) - - discrim_net = classifier.Network(ndim_s, n_skills, n_units) - self.discriminator = classifier.Trainer(discrim_net, lr=lr) - - self.skill_prior = torch.distributions.Categorical(logits=torch.ones(n_skills)) - - def get_intrinsic_reward(self, s0, sf, skill): - with torch.no_grad(): - ds = sf - s0 - reward = self.discriminator.net(ds).log_prob(skill) - self.skill_prior.log_prob(skill) - return reward - - def _get_experiences(self, trajectory, skill): - s,a,rewards,dones,sp = trajectory - # - # skill_1hot = torch.nn.functional.one_hot(skill, self.n_skills).float() - # s = [torch.cat((x, skill_1hot)) for x in s] - # sp = [torch.cat((x, skill_1hot)) for x in sp] - - returns = [] - bootstrap = 0 - for reward, done in zip(rewards[::-1], dones[::-1]): - bootstrap = reward + (self.discount*bootstrap if not done else 0) - returns.insert(0, bootstrap) - - discounted_trajectory = a2c.Experience(s,a,returns,dones,sp) - experiences = list(map(lambda x: a2c.Experience(*x),list(zip(*discounted_trajectory)))) - return experiences - - def get_env_state(self): - return torch.as_tensor(tuple(self.env.state), dtype=torch.float32) - - def action_distr(self, state, skill, primitives_only=False): - distr = self.skill_policies[skill].net.action_distr(state) - if primitives_only: - primitive_logits = distr.logits[:-1] - distr = torch.distributions.Categorical(logits=primitive_logits) - return distr - - def run_skill(self, skill): - states = [] - actions = [] - intrinsic_rewards = [] - extrinsic_rewards = [] - dones = [] - - done = False - timestep = 0 - while True: - state = self.get_env_state() - states.append(state) - - is_first_timestep = (timestep==0) - action = self.action_distr(state, skill, primitives_only=is_first_timestep).sample() - if done or action == self.terminate_action or timestep >= self.max_timesteps: - break - - next_state, extrinsic_reward, done, _ = self.env.step(action) - timestep += 1 - - actions.append(action) - intrinsic_rewards.append(torch.tensor(0)) - extrinsic_rewards.append(torch.tensor(extrinsic_reward)) - dones.append(torch.tensor(done)) - if done: - self.env.reset() - - final_state = states[-1] - next_states = states[1:] - states = states[:-1] - # rewards = list(zip(intrinsic_rewards, extrinsic_rewards)) - rewards = intrinsic_rewards - trajectory = a2c.Experience(states, actions, rewards, dones, next_states) - return final_state, trajectory - - def train(self, n_episodes): - discrim_losses = [] - critic_losses = [] - actor_losses = [] - for episode in tqdm(range(n_episodes)): - self.env.reset() - s0 = self.get_env_state() - - # Sample Ω ~ p^C(Ω|s0) - skill = self.skill_prior.sample() - - # Follow policy π(a|Ω,s) till termination state s_f - s_f, trajectory = self.run_skill(skill) - - # Regress q(Ω|s0,s_f) towards Ω - batch = get_batch([classifier.Sample((s_f-s0).detach(), skill)]) - loss = self.discriminator.train(batch) - discrim_losses.append(loss.item()) - - # Calculate intrinsic reward r_I (and update trajectory experiences) - r_I = self.get_intrinsic_reward(s0, s_f, skill) - done = trajectory.done[-1] - if not done: - trajectory.s.append(s_f) - trajectory.a.append(torch.tensor(self.terminate_action)) - trajectory.r.append(r_I) - trajectory.done.append(torch.tensor(True)) - trajectory.sp.append(s_f) - experiences = self._get_experiences(trajectory, skill) - - # Use an RL algorithm update for π(a|Ω,s) to maximize r_I - critic_loss,actor_loss = self.skill_policies[skill].train(get_batch(experiences), critic_mode='montecarlo') - critic_losses.append(critic_loss.item()) - actor_losses.append(actor_loss.item()) - - # Reinforce option prior p^C(Ω|s0) based on r_I - # TODO - - return discrim_losses, critic_losses, actor_losses diff --git a/archive/simple_rl_env.py b/archive/simple_rl_env.py deleted file mode 100644 index e7989af..0000000 --- a/archive/simple_rl_env.py +++ /dev/null @@ -1,52 +0,0 @@ -import gym -from gym.spaces import Discrete, Tuple -import numpy as np - -class SimpleGymEnv(gym.Env): - def __init__(self, mdp): - self.mdp = mdp - self.state = mdp.init_state - self.observation_space = Tuple((Discrete(mdp.width),Discrete(mdp.height))) - self.action_space = Discrete(len(mdp.ACTIONS)) - self.action_map = mdp.ACTIONS - super().__init__() - - def step(self, action): - action = self.action_map[action] - next_state = self.mdp.transition_func(self.state, action) - reward = self.mdp.reward_func(self.state, action, next_state) - done = self.mdp.is_goal_state(next_state) - self.state = next_state - return np.asarray(next_state), reward, done, None - - def reset(self): - self.state = self.mdp.init_state - return np.asarray(self.state) - - def render(self, policy=None): - grid = np.zeros((self.mdp.width+2, self.mdp.height+2)) - grid[0,:] = 1 - grid[-1,:] = 1 - grid[:,0] = 1 - grid[:,-1] = 1 - for x,y in self.mdp.walls: - grid[x, y] = 1 - - for y, column in enumerate(grid): - for x, cell in enumerate(column): - if grid[x,y]: - print('#',end='') - elif policy is not None: - action_arrow = {'up': '˄', 'down': '˅', 'left': '˂', 'right': '˃', 'term': '•'}[policy[x,y]] - print(action_arrow, end='') - else: - print('O' if tuple(self.state)==(x,y) else '.',end='') - print() - - def close(self): - pass -# -# from simple_rl.tasks import FourRoomMDP -# env = SimpleGymEnv(FourRoomMDP(12,12,goal_locs=[(12,12)])) -# env.reset() -# env.render() diff --git a/experiments/cube/archive/apply_steps.py b/experiments/cube/archive/apply_steps.py deleted file mode 100644 index ca95f75..0000000 --- a/experiments/cube/archive/apply_steps.py +++ /dev/null @@ -1,74 +0,0 @@ -import copy -import random -from tqdm import tqdm - -from cube import cube -from cube import skills -from cube.options import expert -from cube import pattern - -c = cube.Cube() -c.apply(pattern.scramble1) -c.render() - -#%% - -mods = c.summarize_effects() - -action_steps = ["L'", 'B', 'R', 'R', "B'", "D'", "B'", "F'", 'R', 'F', 'L', "D'", "D'", "L'"] -n_action_steps = len(action_steps) -action_experiences = 1359208 - -option_steps = [ - ['F', 'U', "B'", "U'", "L'", "B'", 'L', "F'", "L'", 'B', 'L', 'U', 'B', "U'"], - ["B'", 'L', 'B', "R'", "B'", "L'", 'B', 'R'], - ["F'", 'L', 'F', "R'", "F'", "L'", 'F', 'R'], - ["L'", 'R', 'F', "L'", 'R', 'U', "L'", 'R', 'B', "L'", 'R', 'D', 'D', "L'", 'R', 'F', "L'", 'R', 'U', "L'", 'R', 'B', "L'", 'R'], - ["L'", 'B', "L'", "B'", "L'", "B'", "L'", 'B', 'L', 'B', 'L', 'L'], - ["D'", "D'", "L'", "D'", "L'", 'D', 'L', 'D', 'L', 'D', "L'", 'D'], - ['F', "B'", "U'", 'F', "B'", "R'", 'F', "B'", "D'", 'F', "B'", "L'", "L'", 'F', "B'", "U'", 'F', "B'", "R'", 'F', "B'", "D'", 'F', "B'"], - ['D', 'F', "U'", "F'", "L'", "U'", 'L', "D'", "L'", 'U', 'L', 'F', 'U', "F'"], - ['U', 'R', "D'", "R'", "U'", 'R', 'D', "R'"], - ["L'", "F'", 'R', 'F', 'D', 'R', "D'", 'L', 'D', "R'", "D'", "F'", "R'", 'F'], - ['D', "R'", 'D', 'R', 'D', 'R', 'D', "R'", "D'", "R'", "D'", "D'"], - ['F', "B'", "L'", 'F', "B'", "U'", 'F', "B'", "R'", 'F', "B'", "D'", "D'", 'F', "B'", "L'", 'F', "B'", "U'", 'F', "B'", "R'", 'F', "B'"], - ["D'", 'U', 'F', "D'", 'U', 'L', "D'", 'U', 'B', "D'", 'U', 'R', 'R', "D'", 'U', 'F', "D'", 'U', 'L', "D'", 'U', 'B', "D'", 'U'], - ['F', 'F', 'R', 'F', 'R', "F'", "R'", "F'", "R'", "F'", 'R', "F'"], - ["B'", 'F', 'U', 'U', "F'", 'B', 'L', 'L'], - ["R'", 'U', "R'", "U'", "R'", "U'", "R'", 'U', 'R', 'U', 'R', 'R'], - ["U'", "U'", "B'", "U'", "B'", 'U', 'B', 'U', 'B', 'U', "B'", 'U'], - ['R', "L'", "D'", "D'", 'L', "R'", "B'", "B'"], - ['B', 'B', 'U', 'B', 'U', "B'", "U'", "B'", "U'", "B'", 'U', "B'"], - ["R'", 'L', 'F', 'F', "L'", 'R', 'U', 'U'], - ['U', "L'", "U'", 'R', 'U', 'L', "U'", "R'"], - ["U'", "B'", 'D', 'B', 'U', "B'", "D'", 'B'] -] -option_steps = [ - ["B'", 'L', 'D', "B'", "B'", 'F', "R'", 'D', 'D', 'L', 'D', "R'"], - ['L', 'B', "U'", "L'", 'F', 'R', 'F', "L'", 'U', "D'"], - ['R', "L'", 'B', "D'", "F'", "D'", 'B', 'L', "U'", "B'"], - ["R'", 'L', "U'", 'F', 'D', 'F', "U'", "L'", 'B', 'U'], - ["U'", "U'", 'B', "F'", "F'", 'U', 'R', 'D', 'R', "D'", "B'", "U'", 'B', "D'", 'R', 'R', "U'", 'D', 'D', "U'"], - ['F', 'F', "D'", 'U', 'U', "F'", "R'", "B'", "R'", 'B', 'D', 'F', "D'", 'B', "R'", "R'", 'F', "B'", "B'", 'F'], - ['R', 'R', "U'", 'D', 'D', "R'", "F'", "L'", "F'", 'L', 'U', 'R', "U'", 'L', "F'", "F'", 'R', "L'", "L'", 'R'], - ['L', "R'", "R'", 'L', "F'", "F'", 'R', "U'", 'L', 'U', 'R', "F'", "R'", "F'", "L'", 'D', 'D', "U'", 'L', 'L'], - ['R', "L'", 'B', "D'", "F'", "D'", 'B', 'L', "U'", "B'"], - ["B'", "R'", 'D', 'B', "L'", "F'", "L'", 'B', "D'", 'U'] -] - - -n_option_steps = sum([len(o) for o in option_steps]) -option_experiences = 700368 - -a_cube = copy.deepcopy(c) -for step in action_steps: - a_cube.apply([step]) -a_cube.render() -print(n_action_steps) - -#%% -o_cube = copy.deepcopy(c) -for step in option_steps: - o_cube.apply(step) -o_cube.render() -print(n_option_steps) diff --git a/experiments/cube/archive/cube_investigate_dead_ends.py b/experiments/cube/archive/cube_investigate_dead_ends.py deleted file mode 100644 index 818457e..0000000 --- a/experiments/cube/archive/cube_investigate_dead_ends.py +++ /dev/null @@ -1,46 +0,0 @@ -import copy -import glob -import matplotlib.pyplot as plt -import numpy as np -import os -import pandas as pd -import pickle -import seaborn as sns - -from cube import cube -from cube import pattern -from cube import formula -from cube import skills - -results_dir = 'results/cube_deadends/' -cube_files = sorted(glob.glob(results_dir+'*.pickle')) -for f in cube_files: - seed = int(f.split('/')[-1].split('.')[-2].split('-')[-1]) - with open(f, 'rb') as f: - cubefail = pickle.load(f) - if seed in [8]:#[1,8,14,35,53,76,100]: - print(seed) - cubefail.render() - break - pass - -#%% -cube_mod = copy.deepcopy(cubefail) -rot = formula.rotate - -cube_mod.apply("R R".split()) -cube_mod.apply(rot(rot("F F R' F' U' F' U F R F' U U F U U F' U'".split(),cube.Face.L,2),cube.Face.D,2)) -cube_mod.apply("R R".split()) - -# cube_mod.apply("D' F".split()) -# cube_mod.apply(rot("F F R' F' U' F' U F R F' U U F U U F' U'".split(),cube.Face.U)) -# cube_mod.apply(formula.inverse(rot(skills.orient_2_corners,cube.Face.U))) -# cube_mod.apply("F' D".split()) - -cube_mod.render() - -#%% -newcube = cube.Cube() -newcube.apply(rot(rot("F F R' F' U' F' U F R F' U U F U U F' U'".split(),cube.Face.L,0),cube.Face.D,0)) -newcube.render() -# len(newcube.summarize_effects()) diff --git a/experiments/cube/archive/generate_skills.py b/experiments/cube/archive/generate_skills.py deleted file mode 100644 index c1eb9f1..0000000 --- a/experiments/cube/archive/generate_skills.py +++ /dev/null @@ -1,133 +0,0 @@ -import copy -import matplotlib.pyplot as plt -import random -from tqdm import tqdm -from collections import namedtuple - -from cube import cube -from cube import formula -from cube import skills -from cube import options -from cube import pattern - -c = cube.Cube() -c.render() - -Skill = namedtuple('Skill',['seq','mdl']) - -def combine_skills(skills, depth, prefix=None): - assert depth > 0, 'Depth must be > 0' - # skills = [s for s in random.sample(skills, len(skills))] - if depth==1: - seqs = [s.seq if prefix==None else formula.simplify(prefix.seq+s.seq) for s in skills] - mdls = [s.mdl if prefix==None else cube.combine_swaps(prefix.mdl, s.mdl) for s in skills] - return [Skill(s,tuple(m)) for (s, m) in zip(seqs, mdls) if s !=[] and m != []] - else: - new_prefix_seqs = [s.seq if prefix==None else formula.simplify(prefix.seq+s.seq) for s in skills] - new_prefix_mdls = [s.mdl if prefix==None else cube.combine_swaps(prefix.mdl,s.mdl) for s in skills] - new_prefixes = [Skill(s,tuple(m)) for (s,m) in list(zip(new_prefix_seqs, new_prefix_mdls)) if s !=[] and m != []] - combo_skills = [combine_skills(skills, depth-1, prefix=p) for p in tqdm(new_prefixes)] - result = [skill for entry in combo_skills for skill in entry] - return result - -skills = [Skill([a],tuple(cube.Cube().apply([a]).summarize_effects())) for a in cube.actions] -cached_effects = set([s.mdl for s in skills]) -min_effect_size = 17 -depth = 2 -#%% -combos = combine_skills(skills, depth) -n_changes = [len(skill.mdl) for skill in combos] -rankings = sorted([(n, skill) for (n, skill) in list(zip(n_changes, combos)) if n > 0 and n <= min_effect_size]) -for n, skill in rankings: - if skill.mdl not in cached_effects: - cached_effects.add(skill.mdl) - skills.append(skill) - min_effect_size = min(n, min_effect_size) - -#%% -for l in range(20): - print('{}:'.format(l), len([s for s in skills if len(s.seq) == l])) -print('total:',len(skills)) -sorted([tuple(s.seq) for s in skills]) - -for l in range(17): - print(l,len([s for s in skills if len(s.mdl) == 8 and len(s.seq)==l])) - -sorted([s for s in skills if len(s.mdl) == 8 and len(s.seq)==16]) - -y = [len(s.mdl) for s in skills] -plt.scatter(x,y) - -#%% -from cube import cube -from cube import skills as random_skills - -fig, ax = plt.subplots(figsize=(10,6)) -effects = [] -lengths = [] -short_skills = [] -for length in tqdm(range(1,25)): - n_trials = 100 - effect = 0 - for trial in range(n_trials): - d = cube.Cube() - f = random_skills.random_skill(length) - d.apply(f) - effect = len(d.summarize_effects()) - if effect < 20: - short_skills.append(f) - lengths.append(len(f)) - effects.append(effect) -lengths = [l-0.1 for l in lengths] - -plt.scatter(lengths, effects, marker='o', label='Random') - -effects = [] -lengths = [] -for prefix_length in tqdm(range(1,9)): - for body_length in range(1,9): - n_trials = 100 - effect = 0 - for trial in range(n_trials): - d = cube.Cube() - f = random_skills.random_conjugate(prefix_length, body_length) - d.apply(f) - effect = len(d.summarize_effects()) - lengths.append(len(f)) - effects.append(effect) -plt.scatter(lengths, effects, marker='^', label='Conjugates') - -effects = [] -lengths = [] -for x_length in tqdm(range(1,7)): - for y_length in range(1,7): - n_trials = 100 - effect = 0 - L = len(random_skills.random_commutator(x_length, y_length)) - for trial in range(n_trials): - d = cube.Cube() - f = random_skills.random_commutator(x_length, y_length) - d.apply(f) - effect = len(d.summarize_effects()) - lengths.append(len(f)) - effects.append(effect) -lengths = [l+0.1 for l in lengths] -plt.scatter(lengths, effects, marker='d', label='Commutators') - -x = [len(s.seq)+0.2 for s in skills] -y = [len(s.mdl) for s in skills] -plt.scatter(x,y, marker='o', facecolors='none', edgecolors='C4', label='Generated') - -from cube.options import expert -x = [len(o) for o in expert.options] -y = [len(m) for m in expert.models] -plt.scatter(x, y, marker='x', c='C3', label='Expert skills') - -plt.hlines(48, 0, 25, linestyles='dotted') -plt.legend(loc='lower left') -plt.title('Number of squares modified by skills') -plt.xlabel('Effective number of steps per skill') -plt.ylim([0,50]) -plt.xlim([0,25]) -plt.xticks(range(1,25)) -plt.show() diff --git a/experiments/cube/archive/option_vs_action_effects.py b/experiments/cube/archive/option_vs_action_effects.py deleted file mode 100644 index f5fd0c2..0000000 --- a/experiments/cube/archive/option_vs_action_effects.py +++ /dev/null @@ -1,71 +0,0 @@ -import random -import matplotlib.pyplot as plt -from tqdm import tqdm - -from cube import cube -from cube import formula -from cube import skills -from cube.options import expert - -def random_action_skill(length=3): - f = [random.choice(list(cube.Action.keys())) for _ in range(length)] - f = formula.simplify(f) - return f - -def random_option_skill(length=3): - idx_sequence = [random.choice(range(len(expert.options))) for _ in range(length)] - o_seq = [expert.options[idx] for idx in idx_sequence] - m_seq = [expert.models[idx] for idx in idx_sequence] - return o_seq, m_seq - -def main(count_type=count_type): - assert count_type in ['decisions', 'actions'] - - effects = [] - lengths = [] - for length in tqdm(range(1,41)): - n_trials = 100 - effect = 0 - for trial in range(n_trials): - d = cube.Cube() - f = random_action_skill(length) - d.apply(f) - effect += len(d.summarize_effects()) - lengths.append(length) - effects.append(effect/n_trials) - lengths = [l-0.1 for l in lengths] - plt.scatter(lengths, effects, marker='^', label='Primitive Actions') - - effects = [] - lengths = [] - for length in tqdm(range(1,41)): - n_trials = 50 - effect = 0 - n_actions = 0 - for trial in range(n_trials): - d = cube.Cube() - o_seq, m_seq = random_option_skill(length) - for o,m in zip(o_seq, m_seq): - n_actions += len(o) - d.apply(swap_list=m) - effect += len(d.summarize_effects()) - if count_type == 'actions': - lengths.append(n_actions/n_trials)# count actions - else: - lengths.append(length)# count options - effects.append(effect/n_trials) - lengths = [l for l in lengths] - plt.scatter(lengths, effects, marker='o', label='Options') - - x_max = 40 if count_type == 'decisions' else 500 - plt.hlines(48, 0, x_max, linestyles='dotted') - plt.legend(loc='lower right') - plt.title('Average number of squares modified by sequence') - plt.xlabel('Number of {} per sequence'.format(count_type)) - plt.ylim([0,50]) - if count_type == 'actions': - plt.xlim([0,500]) - plt.show() - -if __name__ == '__main__': - main(count_type='decisions') diff --git a/experiments/cube/archive/solve_with_actions.py b/experiments/cube/archive/solve_with_actions.py deleted file mode 100644 index 7b2c2d5..0000000 --- a/experiments/cube/archive/solve_with_actions.py +++ /dev/null @@ -1,75 +0,0 @@ -import copy -import random -from tqdm import tqdm - -from cube import cube -from cube import formula -from cube import skills -from cube import pattern - -c = cube.Cube() -c.apply(pattern.scramble1) -c.render() - -#%% -mods = c.summarize_effects() -steps = [] -experiences = 0 -tqdm.write('experiences:{}--steps:{}--errors:{}'.format(experiences, len(steps),len(mods))) - -max_depth = 5 -def action_sequences(depth, prefix=None): - assert depth > 0, 'Depth must be > 0' - actions = [a for a in random.sample(cube.actions, len(cube.actions))] - if depth==1: - return [[a] if prefix==None else prefix+[a] for a in actions] - else: - new_prefixes = [[a] if prefix==None else prefix+[a] for a in actions] - result = [] - for p in new_prefixes: - result += action_sequences(depth-1, prefix=p) - return result -action_seq = [None]*max_depth -for i in range(max_depth): - action_seq[i] = action_sequences(i+1) - -for _ in range(100): - good_sequences = [] - improvements = [] - - # Iterative deepening random search - for depth in tqdm(range(max_depth)): - for seq in tqdm(random.sample(action_seq[depth], len(action_seq[depth]))): - c_copy = copy.deepcopy(c) - c_copy.apply(seq) - experiences += len(seq) - resulting_mods = c_copy.summarize_effects() - improvement = len(mods) - len(resulting_mods) - if improvement > 0: - good_sequences.append(seq) - improvements.append(improvement) - if depth >= 3: - break - if improvements != []: - break - else: - continue - if improvements == []: - break - else: - rankings = sorted(list(zip(improvements, good_sequences)), reverse=True) - best_impr = rankings[0][0] - best_seqs = [op for impr, op in rankings if impr == best_impr] - seq = random.choice(best_seqs) - c.apply(seq) - mods = c.summarize_effects() - steps += seq - tqdm.write('experiences:{}--steps:{}--errors:{}'.format(experiences, len(steps),len(mods))) - c.render() - -print() -print() -print() -print('Experiences:', experiences) -print('Steps:', len(steps)) -print(steps) diff --git a/experiments/cube/archive/solve_with_options.py b/experiments/cube/archive/solve_with_options.py deleted file mode 100644 index ace6ca0..0000000 --- a/experiments/cube/archive/solve_with_options.py +++ /dev/null @@ -1,84 +0,0 @@ -import copy -import random -from tqdm import tqdm - -from cube import cube -from cube import formula -from cube import skills -from cube import options -from cube import pattern - -c = cube.Cube() -c.apply(pattern.scramble1) -c.render() - -#%% -mods = c.summarize_effects() -steps = [] -experiences = 0 -tqdm.write('experiences:{}--steps:{}--errors:{}'.format(experiences, len(steps),len(mods))) - -option_set = options.expert - -max_depth = 2 -def option_sequences(depth, prefix=None): - assert depth > 0, 'Depth must be > 0' - options = [o for o in random.sample(option_set.options, len(option_set.options))] - if depth==1: - return [[o] if prefix==None else prefix+[o] for o in options] - else: - new_prefixes = [[o] if prefix==None else prefix+[o] for o in options] - result = [] - for p in new_prefixes: - result += option_sequences(depth-1, prefix=p) - return result -option_seq = [None]*max_depth -for i in range(max_depth): - option_seq[i] = option_sequences(i+1) -option_seq[1][:4] -mdl = {} -for op, m in zip(option_set.options, option_set.models): - mdl[tuple(op)] = m - -for _ in range(100): - good_sequences = [] - improvements = [] - - # Iterative deepening random search - for depth in tqdm(range(max_depth)): - for seq in tqdm(random.sample(option_seq[depth], len(option_seq[depth]))): - c_copy = copy.deepcopy(c) - for op in seq: - c_copy.apply(swap_list=mdl[tuple(op)]) - experiences += 1 - resulting_mods = c_copy.summarize_effects() - improvement = len(mods) - len(resulting_mods) - if improvement > 0: - good_sequences.append(seq) - improvements.append(improvement) - if depth >= 1: - break - if improvements != []: - break - else: - continue - if improvements == []: - break - else: - rankings = sorted(list(zip(improvements, good_sequences)), reverse=True) - best_impr = rankings[0][0] - best_seqs = [op for impr, op in rankings if impr == best_impr] - seq = random.choice(best_seqs) - for op in seq: - c.apply(swap_list=mdl[tuple(op)]) - mods = c.summarize_effects() - steps += seq - tqdm.write('experiences:{}--steps:{}--errors:{}'.format(experiences, len(steps),len(mods))) - c.render() - -print() -print() -print() -print('Experiences:', experiences) -print('Steps:', len(steps)) -print(steps)