Some more progress on lunar lander demo (WiP, needs to be cleaned up before release)
Ariel committed Jan 17, 2024
1 parent f8dc559 commit 758372d
Showing 2 changed files with 119 additions and 46 deletions.
97 changes: 75 additions & 22 deletions examples/demos/active-lunar/lunar-active.py
@@ -25,7 +25,7 @@
from typarse import BaseParser

from cogment_lab.actors import ConstantActor
from cogment_lab.actors.nn_actor import NNActor
from cogment_lab.actors.nn_actor import NNActor, BoltzmannActor
from cogment_lab.envs import AECEnvironment, GymEnvironment
from cogment_lab.process_manager import Cogment
from cogment_lab.utils.runners import process_cleanup
@@ -36,6 +36,8 @@ class Parser(BaseParser):
wandb_project: str = "test"
wandb_name: str = "test"

env_name: str = "LunarLander-v2"

batch_size: int = 128
gamma: float = 0.99
replay_buffer_capacity: int = 50000
@@ -62,50 +64,46 @@ async def main():
wandb.config.replay_buffer_capacity = args.replay_buffer_capacity
wandb.config.learning_rate = args.learning_rate
wandb.config.num_episodes = args.num_episodes
wandb.config.seed = args.seed
wandb.config.env_name = args.env_name

logpath = f"logs/logs-{datetime.datetime.now().isoformat()}"

cog = Cogment(log_dir=logpath)

cenv = AECEnvironment(
env_path="cogment_lab.envs.conversions.teacher.GymTeacherAEC",
make_kwargs={"gym_env_name": "LunarLander-v2", "gym_make_kwargs": {}, "render_mode": "rgb_array"},
make_kwargs={"gym_env_name": args.env_name, "gym_make_kwargs": {}, "render_mode": "rgb_array"},
render=True,
reinitialize=True,
)

await cog.run_env(cenv, "lunar", port=9021)
await cog.run_env(cenv, "lunar", port=9021, log_file="env-aec.log")

obs_len = cenv.env.observation_space("gym").shape[0]

cenv_single = GymEnvironment(env_id="LunarLander-v2", reinitialize=True, render=True)

await cog.run_env(cenv_single, "lunar-single", port=9022)

# Run a passive teacher

passive_action = {"active": 0, "action": 0}

passive_actor = ConstantActor(passive_action)
cenv_single = GymEnvironment(env_id=args.env_name, reinitialize=True, render=True)

await cog.run_actor(passive_actor, "passive", port=9013)
await cog.run_env(cenv_single, "lunar-single", port=9022, log_file="env-gym.log")

# Create and run the learner network

replay_buffer = ReplayBuffer(args.replay_buffer_capacity, obs_len)

# Run the agent
network = FCNetwork(
input_size=obs_len, output_sizes=[cenv.env.action_space("gym").n], hidden_sizes=[256, 256], activation="tanh"
input_size=obs_len, output_sizes=[cenv.env.action_space("gym").n], hidden_sizes=[256, 256], activation="leaky_relu"
)

# actor = BoltzmannActor(network, "cpu")
actor = NNActor(network, "cpu")
optimizer = optim.Adam(network.parameters(), lr=args.learning_rate)

cog.run_local_actor(actor, "dqn", port=9012, log_file="dqn.log")

# Run the human teacher

# # Lunar lander actions
actions = {
"no-op": {"active": 0, "action": 0},
"ArrowDown": {"active": 1, "action": 0},
@@ -114,6 +112,14 @@ async def main():
"ArrowLeft": {"active": 1, "action": 3},
}

# Mountain car actions
# actions = {
# "no-op": {"active": 0, "action": 0},
# "ArrowDown": {"active": 1, "action": 1},
# "ArrowRight": {"active": 1, "action": 2},
# "ArrowLeft": {"active": 1, "action": 0},
# }

await cog.run_web_ui(actions=actions, log_file="human.log", fps=60)

print(f"Launched web UI at http://localhost:8000")
@@ -126,6 +132,8 @@ async def main():
if episode == args.human_episodes:
cog.stop_service("lunar")

# print(f"Starting episode {episode}")

if episode < args.human_episodes:
trial_id = await cog.start_trial(
env_name="lunar",
@@ -137,8 +145,35 @@ async def main():
env_name="lunar-single", actor_impls={"gym": "dqn"}, session_config={"render": True, "seed": episode}
)

trial_data = await cog.get_trial_data(trial_id)
# TODO: print(actor.num_actions)
# print(f"Started trial at episode {episode}")

trial_data_task = asyncio.create_task(cog.get_trial_data(trial_id))

# print(f"Created task at episode {episode}")

gradient_updates = 0

# if len(replay_buffer) > args.batch_size:
# # while not trial_data_task.done():
# for i in range(500):
# batch = replay_buffer.sample(args.batch_size)
#
# loss = dqn_loss(network, batch, args.gamma)
#
# optimizer.zero_grad()
# loss.backward()
# optimizer.step()
#
# gradient_updates += 1
#
# # await asyncio.sleep(0)


trial_data = await trial_data_task


# Logging
dqn_data = trial_data["gym"]

total_reward = dqn_data.rewards.sum()
@@ -147,14 +182,19 @@ async def main():

total_timesteps += len(dqn_data.rewards)

log_dict = {
"episode": episode,
"reward": total_reward,
"ep_length": len(dqn_data.rewards),
"total_timesteps": total_timesteps,
}
# log_dict = {
# "episode": episode,
# "reward": total_reward,
# "ep_length": len(dqn_data.rewards),
# "total_timesteps": total_timesteps,
# "gradient_updates": gradient_updates,
# }
#
# wandb.log(log_dict)

wandb.log(log_dict)
# print(f"Logged at episode {episode}")

# Add data to replay buffer

for t in range(len(dqn_data.done)):
state = dqn_data.observations[t]
@@ -169,6 +209,7 @@ async def main():

action = human_action if human_active == 1 else dqn_action


reward = dqn_data.rewards[t]
next_state = dqn_data.next_observations[t]
done = dqn_data.done[t]
@@ -184,6 +225,18 @@ async def main():
loss.backward()
optimizer.step()

gradient_updates += 1

log_dict = {
"episode": episode,
"reward": total_reward,
"ep_length": len(dqn_data.rewards),
"total_timesteps": total_timesteps,
"gradient_updates": gradient_updates,
}

wandb.log(log_dict)


if __name__ == "__main__":
asyncio.run(main())
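
Note: both scripts import dqn_loss and get_current_eps from shared.py, which is not one of the two files changed in this commit. Below is a minimal sketch of what those helpers might look like, inferred purely from their call sites (dqn_loss(network, batch, args.gamma) and get_current_eps(episode)); the names, signatures, default values, and the absence of a target network are assumptions, not the actual shared.py implementation.

import torch
import torch.nn.functional as F


def dqn_loss(q_network, batch, gamma: float) -> torch.Tensor:
    # Assumes `batch` unpacks into (states, actions, rewards, next_states, dones)
    # tensors and that q_network(states) returns Q-values of shape
    # [batch_size, n_actions]. No target network is used here because none
    # appears in the diff; the real shared.py may differ.
    states, actions, rewards, next_states, dones = batch

    q_sa = q_network(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        next_q = q_network(next_states).max(dim=1).values
        targets = rewards + gamma * (1.0 - dones.float()) * next_q
    return F.smooth_l1_loss(q_sa, targets)


def get_current_eps(episode: int, start: float = 1.0, end: float = 0.05, decay_episodes: int = 200) -> float:
    # Linear epsilon decay for the epsilon-greedy NNActor variant; the actual
    # schedule and constants in shared.py may be different.
    frac = min(episode / decay_episodes, 1.0)
    return start + frac * (end - start)
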
68 changes: 44 additions & 24 deletions examples/demos/active-lunar/lunar-base.py
@@ -20,13 +20,13 @@
import torch
import wandb
from coltra.models import FCNetwork
from torch import optim, nn
from torch import optim
from tqdm import trange

from typarse import BaseParser

from cogment_lab.actors.nn_actor import NNActor
from cogment_lab.envs import GymEnvironment
from cogment_lab.actors import ConstantActor
from cogment_lab.actors.nn_actor import NNActor, BoltzmannActor
from cogment_lab.envs import AECEnvironment, GymEnvironment
from cogment_lab.process_manager import Cogment
from cogment_lab.utils.runners import process_cleanup
from shared import ReplayBuffer, get_current_eps, dqn_loss
@@ -36,14 +36,15 @@ class Parser(BaseParser):
wandb_project: str = "test"
wandb_name: str = "test"

env_name: str = "LunarLander-v2"

batch_size: int = 128
gamma: float = 0.99
replay_buffer_capacity: int = 50000
learning_rate: float = 6.3e-4
num_episodes: int = 500
seed: int = 0


async def main():
args = Parser()

@@ -60,42 +61,67 @@ async def main():
wandb.config.replay_buffer_capacity = args.replay_buffer_capacity
wandb.config.learning_rate = args.learning_rate
wandb.config.num_episodes = args.num_episodes
wandb.config.seed = args.seed
wandb.config.env_name = args.env_name

logpath = f"logs/logs-{datetime.datetime.now().isoformat()}"

cog = Cogment(log_dir=logpath)

cenv = GymEnvironment(env_id="LunarLander-v2", reinitialize=True, render=True)

await cog.run_env(cenv, "lunar", port=9021)
cenv = GymEnvironment(env_id=args.env_name, reinitialize=True, render=True)

obs_len = cenv.env.observation_space.shape[0]

await cog.run_env(cenv, "lunar", port=9021, log_file="env-gym.log")

# Create and run the learner network

replay_buffer = ReplayBuffer(args.replay_buffer_capacity, obs_len)

# Run the agent
network = FCNetwork(
input_size=obs_len, output_sizes=[cenv.env.action_space.n], hidden_sizes=[256, 256], activation="tanh"
input_size=obs_len, output_sizes=[cenv.env.action_space.n], hidden_sizes=[256, 256], activation="leaky_relu"
)

actor = NNActor(network, "cpu")
actor = BoltzmannActor(network, "cpu")
optimizer = optim.Adam(network.parameters(), lr=args.learning_rate)

cog.run_local_actor(actor, "dqn", port=9012)
cog.run_local_actor(actor, "dqn", port=9012, log_file="dqn.log")

total_timesteps = 0

ep_rewards = []

for episode in (pbar := trange(args.num_episodes)):
actor.set_eps(get_current_eps(episode))
for episode in (pbar := trange(1, args.num_episodes)):


trial_id = await cog.start_trial(
env_name="lunar", actor_impls={"gym": "dqn"}, session_config={"render": True, "seed": episode}
)

trial_data = await cog.get_trial_data(trial_id)
trial_data_task = asyncio.create_task(cog.get_trial_data(trial_id))

gradient_updates = 0

if len(replay_buffer) > args.batch_size:
# while not trial_data_task.done():
for i in range(200):
batch = replay_buffer.sample(args.batch_size)

loss = dqn_loss(network, batch, args.gamma)

optimizer.zero_grad()
loss.backward()
optimizer.step()

gradient_updates += 1

# await asyncio.sleep(0)


trial_data = await trial_data_task

# Logging
dqn_data = trial_data["gym"]

total_reward = dqn_data.rewards.sum()
@@ -109,10 +135,14 @@ async def main():
"reward": total_reward,
"ep_length": len(dqn_data.rewards),
"total_timesteps": total_timesteps,
"gradient_updates": gradient_updates,
}

wandb.log(log_dict)


# Add data to replay buffer

for t in range(len(dqn_data.done)):
state = dqn_data.observations[t]
dqn_action = dqn_data.actions[t]
@@ -125,16 +155,6 @@ async def main():

replay_buffer.push(state, action, reward, next_state, done)

if len(replay_buffer) > args.batch_size:
# Single DQN update
batch = replay_buffer.sample(args.batch_size)

loss = dqn_loss(network, batch, args.gamma)

optimizer.zero_grad()
loss.backward()
optimizer.step()


if __name__ == "__main__":
asyncio.run(main())
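
Both scripts also rely on the ReplayBuffer class from shared.py, which this commit does not touch. The following is an illustrative sketch consistent with how it is used above (ReplayBuffer(capacity, obs_len), push(state, action, reward, next_state, done), sample(batch_size), len(buffer)); the actual implementation in shared.py may differ.

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, capacity: int, obs_len: int):
        # Fixed-size ring buffer storing flat observations of length obs_len.
        self.capacity = capacity
        self.states = np.zeros((capacity, obs_len), dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.next_states = np.zeros((capacity, obs_len), dtype=np.float32)
        self.dones = np.zeros(capacity, dtype=np.float32)
        self.idx = 0
        self.size = 0

    def push(self, state, action, reward, next_state, done):
        # Overwrite the oldest transition once the buffer is full.
        self.states[self.idx] = state
        self.actions[self.idx] = action
        self.rewards[self.idx] = reward
        self.next_states[self.idx] = next_state
        self.dones[self.idx] = float(done)
        self.idx = (self.idx + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size: int):
        # Uniform sampling; returns tensors in the order the dqn_loss sketch expects.
        ids = np.random.randint(0, self.size, size=batch_size)
        return (
            torch.as_tensor(self.states[ids]),
            torch.as_tensor(self.actions[ids]),
            torch.as_tensor(self.rewards[ids]),
            torch.as_tensor(self.next_states[ids]),
            torch.as_tensor(self.dones[ids]),
        )

    def __len__(self) -> int:
        return self.size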
