Merge pull request #81 from french-ai/develop
Version 0.1.0
bruzat authored Dec 7, 2020
2 parents a59fa63 + 3619ee6 commit 0ab3b0a
Showing 44 changed files with 6,475 additions and 5,287 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@
============

[![Read the Docs](https://img.shields.io/readthedocs/blobrl?style=for-the-badge)](https://blobrl.readthedocs.io/en/latest/?badge=latest)
[![Build Status](https://img.shields.io/travis/french-ai/reinforcement?branch=master.svg&style=for-the-badge)](https://travis-ci.org/french-ai/reinforcement)
[![Build Status](https://img.shields.io/travis/french-ai/reinforcement/master.svg?=master&style=for-the-badge)](https://travis-ci.org/french-ai/reinforcement)
[![CodeFactor](https://www.codefactor.io/repository/github/french-ai/reinforcement/badge?style=for-the-badge)](https://www.codefactor.io/repository/github/french-ai/reinforcement)
[![Codecov](https://img.shields.io/codecov/c/github/french-ai/reinforcement?style=for-the-badge)](https://codecov.io/gh/french-ai/reinforcement)
[![Discord](https://img.shields.io/badge/discord-chat-7289DA.svg?logo=Discord&style=for-the-badge)](https://discord.gg/f5MZP2K)
24 changes: 16 additions & 8 deletions TODO.md
@@ -20,6 +20,7 @@
- [x] Random Agent
- [x] Constant Agent


- [x] Deep Q Network (Mnih *et al.*, [2013](https://arxiv.org/abs/1312.5602))
- [ ] Deep Recurrent Q Network (Hausknecht *et al.*, [2015](https://arxiv.org/abs/1507.06527))
- [ ] Persistent Advantage Learning (Bellemare *et al.*, [2015](https://arxiv.org/abs/1512.04860))
@@ -30,34 +31,41 @@
- [x] Categorical Deep Q Network (Bellemare *et al.*, [2017](https://arxiv.org/abs/1707.06887))
- [ ] Quantile Regression DQN (Dabney et al, [2017](https://arxiv.org/abs/1710.10044))


- [ ] Rainbow (Hessel *et al.*, [2017](https://arxiv.org/abs/1710.02298))
- [ ] Quantile Regression Deep Q Network (Dabney *et al.*, [2017](https://arxiv.org/abs/1710.10044))


- [ ] Soft Actor-Critic (Haarnoja et al, [2018](https://arxiv.org/abs/1801.01290))


- [ ] Vanilla Policy Gradient ([2000](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf))


- [ ] Deep Deterministic Policy Gradient (Lillicrap et al, [2015](https://arxiv.org/abs/1509.02971))
- [ ] Twin Delayed DDPG (Fujimoto et al, [2018](https://arxiv.org/abs/1802.09477))


- [ ] Trust Region Policy Optimization (Schulman *et al.*, [2015](https://arxiv.org/abs/1502.05477))
- [ ] Proximal Policy Optimization (Schulman *et al.*, [2017](https://arxiv.org/abs/1707.06347))


- [ ] A2C (Mnih et al, [2016](https://arxiv.org/abs/1602.01783))
- [ ] A3C (Mnih et al, [2016](https://arxiv.org/abs/1602.01783))


- [ ] Hindsight Experience Replay (Andrychowicz et al, [2017](https://arxiv.org/abs/1707.01495))

# Network

- [ ] base network support discrete action space
- [ ] base network support continuous action space
- [ ] base network support discrete observation space
- [ ] base network support continuous observation space
- [ ] simple network support discrete/continuous action/observation space
- [ ] c51 network support discrete/continuous action/observation space
- [ ] base dueling network support discrete/continuous action/observation space
- [ ] simple dueling network support discrete/continuous action/observation space
- [x] base network support discrete action space
- [x] base network support continuous action space
- [x] base network support discrete observation space
- [x] base network support continuous observation space
- [x] simple network support discrete/continuous action/observation space
- [x] c51 network support discrete action/observation space
- [x] base dueling network support discrete/continuous action/observation space
- [x] simple dueling network support discrete/continuous action/observation space

# Explorations list

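The network checklist above tracks which observation and action space types each network handles. As a rough, standalone illustration (not part of this commit) of what "discrete" versus "continuous" means here in gym terms; the environment names are just common examples of each space type:

```python
import gym
from gym.spaces import Discrete, Box

# CartPole: continuous (Box) observations, discrete actions
env = gym.make("CartPole-v1")
print(isinstance(env.observation_space, Box))   # True -> continuous observation space
print(isinstance(env.action_space, Discrete))   # True -> discrete action space

# Pendulum: both observations and actions are continuous (Box)
env = gym.make("Pendulum-v0")
print(isinstance(env.action_space, Box))        # True -> continuous action space
```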
1 change: 0 additions & 1 deletion blobrl/agents/__init__.py
@@ -3,5 +3,4 @@
from .agent_random import AgentRandom
from .dqn import DQN
from .double_dqn import DoubleDQN
from .dueling_dqn import DuelingDQN
from .categorical_dqn import CategoricalDQN
9 changes: 1 addition & 8 deletions blobrl/agents/agent_constant.py
@@ -25,14 +25,7 @@ def __init__(self, observation_space, action_space, device=None):
:param action_space: Space for init action size
:type observation_space: gym.Space
"""
super().__init__(device)
if not isinstance(action_space, Space):
raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
self.action_space = action_space
self.observation_space = observation_space
super().__init__(observation_space, action_space, device)

self.action = self.action_space.sample()

17 changes: 16 additions & 1 deletion blobrl/agents/agent_interface.py
@@ -2,15 +2,30 @@

import torch

from gym.spaces import Space


class AgentInterface(metaclass=abc.ABCMeta):

def __init__(self, device):
def __init__(self, observation_space, action_space, device):
"""
:param device: torch device to run agent
:type: torch.device
:param observation_space: Space for init observation size
:type observation_space: gym.Space
:param device: torch device to run agent
:type: torch.device
"""

if not isinstance(action_space, Space):
raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
self.action_space = action_space
self.observation_space = observation_space

if device is None:
device = torch.device("cpu")
if not isinstance(device, torch.device):
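With this change, the gym.Space checks that AgentConstant and AgentRandom used to run themselves now live once in AgentInterface.__init__, and the subclasses simply forward their arguments. Below is a minimal, self-contained sketch of that pattern; it is not the library's actual class hierarchy (AgentInterface is abstract and defines more methods), just the validation logic this diff centralizes:

```python
import torch
from gym.spaces import Space, Discrete, Box


class BaseAgent:
    """Minimal stand-in for AgentInterface: validates the spaces once, in the base class."""

    def __init__(self, observation_space, action_space, device=None):
        if not isinstance(action_space, Space):
            raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
        if not isinstance(observation_space, Space):
            raise TypeError(
                "observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
        self.observation_space = observation_space
        self.action_space = action_space
        self.device = device if device is not None else torch.device("cpu")


class ConstantAgent(BaseAgent):
    """Stand-in for AgentConstant: only forwards its arguments to the base constructor."""

    def __init__(self, observation_space, action_space, device=None):
        super().__init__(observation_space, action_space, device)
        self.action = self.action_space.sample()  # fixed action chosen once at construction


agent = ConstantAgent(Box(low=-1.0, high=1.0, shape=(4,)), Discrete(2))
print(agent.action, agent.device)

# Passing something that is not a gym.Space now fails in one place, for every subclass:
try:
    ConstantAgent([0, 1, 2], Discrete(2))
except TypeError as e:
    print(e)
```

The benefit is that every future agent gets the same TypeError behaviour without repeating the checks in each subclass.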
10 changes: 1 addition & 9 deletions blobrl/agents/agent_random.py
@@ -2,7 +2,6 @@
import pickle

import torch
from gym.spaces import Space

from blobrl.agents import AgentInterface

@@ -25,14 +24,7 @@ def __init__(self, observation_space, action_space, device=None):
:param action_space: Space for init action size
:type observation_space: gym.Space
"""
super().__init__(device)
if not isinstance(action_space, Space):
raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
self.action_space = action_space
self.observation_space = observation_space
super().__init__(observation_space, action_space, device)

def get_action(self, observation):
""" Return action randomly choice in action_space
120 changes: 59 additions & 61 deletions blobrl/agents/categorical_dqn.py
@@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F
import torch.optim as optim
from gym.spaces import Discrete, Space, flatdim, flatten
from gym.spaces import flatten

from blobrl.agents import DQN
from blobrl.memories import ExperienceReplay
@@ -10,8 +10,8 @@

class CategoricalDQN(DQN):

def __init__(self, action_space, observation_space, memory=ExperienceReplay(), neural_network=None, num_atoms=51,
r_min=-10, r_max=10, step_train=2, batch_size=32, gamma=0.99,
def __init__(self, observation_space, action_space, memory=ExperienceReplay(), network=None, num_atoms=51,
r_min=-10, r_max=10, step_train=1, batch_size=32, gamma=1.0,
optimizer=None, greedy_exploration=None, device=None):
"""
@@ -20,7 +20,7 @@ def __init__(self, action_space, observation_space, memory=ExperienceReplay(), n
:param action_space:
:param observation_space:
:param memory:
:param neural_network:
:param network:
:param num_atoms:
:param r_min:
:param r_max:
@@ -30,25 +30,16 @@ def __init__(self, action_space, observation_space, memory=ExperienceReplay(), n
:param optimizer:
:param greedy_exploration:
"""
loss = None

if not isinstance(action_space, Discrete):
raise TypeError(
"action_space need to be instance of gym.spaces.Space.Discrete, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space.Discrete, not :" + str(
type(observation_space)))

if neural_network is None and optimizer is None:
neural_network = C51Network(observation_shape=flatdim(observation_space),
action_shape=flatdim(action_space))
if network is None and optimizer is None:
network = C51Network(observation_space=observation_space,
action_space=action_space)
num_atoms = 51

optimizer = optim.Adam(neural_network.parameters())
optimizer = optim.Adam(network.parameters())

super().__init__(action_space, observation_space, memory, neural_network, step_train, batch_size, gamma, loss,
optimizer, greedy_exploration, device=device)
super().__init__(observation_space=observation_space, action_space=action_space, memory=memory,
network=network, step_train=step_train, batch_size=batch_size, gamma=gamma,
loss=None, optimizer=optimizer, greedy_exploration=greedy_exploration, device=device)

self.num_atoms = num_atoms
self.r_min = r_min
@@ -63,70 +54,77 @@ def get_action(self, observation):
:param observation: stat of environment
:type observation: gym.Space
"""
observation = torch.tensor([flatten(self.observation_space, observation)], device=self.device)
if not self.greedy_exploration.be_greedy(self.step) and self.with_exploration:
return self.action_space.sample()

prediction = self.neural_network.forward(observation).detach()[0]
q_values = prediction * self.z
q_values = torch.sum(q_values, dim=1)
observation = torch.tensor([flatten(self.observation_space, observation)], device=self.device).float()

return torch.argmax(q_values).detach().item()
prediction = self.network.forward(observation)

def train(self):
"""
def return_values(values):
if isinstance(values, list):
return [return_values(v) for v in values]

"""
self.batch_size = 3
q_values = values * self.z
q_values = torch.sum(q_values, dim=2)
return torch.argmax(q_values).detach().item()

return return_values(prediction)

def apply_loss(self, next_prediction, prediction, actions, rewards, next_observations, dones, len_space):
if isinstance(next_prediction, list):
[self.apply_loss(n, p, a, rewards, next_observations, dones, c) for n, p, a, c in
zip(next_prediction, prediction, actions.permute(1, 0, *[i for i in range(2, len(actions.shape))]),
len_space)]
else:

observations, actions, rewards, next_observations, dones = self.memory.sample(self.batch_size,
device=self.device)
q_values_next = next_prediction * self.z
q_values_next = torch.sum(q_values_next, dim=2)

actions = actions.to(torch.long)
actions = F.one_hot(actions, num_classes=self.action_space.n)
actions = F.one_hot(actions.long(), num_classes=len_space)

predictions_next = self.neural_network.forward(next_observations).detach()
q_values_next = predictions_next * self.z
q_values_next = torch.sum(q_values_next, dim=2)
actions_next = torch.argmax(q_values_next, dim=1)
actions_next = F.one_hot(actions_next, num_classes=len_space)

actions_next = torch.argmax(q_values_next, dim=1)
actions_next = actions_next.to(torch.long)
actions_next = F.one_hot(actions_next, num_classes=self.action_space.n)
dones = dones.view(-1, 1)

dones = dones.view(-1, 1)
tz = rewards.view(-1, 1) + self.gamma * self.z * (1 - dones)
tz = tz.clamp(self.r_min, self.r_max)
b = (tz - self.r_min) / self.delta_z

tz = torch.clamp(rewards.view(-1, 1) + self.gamma * self.z * (1 - dones), self.r_min, self.r_max)
b = (tz - self.r_min) / self.delta_z
l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)

l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
l[(u > 0) * (l == u)] -= 1
u[(l < (self.num_atoms - 1)) * (l == u)] += 1

m_prob = torch.zeros((self.batch_size, self.action_space.n, self.num_atoms), device=self.device)
m_prob = torch.zeros((self.batch_size, len_space, self.num_atoms), device=self.device)

predictions_next = predictions_next[actions_next == 1, :]
predictions_next = next_prediction[actions_next == 1, :]

offset = torch.linspace(0, (self.batch_size - 1) * self.num_atoms, self.batch_size, device=self.device).view(-1,
1)
offset = offset.expand(self.batch_size, self.num_atoms)
offset = torch.linspace(0, (self.batch_size - 1) * self.num_atoms, self.batch_size,
device=self.device).view(-1,
1)
offset = offset.expand(self.batch_size, self.num_atoms)

u_index = (u + offset).view(-1).to(torch.int64)
l_index = (l + offset).view(-1).to(torch.int64)
u_index = (u + offset).view(-1).to(torch.int64)
l_index = (l + offset).view(-1).to(torch.int64)

predictions_next = (dones + (1 - dones) * predictions_next)
predictions_next = (dones + (1 - dones) * predictions_next)

m_prob_action = m_prob[actions == 1, :].view(-1)
m_prob_action.index_add_(0, u_index, (predictions_next * (u - b)).view(-1))
m_prob_action.index_add_(0, l_index, (predictions_next * (b - l)).view(-1))
m_prob_action = m_prob[actions == 1, :].view(-1)
m_prob_action.index_add_(0, u_index, (predictions_next * (u - b)).view(-1))
m_prob_action.index_add_(0, l_index, (predictions_next * (b - l)).view(-1))

m_prob[actions == 1, :] = m_prob_action.view(-1, self.num_atoms)
m_prob[actions == 1, :] = m_prob_action.view(-1, self.num_atoms)

self.optimizer.zero_grad()
predictions = self.neural_network.forward(observations)
loss = - predictions.log() * m_prob
loss.sum((1, 2)).mean().backward()
self.optimizer.zero_grad()

self.optimizer.step()
loss = - prediction.log() * m_prob
loss.sum((1, 2)).mean().backward(retain_graph=True)

def __str__(self):
return 'CategoricalDQN-' + str(self.observation_space) + "-" + str(self.action_space) + "-" + str(
self.neural_network) + "-" + str(self.memory) + "-" + str(self.step_train) + "-" + str(
self.network) + "-" + str(self.memory) + "-" + str(self.step_train) + "-" + str(
self.step) + "-" + str(self.batch_size) + "-" + str(self.gamma) + "-" + str(self.loss) + "-" + str(
self.optimizer) + "-" + str(self.greedy_exploration) + "-" + str(self.num_atoms) + "-" + str(
self.r_min) + "-" + str(self.r_max) + "-" + str(self.delta_z) + "-" + str(self.z)
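The rewritten apply_loss keeps the distributional (C51) update: the Bellman-shifted support r + γ·z·(1 - done) is clamped to [r_min, r_max] and each atom's probability mass is split between the two nearest atoms of the fixed support z. Below is a self-contained sketch of that projection step for a single transition, following the description in Bellemare et al. (2017) rather than the library's exact tensor shapes; the variable names are mine, not blobrl's:

```python
import torch

num_atoms, r_min, r_max, gamma = 51, -10.0, 10.0, 0.99
delta_z = (r_max - r_min) / (num_atoms - 1)
z = torch.linspace(r_min, r_max, num_atoms)           # fixed support

# next-state distribution for the greedy action (uniform here, as a stand-in)
p_next = torch.full((num_atoms,), 1.0 / num_atoms)
reward, done = 1.0, 0.0

# project the Bellman-updated support back onto z
tz = (reward + gamma * z * (1 - done)).clamp(r_min, r_max)
b = (tz - r_min) / delta_z                             # fractional atom index
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                             # keep l < u when b lands exactly on an atom
u[(l < num_atoms - 1) & (l == u)] += 1

m = torch.zeros(num_atoms)
m.index_add_(0, l, p_next * (u.float() - b))           # mass to the lower neighbour
m.index_add_(0, u, p_next * (b - l.float()))           # mass to the upper neighbour

print(m.sum())  # ~1.0: the projected target is still a valid probability vector
```

Because the mass given to the lower and upper neighbours always sums to the original atom probability, the projected vector m stays a valid distribution, which is what the cross-entropy term `- prediction.log() * m_prob` in apply_loss is computed against.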