Merge pull request #81 from french-ai/develop
Version 0.1.0
bruzat authored Dec 7, 2020
2 parents a59fa63 + 3619ee6 commit 0ab3b0a
Showing 44 changed files with 6,475 additions and 5,287 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@
============

[![Read the Docs](https://img.shields.io/readthedocs/blobrl?style=for-the-badge)](https://blobrl.readthedocs.io/en/latest/?badge=latest)
[![Build Status](https://img.shields.io/travis/french-ai/reinforcement?branch=master.svg&style=for-the-badge)](https://travis-ci.org/french-ai/reinforcement)
[![Build Status](https://img.shields.io/travis/french-ai/reinforcement/master.svg?=master&style=for-the-badge)](https://travis-ci.org/french-ai/reinforcement)
[![CodeFactor](https://www.codefactor.io/repository/github/french-ai/reinforcement/badge?style=for-the-badge)](https://www.codefactor.io/repository/github/french-ai/reinforcement)
[![Codecov](https://img.shields.io/codecov/c/github/french-ai/reinforcement?style=for-the-badge)](https://codecov.io/gh/french-ai/reinforcement)
[![Discord](https://img.shields.io/badge/discord-chat-7289DA.svg?logo=Discord&style=for-the-badge)](https://discord.gg/f5MZP2K)
24 changes: 16 additions & 8 deletions TODO.md
@@ -20,6 +20,7 @@
- [x] Random Agent
- [x] Constant Agent


- [x] Deep Q Network (Mnih *et al.*, [2013](https://arxiv.org/abs/1312.5602))
- [ ] Deep Recurrent Q Network (Hausknecht *et al.*, [2015](https://arxiv.org/abs/1507.06527))
- [ ] Persistent Advantage Learning (Bellemare *et al.*, [2015](https://arxiv.org/abs/1512.04860))
@@ -30,34 +31,41 @@
- [x] Categorical Deep Q Network (Bellemare *et al.*, [2017](https://arxiv.org/abs/1707.06887))
- [ ] Quantile Regression DQN (Dabney et al, [2017](https://arxiv.org/abs/1710.10044))


- [ ] Rainbow (Hessel *et al.*, [2017](https://arxiv.org/abs/1710.02298))
- [ ] Quantile Regression Deep Q Network (Dabney *et al.*, [2017](https://arxiv.org/abs/1710.10044))


- [ ] Soft Actor-Critic (Haarnoja et al, [2018](https://arxiv.org/abs/1801.01290))


- [ ] Vanilla Policy Gradient ([2000](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf))


- [ ] Deep Deterministic Policy Gradient (Lillicrap et al, [2015](https://arxiv.org/abs/1509.02971))
- [ ] Twin Delayed DDPG (Fujimoto et al, [2018](https://arxiv.org/abs/1802.09477))


- [ ] Trust Region Policy Optimization (Schulman *et al.*, [2015](https://arxiv.org/abs/1502.05477))
- [ ] Proximal Policy Optimization (Schulman *et al.*, [2017](https://arxiv.org/abs/1707.06347))


- [ ] A2C (Mnih et al, [2016](https://arxiv.org/abs/1602.01783))
- [ ] A3C (Mnih et al, [2016](https://arxiv.org/abs/1602.01783))


- [ ] Hindsight Experience Replay (Andrychowicz et al, [2017](https://arxiv.org/abs/1707.01495))

# Network

- [ ] base network support discrete action space
- [ ] base network support continuous action space
- [ ] base network support discrete observation space
- [ ] base network support continuous observation space
- [ ] simple network support discrete/continuous action/observation space
- [ ] c51 network support discrete/continuous action/observation space
- [ ] base dueling network support discrete/continuous action/observation space
- [ ] simple dueling network support discrete/continuous action/observation space
- [x] base network support discrete action space
- [x] base network support continuous action space
- [x] base network support discrete observation space
- [x] base network support continuous observation space
- [x] simple network support discrete/continuous action/observation space
- [x] c51 network support discrete action/observation space
- [x] base dueling network support discrete/continuous action/observation space
- [x] simple dueling network support discrete/continuous action/observation space

# Explorations list

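The network checklist above tracks which observation and action space types each network handles. As a rough, standalone illustration (not part of this commit) of what "discrete" versus "continuous" means here in gym terms; the environment names are just common examples of each space type:

```python
import gym
from gym.spaces import Discrete, Box

# CartPole: continuous (Box) observations, discrete actions
env = gym.make("CartPole-v1")
print(isinstance(env.observation_space, Box))   # True -> continuous observation space
print(isinstance(env.action_space, Discrete))   # True -> discrete action space

# Pendulum: both observations and actions are continuous (Box)
env = gym.make("Pendulum-v0")
print(isinstance(env.action_space, Box))        # True -> continuous action space
```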
1 change: 0 additions & 1 deletion blobrl/agents/__init__.py
@@ -3,5 +3,4 @@
from .agent_random import AgentRandom
from .dqn import DQN
from .double_dqn import DoubleDQN
from .dueling_dqn import DuelingDQN
from .categorical_dqn import CategoricalDQN
9 changes: 1 addition & 8 deletions blobrl/agents/agent_constant.py
@@ -25,14 +25,7 @@ def __init__(self, observation_space, action_space, device=None):
:param action_space: Space for init action size
:type observation_space: gym.Space
"""
super().__init__(device)
if not isinstance(action_space, Space):
raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
self.action_space = action_space
self.observation_space = observation_space
super().__init__(observation_space, action_space, device)

self.action = self.action_space.sample()

17 changes: 16 additions & 1 deletion blobrl/agents/agent_interface.py
@@ -2,15 +2,30 @@

import torch

from gym.spaces import Space


class AgentInterface(metaclass=abc.ABCMeta):

def __init__(self, device):
def __init__(self, observation_space, action_space, device):
"""
:param device: torch device to run agent
:type: torch.device
:param observation_space: Space for init observation size
:type observation_space: gym.Space
:param device: torch device to run agent
:type: torch.device
"""

if not isinstance(action_space, Space):
raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
self.action_space = action_space
self.observation_space = observation_space

if device is None:
device = torch.device("cpu")
if not isinstance(device, torch.device):
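With this change, the gym.Space checks that AgentConstant and AgentRandom used to run themselves now live once in AgentInterface.__init__, and the subclasses simply forward their arguments. Below is a minimal, self-contained sketch of that pattern; it is not the library's actual class hierarchy (AgentInterface is abstract and defines more methods), just the validation logic this diff centralizes:

```python
import torch
from gym.spaces import Space, Discrete, Box


class BaseAgent:
    """Minimal stand-in for AgentInterface: validates the spaces once, in the base class."""

    def __init__(self, observation_space, action_space, device=None):
        if not isinstance(action_space, Space):
            raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
        if not isinstance(observation_space, Space):
            raise TypeError(
                "observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
        self.observation_space = observation_space
        self.action_space = action_space
        self.device = device if device is not None else torch.device("cpu")


class ConstantAgent(BaseAgent):
    """Stand-in for AgentConstant: only forwards its arguments to the base constructor."""

    def __init__(self, observation_space, action_space, device=None):
        super().__init__(observation_space, action_space, device)
        self.action = self.action_space.sample()  # fixed action chosen once at construction


agent = ConstantAgent(Box(low=-1.0, high=1.0, shape=(4,)), Discrete(2))
print(agent.action, agent.device)

# Passing something that is not a gym.Space now fails in one place, for every subclass:
try:
    ConstantAgent([0, 1, 2], Discrete(2))
except TypeError as e:
    print(e)
```

The benefit is that every future agent gets the same TypeError behaviour without repeating the checks in each subclass.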
10 changes: 1 addition & 9 deletions blobrl/agents/agent_random.py
@@ -2,7 +2,6 @@
import pickle

import torch
from gym.spaces import Space

from blobrl.agents import AgentInterface

@@ -25,14 +24,7 @@ def __init__(self, observation_space, action_space, device=None):
:param action_space: Space for init action size
:type observation_space: gym.Space
"""
super().__init__(device)
if not isinstance(action_space, Space):
raise TypeError("action_space need to be instance of gym.spaces.Space, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space, not :" + str(type(observation_space)))
self.action_space = action_space
self.observation_space = observation_space
super().__init__(observation_space, action_space, device)

def get_action(self, observation):
""" Return action randomly choice in action_space
120 changes: 59 additions & 61 deletions blobrl/agents/categorical_dqn.py
@@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F
import torch.optim as optim
from gym.spaces import Discrete, Space, flatdim, flatten
from gym.spaces import flatten

from blobrl.agents import DQN
from blobrl.memories import ExperienceReplay
@@ -10,8 +10,8 @@

class CategoricalDQN(DQN):

def __init__(self, action_space, observation_space, memory=ExperienceReplay(), neural_network=None, num_atoms=51,
r_min=-10, r_max=10, step_train=2, batch_size=32, gamma=0.99,
def __init__(self, observation_space, action_space, memory=ExperienceReplay(), network=None, num_atoms=51,
r_min=-10, r_max=10, step_train=1, batch_size=32, gamma=1.0,
optimizer=None, greedy_exploration=None, device=None):
"""
@@ -20,7 +20,7 @@ def __init__(self, action_space, observation_space, memory=ExperienceReplay(), n
:param action_space:
:param observation_space:
:param memory:
:param neural_network:
:param network:
:param num_atoms:
:param r_min:
:param r_max:
@@ -30,25 +30,16 @@ def __init__(self, action_space, observation_space, memory=ExperienceReplay(), n
:param optimizer:
:param greedy_exploration:
"""
loss = None

if not isinstance(action_space, Discrete):
raise TypeError(
"action_space need to be instance of gym.spaces.Space.Discrete, not :" + str(type(action_space)))
if not isinstance(observation_space, Space):
raise TypeError(
"observation_space need to be instance of gym.spaces.Space.Discrete, not :" + str(
type(observation_space)))

if neural_network is None and optimizer is None:
neural_network = C51Network(observation_shape=flatdim(observation_space),
action_shape=flatdim(action_space))
if network is None and optimizer is None:
network = C51Network(observation_space=observation_space,
action_space=action_space)
num_atoms = 51

optimizer = optim.Adam(neural_network.parameters())
optimizer = optim.Adam(network.parameters())

super().__init__(action_space, observation_space, memory, neural_network, step_train, batch_size, gamma, loss,
optimizer, greedy_exploration, device=device)
super().__init__(observation_space=observation_space, action_space=action_space, memory=memory,
network=network, step_train=step_train, batch_size=batch_size, gamma=gamma,
loss=None, optimizer=optimizer, greedy_exploration=greedy_exploration, device=device)

self.num_atoms = num_atoms
self.r_min = r_min
@@ -63,70 +54,77 @@ def get_action(self, observation):
:param observation: stat of environment
:type observation: gym.Space
"""
observation = torch.tensor([flatten(self.observation_space, observation)], device=self.device)
if not self.greedy_exploration.be_greedy(self.step) and self.with_exploration:
return self.action_space.sample()

prediction = self.neural_network.forward(observation).detach()[0]
q_values = prediction * self.z
q_values = torch.sum(q_values, dim=1)
observation = torch.tensor([flatten(self.observation_space, observation)], device=self.device).float()

return torch.argmax(q_values).detach().item()
prediction = self.network.forward(observation)

def train(self):
"""
def return_values(values):
if isinstance(values, list):
return [return_values(v) for v in values]

"""
self.batch_size = 3
q_values = values * self.z
q_values = torch.sum(q_values, dim=2)
return torch.argmax(q_values).detach().item()

return return_values(prediction)

def apply_loss(self, next_prediction, prediction, actions, rewards, next_observations, dones, len_space):
if isinstance(next_prediction, list):
[self.apply_loss(n, p, a, rewards, next_observations, dones, c) for n, p, a, c in
zip(next_prediction, prediction, actions.permute(1, 0, *[i for i in range(2, len(actions.shape))]),
len_space)]
else:

observations, actions, rewards, next_observations, dones = self.memory.sample(self.batch_size,
device=self.device)
q_values_next = next_prediction * self.z
q_values_next = torch.sum(q_values_next, dim=2)

actions = actions.to(torch.long)
actions = F.one_hot(actions, num_classes=self.action_space.n)
actions = F.one_hot(actions.long(), num_classes=len_space)

predictions_next = self.neural_network.forward(next_observations).detach()
q_values_next = predictions_next * self.z
q_values_next = torch.sum(q_values_next, dim=2)
actions_next = torch.argmax(q_values_next, dim=1)
actions_next = F.one_hot(actions_next, num_classes=len_space)

actions_next = torch.argmax(q_values_next, dim=1)
actions_next = actions_next.to(torch.long)
actions_next = F.one_hot(actions_next, num_classes=self.action_space.n)
dones = dones.view(-1, 1)

dones = dones.view(-1, 1)
tz = rewards.view(-1, 1) + self.gamma * self.z * (1 - dones)
tz = tz.clamp(self.r_min, self.r_max)
b = (tz - self.r_min) / self.delta_z

tz = torch.clamp(rewards.view(-1, 1) + self.gamma * self.z * (1 - dones), self.r_min, self.r_max)
b = (tz - self.r_min) / self.delta_z
l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)

l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
l[(u > 0) * (l == u)] -= 1
u[(l < (self.num_atoms - 1)) * (l == u)] += 1

m_prob = torch.zeros((self.batch_size, self.action_space.n, self.num_atoms), device=self.device)
m_prob = torch.zeros((self.batch_size, len_space, self.num_atoms), device=self.device)

predictions_next = predictions_next[actions_next == 1, :]
predictions_next = next_prediction[actions_next == 1, :]

offset = torch.linspace(0, (self.batch_size - 1) * self.num_atoms, self.batch_size, device=self.device).view(-1,
1)
offset = offset.expand(self.batch_size, self.num_atoms)
offset = torch.linspace(0, (self.batch_size - 1) * self.num_atoms, self.batch_size,
device=self.device).view(-1,
1)
offset = offset.expand(self.batch_size, self.num_atoms)

u_index = (u + offset).view(-1).to(torch.int64)
l_index = (l + offset).view(-1).to(torch.int64)
u_index = (u + offset).view(-1).to(torch.int64)
l_index = (l + offset).view(-1).to(torch.int64)

predictions_next = (dones + (1 - dones) * predictions_next)
predictions_next = (dones + (1 - dones) * predictions_next)

m_prob_action = m_prob[actions == 1, :].view(-1)
m_prob_action.index_add_(0, u_index, (predictions_next * (u - b)).view(-1))
m_prob_action.index_add_(0, l_index, (predictions_next * (b - l)).view(-1))
m_prob_action = m_prob[actions == 1, :].view(-1)
m_prob_action.index_add_(0, u_index, (predictions_next * (u - b)).view(-1))
m_prob_action.index_add_(0, l_index, (predictions_next * (b - l)).view(-1))

m_prob[actions == 1, :] = m_prob_action.view(-1, self.num_atoms)
m_prob[actions == 1, :] = m_prob_action.view(-1, self.num_atoms)

self.optimizer.zero_grad()
predictions = self.neural_network.forward(observations)
loss = - predictions.log() * m_prob
loss.sum((1, 2)).mean().backward()
self.optimizer.zero_grad()

self.optimizer.step()
loss = - prediction.log() * m_prob
loss.sum((1, 2)).mean().backward(retain_graph=True)

def __str__(self):
return 'CategoricalDQN-' + str(self.observation_space) + "-" + str(self.action_space) + "-" + str(
self.neural_network) + "-" + str(self.memory) + "-" + str(self.step_train) + "-" + str(
self.network) + "-" + str(self.memory) + "-" + str(self.step_train) + "-" + str(
self.step) + "-" + str(self.batch_size) + "-" + str(self.gamma) + "-" + str(self.loss) + "-" + str(
self.optimizer) + "-" + str(self.greedy_exploration) + "-" + str(self.num_atoms) + "-" + str(
self.r_min) + "-" + str(self.r_max) + "-" + str(self.delta_z) + "-" + str(self.z)
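The rewritten apply_loss keeps the distributional (C51) update: the Bellman-shifted support r + γ·z·(1 - done) is clamped to [r_min, r_max] and each atom's probability mass is split between the two nearest atoms of the fixed support z. Below is a self-contained sketch of that projection step for a single transition, following the description in Bellemare et al. (2017) rather than the library's exact tensor shapes; the variable names are mine, not blobrl's:

```python
import torch

num_atoms, r_min, r_max, gamma = 51, -10.0, 10.0, 0.99
delta_z = (r_max - r_min) / (num_atoms - 1)
z = torch.linspace(r_min, r_max, num_atoms)           # fixed support

# next-state distribution for the greedy action (uniform here, as a stand-in)
p_next = torch.full((num_atoms,), 1.0 / num_atoms)
reward, done = 1.0, 0.0

# project the Bellman-updated support back onto z
tz = (reward + gamma * z * (1 - done)).clamp(r_min, r_max)
b = (tz - r_min) / delta_z                             # fractional atom index
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                             # keep l < u when b lands exactly on an atom
u[(l < num_atoms - 1) & (l == u)] += 1

m = torch.zeros(num_atoms)
m.index_add_(0, l, p_next * (u.float() - b))           # mass to the lower neighbour
m.index_add_(0, u, p_next * (b - l.float()))           # mass to the upper neighbour

print(m.sum())  # ~1.0: the projected target is still a valid probability vector
```

Because the mass given to the lower and upper neighbours always sums to the original atom probability, the projected vector m stays a valid distribution, which is what the cross-entropy term `- prediction.log() * m_prob` in apply_loss is computed against.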