V0.1.1 Add Discount reward
bruzat authored Dec 16, 2020
2 parents 5386ec0 + bda359d commit c2eb003
Showing 8 changed files with 1,699 additions and 4,890 deletions.
3 changes: 2 additions & 1 deletion TODO.md
@@ -82,7 +82,8 @@
- [ ] Prioritized Experience Replay (Schaul *et al.*, [2015](https://arxiv.org/abs/1511.05952))
- [ ] Hindsight Experience Replay (Andrychowicz *et al.*, [2017](https://arxiv.org/abs/1707.01495))

- [ ] Add advantage in all memories or create Advantage buffer
- [ ] Add temporal difference option in all memories
- [x] Add Discount reward in experience replay

# Environments list

55 changes: 41 additions & 14 deletions blobrl/memories/experience_replay.py
@@ -1,36 +1,41 @@
import numpy as np
import torch
from collections import deque

from blobrl.memories import MemoryInterface


class ExperienceReplay(MemoryInterface):

    def __init__(self, max_size=5000):
    def __init__(self, max_size=5000, gamma=0.0):
        """
        Create ExperienceReplay with buffer size equal to max_size
        :param max_size:
        :param max_size: max size of the buffer
        :type max_size: int
        :param gamma: gamma for discount reward; 0 disables discount reward
        :type gamma: float [0,1]
        """
        self.max_size = max_size
        self.buffer = np.empty(shape=(self.max_size, 5), dtype=np.object)
        self.index = 0
        self.size = 0
        self.buffer = deque(maxlen=max_size)
        if not 0 <= gamma <= 1:
            raise ValueError("gamma needs to be in range [0,1], not " + str(gamma))
        self.gamma = gamma

    def append(self, observation, action, reward, next_observation, done):
        """
        Store one transition
        :param observation:
        :param action:
        :param reward:
        :param next_observation:
        :param done:
        """
        self.buffer[self.index] = np.array([np.array(observation), action, reward, np.array(next_observation), done])
        self.index = (self.index + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
        self.buffer.append([observation, action, reward, next_observation, done])

    def extend(self, observations, actions, rewards, next_observations, dones):
        """
        Store many transitions
        :param observations:
        :param actions:
@@ -43,14 +48,36 @@ def extend(self, observations, actions, rewards, next_observations, dones):

    def sample(self, batch_size, device):
        """
        Returns *batch_size* samples
        :param device:
        :param device: torch device to run agent
        :type device: torch.device
        :param batch_size:
        :return:
        :type batch_size: int
        :return: list<Tensor>
        """
        idxs = np.random.randint(self.size, size=batch_size)
        idxs = np.random.randint(len(self.buffer), size=batch_size)

        return [torch.Tensor(list(V)).to(device=device) for V in self.buffer[idxs].T]
        batch = np.array([self.get_sample(idx) for idx in idxs])

        return [torch.Tensor(list(V)).to(device=device) for V in batch.T]

    def get_sample(self, idx):
        """
        Returns the sample at position idx. If self.gamma is not 0, apply the discounted reward.
        :param idx: index of the sample in the buffer
        :type idx: int
        :return: [observation, action, reward, next_observation, done]
        """
        sample = self.buffer[idx]
        if self.gamma == 0 or sample[4] is True:
            return sample

        if idx + 1 < len(self.buffer):
            sample[2] = sample[2] + self.gamma * self.get_sample(idx + 1)[2]

        return sample

    def __str__(self):
        return 'ExperienceReplay-' + str(self.max_size)
        return 'ExperienceReplay-' + str(self.buffer.maxlen) + '-' + str(self.gamma)
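
For context, a minimal usage sketch of the new `gamma` option. This is not part of the commit: the transition values are made up, and it assumes `ExperienceReplay` is importable from `blobrl.memories` (the same way `MemoryInterface` is above) with the NumPy/PyTorch versions of that era.

```python
# Minimal usage sketch for the discount-reward option added in this commit.
# Hypothetical values throughout; assumes blobrl exposes ExperienceReplay
# from blobrl.memories the same way it exposes MemoryInterface.
import torch
from blobrl.memories import ExperienceReplay

memory = ExperienceReplay(max_size=5000, gamma=0.99)  # gamma=0.0 would keep raw rewards

# Store a short hand-made trajectory: (observation, action, reward, next_observation, done)
memory.append([0.0, 0.0], 0, 1.0, [0.1, 0.0], False)
memory.append([0.1, 0.0], 1, 0.5, [0.2, 0.1], False)
memory.append([0.2, 0.1], 0, 2.0, [0.3, 0.1], True)

# With gamma > 0, sampled rewards are combined with the discounted reward of the
# following buffer entry; terminal samples (done=True) are returned unchanged.
observations, actions, rewards, next_observations, dones = memory.sample(
    batch_size=2, device=torch.device("cpu"))
print(rewards)
```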
8 changes: 7 additions & 1 deletion blobrl/memories/memory_interface.py
@@ -6,6 +6,7 @@ class MemoryInterface(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def append(self, observation, action, reward, next_observation, done) -> None:
        """
        Store one transition
        :param observation:
        :param action:
@@ -18,6 +19,7 @@ def append(self, observation, action, reward, next_observation, done) -> None:
    @abc.abstractmethod
    def extend(self, observations, actions, rewards, next_observations, dones) -> None:
        """
        Store many transitions
        :param observations:
        :param actions:
@@ -30,9 +32,13 @@ def sample(self, batch_size, device):
    @abc.abstractmethod
    def sample(self, batch_size, device):
        """
        Returns *batch_size* samples
        :param device:
        :param device: torch device to run agent
        :type device: torch.device
        :param batch_size:
        :type batch_size: int
        :return: list<Tensor>
        """
        pass

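For reference, a concrete memory only has to implement the three abstract methods shown above. A toy sketch, not from the repository, assuming `append`, `extend`, and `sample` are the only abstract members of `MemoryInterface`:

```python
# Toy MemoryInterface implementation (illustrative only, not part of this commit):
# keeps every transition and samples the most recent batch_size of them.
import numpy as np
import torch

from blobrl.memories import MemoryInterface


class LastTransitionsMemory(MemoryInterface):  # hypothetical class, not in blobrl
    def __init__(self):
        self.buffer = []

    def append(self, observation, action, reward, next_observation, done) -> None:
        self.buffer.append([observation, action, reward, next_observation, done])

    def extend(self, observations, actions, rewards, next_observations, dones) -> None:
        for transition in zip(observations, actions, rewards, next_observations, dones):
            self.append(*transition)

    def sample(self, batch_size, device):
        batch = np.array(self.buffer[-batch_size:], dtype=object)
        return [torch.Tensor(list(V)).to(device=device) for V in batch.T]
```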
84 changes: 23 additions & 61 deletions examples/example_train_jupyter.ipynb

Large diffs are not rendered by default.
