V0.1.1 Add Discount reward
bruzat authored Dec 16, 2020
2 parents 5386ec0 + bda359d commit c2eb003
Showing 8 changed files with 1,699 additions and 4,890 deletions.
3 changes: 2 additions & 1 deletion TODO.md
@@ -82,7 +82,8 @@
- [ ] Prioritized Experience Replay (Schaul *et al.*, [2015](https://arxiv.org/abs/1511.05952))
- [ ] Hindsight Experience Replay (Andrychowicz *et al.*, [2017](https://arxiv.org/abs/1707.01495))

- [ ] Add advantage in all memories or create Advantage buffer
- [ ] Add temporal difference option in all memories
- [x] Add Discount reward in experience replay

# Environments list

55 changes: 41 additions & 14 deletions blobrl/memories/experience_replay.py
@@ -1,36 +1,41 @@
import numpy as np
import torch
from collections import deque

from blobrl.memories import MemoryInterface


class ExperienceReplay(MemoryInterface):

    def __init__(self, max_size=5000):
    def __init__(self, max_size=5000, gamma=0.0):
        """
        Create ExperienceReplay with buffer size equal to max_size
        :param max_size:
        :param max_size: max size of the buffer
        :type max_size: int
        :param gamma: gamma for discount reward; 0 disables discount reward
        :type gamma: float [0,1]
        """
        self.max_size = max_size
        self.buffer = np.empty(shape=(self.max_size, 5), dtype=np.object)
        self.index = 0
        self.size = 0
        self.buffer = deque(maxlen=max_size)
        if not 0 <= gamma <= 1:
            raise ValueError("gamma needs to be in range [0,1], not " + str(gamma))
        self.gamma = gamma

    def append(self, observation, action, reward, next_observation, done):
        """
        Store one transition
        :param observation:
        :param action:
        :param reward:
        :param next_observation:
        :param done:
        """
        self.buffer[self.index] = np.array([np.array(observation), action, reward, np.array(next_observation), done])
        self.index = (self.index + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
        self.buffer.append([observation, action, reward, next_observation, done])

    def extend(self, observations, actions, rewards, next_observations, dones):
        """
        Store many transitions
        :param observations:
        :param actions:
@@ -43,14 +48,36 @@ def extend(self, observations, actions, rewards, next_observations, dones):

    def sample(self, batch_size, device):
        """
        Returns *batch_size* samples
        :param device:
        :param device: torch device to run agent
        :type device: torch.device
        :param batch_size:
        :return:
        :type batch_size: int
        :return: list<Tensor>
        """
        idxs = np.random.randint(self.size, size=batch_size)
        idxs = np.random.randint(len(self.buffer), size=batch_size)

        return [torch.Tensor(list(V)).to(device=device) for V in self.buffer[idxs].T]
        batch = np.array([self.get_sample(idx) for idx in idxs])

        return [torch.Tensor(list(V)).to(device=device) for V in batch.T]

    def get_sample(self, idx):
        """
        Returns the sample at position idx. If self.gamma is not 0, apply the discounted reward.
        :param idx: index of the sample in the buffer
        :type idx: int
        :return: [observation, action, reward, next_observation, done]
        """
        sample = self.buffer[idx]
        if self.gamma == 0 or sample[4] is True:
            return sample

        if idx + 1 < len(self.buffer):
            sample[2] = sample[2] + self.gamma * self.get_sample(idx + 1)[2]

        return sample

    def __str__(self):
        return 'ExperienceReplay-' + str(self.max_size)
        return 'ExperienceReplay-' + str(self.buffer.maxlen) + '-' + str(self.gamma)
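
For context, a minimal usage sketch of the new `gamma` option. This is not part of the commit: the transition values are made up, and it assumes `ExperienceReplay` is importable from `blobrl.memories` (the same way `MemoryInterface` is above) with the NumPy/PyTorch versions of that era.

```python
# Minimal usage sketch for the discount-reward option added in this commit.
# Hypothetical values throughout; assumes blobrl exposes ExperienceReplay
# from blobrl.memories the same way it exposes MemoryInterface.
import torch
from blobrl.memories import ExperienceReplay

memory = ExperienceReplay(max_size=5000, gamma=0.99)  # gamma=0.0 would keep raw rewards

# Store a short hand-made trajectory: (observation, action, reward, next_observation, done)
memory.append([0.0, 0.0], 0, 1.0, [0.1, 0.0], False)
memory.append([0.1, 0.0], 1, 0.5, [0.2, 0.1], False)
memory.append([0.2, 0.1], 0, 2.0, [0.3, 0.1], True)

# With gamma > 0, sampled rewards are combined with the discounted reward of the
# following buffer entry; terminal samples (done=True) are returned unchanged.
observations, actions, rewards, next_observations, dones = memory.sample(
    batch_size=2, device=torch.device("cpu"))
print(rewards)
```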
8 changes: 7 additions & 1 deletion blobrl/memories/memory_interface.py
@@ -6,6 +6,7 @@ class MemoryInterface(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def append(self, observation, action, reward, next_observation, done) -> None:
        """
        Store one transition
        :param observation:
        :param action:
@@ -18,6 +19,7 @@ def append(self, observation, action, reward, next_observation, done) -> None:
    @abc.abstractmethod
    def extend(self, observations, actions, rewards, next_observations, dones) -> None:
        """
        Store many transitions
        :param observations:
        :param actions:
@@ -30,9 +32,13 @@ def sample(self, batch_size, device):
    @abc.abstractmethod
    def sample(self, batch_size, device):
        """
        Returns *batch_size* samples
        :param device:
        :param device: torch device to run agent
        :type device: torch.device
        :param batch_size:
        :type batch_size: int
        :return: list<Tensor>
        """
        pass

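For reference, a concrete memory only has to implement the three abstract methods shown above. A toy sketch, not from the repository, assuming `append`, `extend`, and `sample` are the only abstract members of `MemoryInterface`:

```python
# Toy MemoryInterface implementation (illustrative only, not part of this commit):
# keeps every transition and samples the most recent batch_size of them.
import numpy as np
import torch

from blobrl.memories import MemoryInterface


class LastTransitionsMemory(MemoryInterface):  # hypothetical class, not in blobrl
    def __init__(self):
        self.buffer = []

    def append(self, observation, action, reward, next_observation, done) -> None:
        self.buffer.append([observation, action, reward, next_observation, done])

    def extend(self, observations, actions, rewards, next_observations, dones) -> None:
        for transition in zip(observations, actions, rewards, next_observations, dones):
            self.append(*transition)

    def sample(self, batch_size, device):
        batch = np.array(self.buffer[-batch_size:], dtype=object)
        return [torch.Tensor(list(V)).to(device=device) for V in batch.T]
```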
84 changes: 23 additions & 61 deletions examples/example_train_jupyter.ipynb

Large diffs are not rendered by default.
