**`README.md`**

# homework_fall2022
Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/).
**`hw1/README.md`**
## Setup

You can run this code on your own machine or on Google Colab.

1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](installation.md) for instructions.
2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2021/blob/master/hw1/cs285/scripts/run_hw1.ipynb)
## Complete the code

Fill in sections marked with `TODO`. In particular, see
- [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
- [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
- [infrastructure/replay_buffer.py](cs285/infrastructure/replay_buffer.py)
- [infrastructure/utils.py](cs285/infrastructure/utils.py)
- [infrastructure/pytorch_util.py](cs285/infrastructure/pytorch_util.py)

Look for sections marked with `HW1` to see how the edits you make will be used.
Some other files that you may find relevant:
- [scripts/run_hw1.py](cs285/scripts/run_hw1.py) (if running locally) or [scripts/run_hw1.ipynb](cs285/scripts/run_hw1.ipynb) (if running on Colab)
- [agents/bc_agent.py](cs285/agents/bc_agent.py)

See the homework PDF for more details.
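If it helps, you can list every `TODO` and `HW1` marker from the assignment root with `grep` (a convenience, not part of the assignment):

```
grep -rn "TODO" cs285/
grep -rn "HW1" cs285/
```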
## Run the code

Tip: while debugging, you probably want to keep the flag `--video_log_freq -1`, which disables video logging and speeds up the experiment. However, feel free to remove it to save videos of your awesome policy!

If running on Colab, adjust the `#@params` in the `Args` class according to the command-line arguments below.
### Section 1 (Behavior Cloning)
Command for section 1:

```
python cs285/scripts/run_hw1.py \
    --expert_policy_file cs285/policies/experts/Ant.pkl \
    --env_name Ant-v2 --exp_name bc_ant --n_iter 1 \
    --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \
    --video_log_freq -1
```

Make sure to also try another environment.
See the homework PDF for more details on what else you need to run.
To generate videos of the policy, remove the `--video_log_freq -1` flag.
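For example, assuming the provided expert data also includes a HalfCheetah expert (the exact set of `.pkl` files shipped with the assignment may differ), a second run might look like:

```
python cs285/scripts/run_hw1.py \
    --expert_policy_file cs285/policies/experts/HalfCheetah.pkl \
    --env_name HalfCheetah-v2 --exp_name bc_halfcheetah --n_iter 1 \
    --expert_data cs285/expert_data/expert_data_HalfCheetah-v2.pkl \
    --video_log_freq -1
```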
### Section 2 (DAgger)
Command for section 2:
(Note the `--do_dagger` flag, and the higher value for `n_iter`.)

```
python cs285/scripts/run_hw1.py \
    --expert_policy_file cs285/policies/experts/Ant.pkl \
    --env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \
    --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \
    --video_log_freq -1
```

Make sure to also try another environment.
See the homework PDF for more details on what else you need to run.
## Visualizing the saved tensorboard event file

You can visualize your runs using tensorboard:
```
tensorboard --logdir data
```

You will see scalar summaries as well as videos of your trained policies (in the 'images' tab).

You can choose to visualize specific runs with a comma-separated list:
```
tensorboard --logdir data/run1,data/run2,data/run3...
```

If running on Colab, you will be using the `%tensorboard` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) to do the same thing; see the [notebook](cs285/scripts/run_hw1.ipynb) for more details.
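In a Colab cell, that amounts to (assuming your logs are written under `data` as above):

```
%load_ext tensorboard
%tensorboard --logdir data
```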
**`cs285/agents/base_agent.py`**

```python
class BaseAgent(object):
    def __init__(self, **kwargs):
        super(BaseAgent, self).__init__(**kwargs)

    def train(self) -> dict:
        """Return a dictionary of logging information."""
        raise NotImplementedError

    def add_to_replay_buffer(self, paths):
        raise NotImplementedError

    def sample(self, batch_size):
        raise NotImplementedError

    def save(self, path):
        raise NotImplementedError
```
**`cs285/agents/bc_agent.py`**

```python
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.policies.MLP_policy import MLPPolicySL
from .base_agent import BaseAgent


class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        log = self.actor.update(ob_no, ac_na)  # HW1: you will modify this
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
```
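As a rough usage sketch (the values below are hypothetical; the real `agent_params` dictionary is assembled from the command-line arguments in `run_hw1.py`):

```python
# Hypothetical wiring of BCAgent. The dimensions are Ant-v2's
# (ac_dim=8, ob_dim=111); the other values mirror common defaults.
agent_params = {
    'ac_dim': 8,
    'ob_dim': 111,
    'n_layers': 2,
    'size': 64,
    'discrete': False,
    'learning_rate': 5e-3,
    'max_replay_buffer_size': 1000000,
}
agent = BCAgent(env, agent_params)   # env: a gym.Env instance
agent.add_to_replay_buffer(paths)    # paths: list of rollout dicts
# assuming sample_random_data returns (ob, ac, re, next_ob, terminal):
batch = agent.sample(batch_size=100)
log = agent.train(*batch)
```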
**`cs285/infrastructure/colab_utils.py`**

```python
from gym.wrappers import RecordVideo
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI


def show_video():
    mp4list = glob.glob('/content/video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")


def wrap_env(env):
    env = RecordVideo(env, '/content/video')
    return env
```
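A sketch of how these helpers are meant to be used in the notebook (this assumes the pre-0.26 `gym` step/reset API that the rest of this code targets, and a virtual display for rendering in Colab):

```python
import gym

env = wrap_env(gym.make('Ant-v2'))  # episodes get recorded to /content/video
ob = env.reset()
for _ in range(100):
    ob, reward, done, info = env.step(env.action_space.sample())
    if done:
        break
env.close()      # finalize the mp4 before displaying
show_video()     # embed the first recorded video in the notebook
```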
**`cs285/infrastructure/logger.py`**

```python
import os
from tensorboardX import SummaryWriter
import numpy as np


class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)

    def log_image(self, image, name, step):
        assert len(image.shape) == 3  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)

    def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):

        # reshape the rollouts to [T, C, H, W] per video
        videos = [np.transpose(p['image_obs'][:, 0], [0, 3, 1, 2]) for p in paths]

        # clamp the number of videos to save, then find the max rollout length
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        max_length = videos[0].shape[0]
        for i in range(max_videos_to_save):
            if videos[i].shape[0] > max_length:
                max_length = videos[i].shape[0]

        # pad rollouts (by repeating the last frame) so they are all the same length
        for i in range(max_videos_to_save):
            if videos[i].shape[0] < max_length:
                padding = np.tile([videos[i][-1]], (max_length - videos[i].shape[0], 1, 1, 1))
                videos[i] = np.concatenate([videos[i], padding], 0)

        # log videos to the tensorboard event file
        videos = np.stack(videos[:max_videos_to_save], 0)
        self.log_video(videos, video_title, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """array: image array to log (note: plot_graph is not defined in this file)"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
```
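A minimal sketch of using `Logger` on its own (the run directory name is hypothetical; tensorboardX must be installed):

```python
logger = Logger('data/example_run')
for itr in range(10):
    logger.log_scalar(scalar=1.0 / (itr + 1), name='Training_Loss', step_=itr)
logger.flush()
# inspect with: tensorboard --logdir data
```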
**`cs285/infrastructure/pytorch_util.py`**

```python
from typing import Union

import torch
from torch import nn

Activation = Union[str, nn.Module]


_str_to_activation = {
    'relu': nn.ReLU(),
    'tanh': nn.Tanh(),
    'leaky_relu': nn.LeakyReLU(),
    'sigmoid': nn.Sigmoid(),
    'selu': nn.SELU(),
    'softplus': nn.Softplus(),
    'identity': nn.Identity(),
}


def build_mlp(
        input_size: int,
        output_size: int,
        n_layers: int,
        size: int,
        activation: Activation = 'tanh',
        output_activation: Activation = 'identity',
) -> nn.Module:
    """
    Builds a feedforward neural network.

    arguments:
        n_layers: number of hidden layers
        size: dimension of each hidden layer
        activation: activation of each hidden layer
        input_size: size of the input layer
        output_size: size of the output layer
        output_activation: activation of the output layer

    returns:
        MLP (nn.Module)
    """
    if isinstance(activation, str):
        activation = _str_to_activation[activation]
    if isinstance(output_activation, str):
        output_activation = _str_to_activation[output_activation]

    # TODO: return a MLP. This should be an instance of nn.Module
    # Note: nn.Sequential is an instance of nn.Module.
    raise NotImplementedError


device = None


def init_gpu(use_gpu=True, gpu_id=0):
    global device
    if torch.cuda.is_available() and use_gpu:
        device = torch.device("cuda:" + str(gpu_id))
        print("Using GPU id {}".format(gpu_id))
    else:
        device = torch.device("cpu")
        print("GPU not detected. Defaulting to CPU.")


def set_device(gpu_id):
    torch.cuda.set_device(gpu_id)


def from_numpy(*args, **kwargs):
    return torch.from_numpy(*args, **kwargs).float().to(device)


def to_numpy(tensor):
    return tensor.to('cpu').detach().numpy()
```
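For orientation, one way the `build_mlp` TODO could be completed (a sketch following the hints in the comment, not an official solution) is to alternate `nn.Linear` layers with the chosen activation inside an `nn.Sequential`:

```python
# Sketch: a possible body for build_mlp, replacing the raise above.
layers = []
in_size = input_size
for _ in range(n_layers):
    layers.append(nn.Linear(in_size, size))
    layers.append(activation)         # hidden-layer activation (an nn.Module)
    in_size = size
layers.append(nn.Linear(in_size, output_size))
layers.append(output_activation)      # output activation, 'identity' by default
return nn.Sequential(*layers)
```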