This repository has been archived by the owner on Feb 3, 2023. It is now read-only.
forked from berkeleydeeprlcourse/homework_fall2022
Showing 8 changed files with 299 additions and 0 deletions.

@@ -0,0 +1,240 @@

from typing import Any, Dict, List, Tuple

import os
from pathlib import Path

import tensorflow as tf
tf.get_logger().setLevel("ERROR")

from tensorflow.python.summary.summary_iterator import summary_iterator

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


run_logs_dir = os.path.join(*Path(__file__).parts[:-3], "run_logs")


def load_eventfile_by_folder_prefix(prefix: str) -> List:
    # Find the appropriate full folder name
    is_prefix = lambda s: s.startswith(prefix)
    # We take the first match by default
    full_folder_name = list(filter(is_prefix, os.listdir(run_logs_dir)))[0]

    # Get the full path of the eventfile directory
    eventfile_dir = os.path.join(run_logs_dir, full_folder_name)

    # Get the eventfile path
    eventfile_name = os.listdir(eventfile_dir)[0]
    eventfile_path = os.path.join(eventfile_dir, eventfile_name)

    return list(summary_iterator(eventfile_path))


def filter_summaries_by_tag(summaries: List, tag: str) -> List[Tuple]:
    """
    Returns (event, value) pairs for every event that contains a summary value
    with the given tag.
    """
    value_is_tag = lambda v: v.tag == tag
    get_value_tag_from_event = lambda e: next(filter(value_is_tag, e.summary.value), None)

    filtered = []
    for event in summaries:
        value = get_value_tag_from_event(event)
        if value is None:
            continue

        filtered.append((event, value))

    return filtered


def get_first_simple_value(summaries: List[Tuple]) -> float:
    """
    Takes in the output of `filter_summaries_by_tag`
    """
    return next(iter(summaries))[1].simple_value


def get_first_tag_simple_value(summaries: List, tag: str) -> float:
    filtered = filter_summaries_by_tag(summaries, tag)
    return get_first_simple_value(filtered)


def get_property_and_steps(experiment_prefix: str, property_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns a tuple of steps and property values.
    The arrays are sorted ascending in steps.
    """
    experiment_summary = load_eventfile_by_folder_prefix(experiment_prefix)

    train_returns = filter_summaries_by_tag(experiment_summary, property_name)
    steps = [r[0].step for r in train_returns]
    returns = [r[1].simple_value for r in train_returns]

    steps = np.array(steps)
    returns = np.array(returns)

    sorted_idxs = steps.argsort()

    steps = steps[sorted_idxs]
    returns = returns[sorted_idxs]

    return steps, returns


def get_train_averagereturns(experiment_prefix: str) -> Tuple[np.ndarray, np.ndarray]:
    return get_property_and_steps(experiment_prefix, "Train_AverageReturn")


def get_eval_averagereturns(experiment_prefix: str) -> Tuple[np.ndarray, np.ndarray]:
    return get_property_and_steps(experiment_prefix, "Eval_AverageReturn")


def get_train_bestreturns(experiment_prefix: str) -> Tuple[np.ndarray, np.ndarray]:
    return get_property_and_steps(experiment_prefix, "Train_BestReturn")


def q2():
    config_prefix = "hw4_q2_obstacles_singleiteration_obstacles-cs285-v0"

    rows, cols = 1, 1
    fig, ax = plt.subplots(rows, cols, figsize=(10 * cols, 8 * rows))

    def scatter_and_label(steps, returns, label, expected_return):
        points = ax.scatter(steps, returns, label=label)
        for xy in zip(steps, returns):
            ax.annotate(f"({xy[0]}, {xy[1]:.1f})", xy=xy, textcoords="data")

        ax.hlines(y=expected_return, xmin=-0.3, xmax=0.3, label=f"Expected {label.lower()}", linestyles=["--"], color=points.get_facecolor())

    scatter_and_label(*get_eval_averagereturns(config_prefix), "Eval returns", -50)
    scatter_and_label(*get_train_averagereturns(config_prefix), "Train returns", -160)

    ax.set_xlabel("Train iterations")
    ax.set_ylabel("Return")
    ax.legend()

    fig.suptitle("Single iteration MPC policy performance on the obstacles environment")
    fig.tight_layout()
    fig.savefig("report_resources/q2.png")


def q3():
    # A list (rather than a set) keeps the subplot order deterministic.
    configs = [
        ("Obstacles", "hw4_q3_obstacles_obstacles-cs285-v0", -20),
        ("Reacher", "hw4_q3_reacher_reacher-cs285-v0", -250),
        ("Cheetah", "hw4_q3_cheetah_cheetah-cs285-v0", 350),
    ]

    rows, cols = 1, 3
    fig, axs = plt.subplots(rows, cols, figsize=(10 * cols, 8 * rows))

    for ax, (config_name, config_prefix, expected_return) in zip(axs, configs):
        steps, eval_returns = get_eval_averagereturns(config_prefix)

        ax.plot(steps, eval_returns)
        ax.hlines(y=expected_return, xmin=min(steps), xmax=max(steps), label="Expected eval return", color="red")
        ax.set_title(f"MBRL performance on {config_name} environment")
        ax.set_xlabel("Train iterations")
        ax.set_ylabel("Eval average return")
        ax.legend()

    fig.suptitle("Model based RL (MBRL) performance on various environments")
    fig.tight_layout()
    fig.savefig("report_resources/q3.png")


def q4():
    prefix_template = "hw4_q4_reacher_{key}{value}_reacher-cs285-v0"
    configs = [
        {
            "name": "Ensemble size",
            "key": "ensemble",
            "values": [1, 3, 5],
        },
        {
            "name": "Horizon",
            "key": "horizon",
            "values": [5, 15, 30],
        },
        {
            "name": "Num candidate sequences",
            "key": "numseq",
            "values": [100, 1000],
        },
    ]

    rows, cols = 1, 3
    fig, axs = plt.subplots(rows, cols, figsize=(10 * cols, 8 * rows))

    for ax, config in zip(axs, configs):
        name, key, values = config["name"], config["key"], config["values"]
        for value in values:
            config_prefix = prefix_template.format(key=key, value=value)
            steps, eval_returns = get_eval_averagereturns(config_prefix)

            ax.plot(steps, eval_returns, label=f"{name}={value}")

        ax.set_title(f"Ablation over {name.lower()}")
        ax.set_xlabel("Train iterations")
        ax.set_ylabel("Eval average return")
        ax.legend()

    fig.suptitle("Ablation of model-based RL (MBRL) performance on reacher environment")
    fig.tight_layout()
    fig.savefig("report_resources/q4.png")


def q5():
    configs = {
        "CEM 2 iterations": "hw4_q5_cheetah_cem_2_cheetah-cs285-v0",
        "CEM 4 iterations": "hw4_q5_cheetah_cem_4_cheetah-cs285-v0",
        "Random shooting": "hw4_q5_cheetah_random_cheetah-cs285-v0",
    }

    rows, cols = 1, 1
    fig, ax = plt.subplots(rows, cols, figsize=(10 * cols, 8 * rows))

    for config_name, config_prefix in configs.items():
        steps, eval_returns = get_eval_averagereturns(config_prefix)

        ax.plot(steps, eval_returns, label=config_name)

    ax.set_xlabel("Train iterations")
    ax.set_ylabel("Eval average return")
    ax.legend()

    fig.suptitle("Comparison of sampling methods for Model based RL (MBRL) performance on cheetah environment")
    fig.tight_layout()
    fig.savefig("report_resources/q5.png")


def q6():
    configs = {
        "MBPO rollout length 0": "hw4_q6_cheetah_rlenl0_cheetah-cs285-v0",
        "MBPO rollout length 1": "hw4_q6_cheetah_rlen1_cheetah-cs285-v0",
        "MBPO rollout length 10": "hw4_q6_cheetah_rlen10_cheetah-cs285-v0",
    }

    rows, cols = 1, 1
    fig, ax = plt.subplots(rows, cols, figsize=(10 * cols, 8 * rows))

    for config_name, config_prefix in configs.items():
        steps, eval_returns = get_eval_averagereturns(config_prefix)

        ax.plot(steps, eval_returns, label=config_name)

    ax.set_xlabel("Train iterations")
    ax.set_ylabel("Eval average return")
    ax.legend()

    fig.suptitle("Comparison of rollout lengths for model-based policy optimization (MBPO) performance on cheetah environment")
    fig.tight_layout()
    fig.savefig("report_resources/q6.png")


if __name__ == "__main__":
    q3()
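
# Note: only q3() is run above; the other report figures can be regenerated by
# calling the remaining plotting functions as well, e.g.
#
#     for plot_fn in (q2, q3, q4, q5, q6):
#         plot_fn()
#
# This assumes the matching hw4_* experiment folders exist under run_logs/ and
# that the report_resources/ output directory has already been created.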

@@ -0,0 +1,59 @@

# Problem 1
<div style="text-align: center">
<img src="run_logs/hw4_q1_cheetah_n500_arch1x32_cheetah-cs285-v0_01-11-2022_22-54-53/itr_0_predictions.png" width="250" height="200" />
<img src="run_logs/hw4_q1_cheetah_n500_arch2x250_cheetah-cs285-v0_01-11-2022_22-55-11/itr_0_predictions.png" width="250" height="200" />
<img src="run_logs/hw4_q1_cheetah_n5_arch2x250_cheetah-cs285-v0_01-11-2022_22-55-04/itr_0_predictions.png" width="250" height="200" />
</div>

The figures above show the prediction results for a small model (1 layer of size 32) trained for 500 iterations (left), a large model (2 layers of size 250) trained for 500 iterations (center), and a large model (2 layers of size 250) trained for only 5 iterations (right).

The large model trained for 500 iterations clearly performs the best, with the lowest mean prediction error (MPE) of 0.028, outclassing the small model trained for the same number of iterations. However, the large model also needs many training iterations: the same architecture trained for only 5 iterations performed far worse than either of the other two configurations.

The environment dynamics appear to be too complex for the small model to learn accurately, and also too complex for the large model to fit well without sufficient training data.

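For reference, a minimal sketch of how a mean prediction error of this kind can be computed; the `dyn_model.predict(obs, acs)` interface and the use of a plain mean absolute error are illustrative assumptions, not necessarily the exact metric reported by the homework scripts:

```python
import numpy as np

def mean_prediction_error(dyn_model, obs, acs, next_obs):
    """Average error between model-predicted and true next states.

    `dyn_model.predict` is a hypothetical interface, and the actual MPE
    may normalize states or use a squared rather than absolute error.
    """
    pred_next_obs = dyn_model.predict(obs, acs)   # shape (N, obs_dim)
    return float(np.mean(np.abs(pred_next_obs - next_obs)))
```
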
<div style="page-break-after: always;"></div>

# Problem 2
<div style="text-align: center">
<img src="report_resources/q2.png" width="250" height="200" />
</div>

<div style="page-break-after: always;"></div>

# Problem 3
<div style="text-align: center">
<img src="report_resources/q3.png" width="750" height="200" />
</div>

<div style="page-break-after: always;"></div>

# Problem 4
<div style="text-align: center">
<img src="report_resources/q4.png" width="750" height="200" />
</div>

All three of the ablated ensemble sizes reached roughly the same eval average return, with an ensemble size of 3 doing so most consistently. An ensemble size of 5 had the lowest initial performance, probably because the average of the initial estimates from more networks is more variable at the beginning of training.

The run with a horizon length of 5 was wildly variable, jumping up and down, which suggests that a horizon of 5 is too short. The run with a horizon length of 30 performed much worse than both of the other configurations, which suggests that 30 is too long. This is probably due to distributional drift, where compounding model error makes predictions beyond a certain horizon no longer useful. The best horizon length was 15, the middle ground between the two. The optimal horizon length probably depends on the maximum number of steps in a particular environment and on how frequently rewards are received.

The ablation over the number of generated candidate action sequences tells us that the more candidate sequences we generate, the better our estimate of the true "optimal" action: the configuration with 1000 candidate sequences was both less variable and better performing than the one with 100. A sketch of the action-selection procedure that these three hyperparameters control is given below.

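The following is a minimal sketch of random-shooting MPC action selection, to make the roles of the ensemble size, planning horizon, and number of candidate sequences concrete. The `predict(obs, acs)` method on each ensemble member and the vectorized `reward_fn(obs, acs)` are assumed interfaces; the homework's implementation may, for example, score each candidate under each ensemble member separately rather than averaging predicted next states.

```python
import numpy as np

def random_shooting_action(obs, ensemble, reward_fn, ac_dim, ac_low, ac_high,
                           horizon=15, num_sequences=1000):
    """Return the first action of the best-scoring random candidate sequence."""
    # Sample candidate action sequences uniformly within the action bounds.
    acs = np.random.uniform(ac_low, ac_high, size=(num_sequences, horizon, ac_dim))

    total_rewards = np.zeros(num_sequences)
    obs_batch = np.tile(obs, (num_sequences, 1))
    for t in range(horizon):
        total_rewards += reward_fn(obs_batch, acs[:, t])
        # Average next-state predictions over the ensemble; model error
        # compounds as t grows, which is why very long horizons hurt.
        obs_batch = np.mean(
            [model.predict(obs_batch, acs[:, t]) for model in ensemble], axis=0
        )

    return acs[np.argmax(total_rewards), 0]
```
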
<div style="page-break-after: always;"></div>

# Problem 5
<div style="text-align: center">
<img src="report_resources/q5.png" width="250" height="200" />
</div>

The cross-entropy method (CEM) is much more consistent than random shooting, both in absolute average return and in how steadily it improves.

CEM with 4 iterations is much better than CEM with 2 iterations: the former makes large performance gains over single training iterations and finishes at almost double the score of the latter. This is probably because CEM iteratively refits its sampling distribution to the elites, so additional iterations reduce the variance of the normal distribution (with elite mean and variance) that candidate sequences are drawn from and sharpen the estimate of the best action sequence. A sketch of this refinement loop is given below.

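A minimal sketch of the CEM refinement loop over action sequences, assuming an `evaluate(acs)` callable that scores each candidate sequence under the learned dynamics model (the function name and the specific elite count are illustrative assumptions):

```python
import numpy as np

def cem_plan(evaluate, ac_dim, horizon, num_sequences=1000,
             num_iterations=4, num_elites=100, ac_low=-1.0, ac_high=1.0):
    """Return the elite-mean action sequence after a few CEM iterations."""
    mean = np.zeros((horizon, ac_dim))
    std = np.ones((horizon, ac_dim))

    for _ in range(num_iterations):
        # Sample candidates from the current Gaussian and clip to the action bounds.
        acs = np.random.normal(mean, std, size=(num_sequences, horizon, ac_dim))
        acs = np.clip(acs, ac_low, ac_high)

        # Keep the top-scoring "elite" sequences and refit the Gaussian to them;
        # each iteration shrinks the sampling variance around good sequences.
        scores = evaluate(acs)                     # shape (num_sequences,)
        elites = acs[np.argsort(scores)[-num_elites:]]
        mean, std = elites.mean(axis=0), elites.std(axis=0)

    return mean  # MPC would execute the first action, mean[0]
```
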
<div style="page-break-after: always;"></div>

# Problem 6
<div style="text-align: center">
<img src="report_resources/q6.png" width="250" height="200" />
</div>

MBPO with rollout lengths of 0 and 1 makes progress toward non-negative reward, but is unable to move much past an eval return of about 0: both runs reach roughly 0 return and then oscillate around it. This is probably because such short model rollouts are not informative enough for the policy in this particular environment. The configuration with rollout length 10 improves steadily over iterations, reaching an impressive eval average return of around 2000. MBPO with rollouts of length 10 makes significant progress in far fewer policy training iterations than the model-free and Dyna-style policy optimization variants.

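A minimal sketch of the mechanism being compared: short rollouts in the learned model, branched from real states, that augment the policy's training data. The `model.predict`, `policy.get_action`, and `reward_fn` interfaces are assumptions for illustration; a rollout length of 0 corresponds to training the policy on real data only.

```python
def collect_model_rollouts(model, policy, reward_fn, start_obs, rollout_length):
    """Generate MBPO-style rollouts of length `rollout_length` in the learned model."""
    transitions = []
    for ob in start_obs:                     # states sampled from the real replay buffer
        for _ in range(rollout_length):      # rollout_length = 0 adds no model data
            ac = policy.get_action(ob)
            next_ob = model.predict(ob, ac)  # one step in the learned dynamics model
            rew = reward_fn(ob, ac)
            transitions.append((ob, ac, rew, next_ob))
            ob = next_ob
    return transitions                       # added to the policy's replay buffer
```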