Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Support for robosuite environments #3

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
8 changes: 8 additions & 0 deletions cfg/behavior_policies/iris.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
teach_behavior_policy: acteach
use_meta_target: False

behavior_policy_params:
commitment_thresh: 0.6
with_commitment: True
use_learner: True
policy_choice_repeat: 30 # added to allow for policy to act for a longer period of time
28 changes: 28 additions & 0 deletions cfg/behavior_policies/iris_dqn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
teach_behavior_policy: dqn

behavior_policy_params:
commit_time: 0
do_exploration: True
num_timesteps: 100000
policy_choice_repeat: 30

dqn_params:
gamma: 0.99
learning_rate: 5.0e-4
buffer_size: 100000
exploration_fraction: 0.1
exploration_final_eps: 0.02
train_freq: 10
batch_size: 32
checkpoint_freq: 10000
checkpoint_path: null
learning_starts: 100
target_network_update_freq: 1000
prioritized_replay: False
prioritized_replay_alpha: 0.6
prioritized_replay_beta0: 0.4
prioritized_replay_beta_iters: 3000000
prioritized_replay_eps: 1.0e-6
param_noise: False
verbose: 1
full_tensorboard_log: False
9 changes: 9 additions & 0 deletions cfg/behavior_policies/iris_ep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
teach_behavior_policy: acteach
use_meta_target: False

behavior_policy_params:
commitment_thresh: 0.6
with_commitment: True
use_learner: True
policy_choice_repeat: 30
random_episode_choice: True
39 changes: 39 additions & 0 deletions cfg/sawyer_can/base/train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
env_id: SawyerPickPlaceCan
learner_type: DDPG
render: False
render_eval: False
normalize_returns: False
normalize_observations: False
seed: next
tau: 0.001
critic_l2_reg: 0.0
batch_size: 128 # per MPI worker
actor_lr: 0.0001
critic_lr: 0.001
enable_popart: False
gamma: 0.99
reward_scale: 1
clip_norm: null
noise_type: normal_0.2 # choices are adaptive-param_xx, ou_xx, normal_xx, none
load_path: null

memory_limit: 1000000
nb_train_steps: 50 # per epoch cycle and MPI worker
nb_rollout_steps: 1000 #200 # per epoch cycle and MPI worker
num_timesteps: 500000
nb_eval_steps: 1000 #100 # per epoch cycle and MPI worker
log_interval: 25
verbose: 1
do_eval: True

use_meta_target: False
teach_behavior_policy: null

policy_kwargs:
layer_norm: True
layers: [64, 64, 64]
feature_extraction: mlp # Can be mlp or cnn

env_params:
shuffle_order: True
render_q_quiver: True
41 changes: 41 additions & 0 deletions cfg/sawyer_can/base/train_dropout.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
env_id: SawyerPickPlaceCan
learner_type: Dropout DDPG
render: False
render_eval: False
normalize_returns: False
normalize_observations: False
seed: next
critic_l2_reg: 0.0
action_l2: 0.1
tau: 0.001
batch_size: 128 # per MPI worker
actor_lr: 0.0001
critic_lr: 0.001
enable_popart: False
gamma: 0.99
reward_scale: 1
clip_norm: null
noise_type: normal_0.2 # choices are adaptive-param_xx, ou_xx, normal_xx, none
load_path: null

memory_limit: 1000000
nb_train_steps: 50 # per epoch cycle and MPI worker
nb_rollout_steps: 1000 #200 # per epoch cycle and MPI worker
num_timesteps: 500000
nb_eval_steps: 1000 #100 # per epoch cycle and MPI worker
log_interval: 25
verbose: 1
do_eval: True

use_meta_target: False
teach_behavior_policy: null

dropout_tau: 10.0
include_mc_stats: True

policy_kwargs:
dropout_keep_prob: 0.9
layers: [64, 64, 64]
mc_samples: 50
layer_norm: True
feature_extraction: mlp # Can be mlp or cnn
15 changes: 15 additions & 0 deletions cfg/sawyer_can/eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml

env_id: SawyerPickPlaceCan
render: True
render_eval: True
noise_type: normal_0.0000001 # can try null here, may need a bit of debugging

load_path: tmp/model_hour_11.pkl
nb_train_steps: 0 # per epoch cycle and MPI worker
nb_rollout_steps: 200 # per epoch cycle and MPI worker
nb_eval_steps: 200 # per epoch cycle and MPI worker
log_interval: 1
verbose: 1
do_eval: True
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train.yaml
- cfg/sawyer_can/teachers/full_optimal.yaml
- cfg/behavior_policies/critic.yaml

experiment_name: efficiency_1full_optimal_ddpgcritic
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_optimal.yaml
- cfg/behavior_policies/dqn.yaml

experiment_name: efficiency_1full_optimal_dqn
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_optimal.yaml
- cfg/behavior_policies/iris.yaml

experiment_name: efficiency_1full_optimal_ours
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_optimal.yaml
- cfg/behavior_policies/acteach.yaml

experiment_name: efficiency_1full_optimal_acteach
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_optimal.yaml
- cfg/behavior_policies/random.yaml

experiment_name: efficiency_1full_optimal_random
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train.yaml
- cfg/sawyer_can/teachers/full_suboptimal.yaml
- cfg/behavior_policies/critic.yaml

experiment_name: efficiency_1full_suboptimal_ddpgcritic
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_suboptimal.yaml
- cfg/behavior_policies/dqn.yaml

experiment_name: efficiency_1full_suboptimal_dqn
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_suboptimal.yaml
- cfg/behavior_policies/iris.yaml

experiment_name: efficiency_1full_suboptimal_ours
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_suboptimal.yaml
- cfg/behavior_policies/acteach.yaml

experiment_name: efficiency_1full_suboptimal_acteach
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml
- cfg/sawyer_can/teachers/full_suboptimal.yaml
- cfg/behavior_policies/random.yaml

experiment_name: efficiency_1full_suboptimal_random
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include:
- cfg/sawyer_can/base/train_dropout.yaml

experiment_name: efficiency_no_teachers_ddpg
4 changes: 4 additions & 0 deletions cfg/sawyer_can/teachers/full_optimal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
teachers:
- type: optimal
  env: SawyerPickPlaceCan
  agent: "/home/robot/installed_libraries/batchRL/iris_trained_models/"
3 changes: 3 additions & 0 deletions cfg/sawyer_can/teachers/full_suboptimal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
teachers:
- type: optimal
noise_type: normal_0.1
39 changes: 39 additions & 0 deletions cfg/sawyer_lift/base/train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
env_id: SawyerLift
learner_type: DDPG
render: False
render_eval: False
normalize_returns: False
normalize_observations: False
seed: next
tau: 0.001
critic_l2_reg: 0.0
batch_size: 128 # per MPI worker
actor_lr: 0.0001
critic_lr: 0.001
enable_popart: False
gamma: 0.99
reward_scale: 1
clip_norm: null
noise_type: normal_0.2 # choices are adaptive-param_xx, ou_xx, normal_xx, none
load_path: null

memory_limit: 1000000
nb_train_steps: 50 # per epoch cycle and MPI worker
nb_rollout_steps: 1000 #200 # per epoch cycle and MPI worker
num_timesteps: 500000
nb_eval_steps: 1000 #100 # per epoch cycle and MPI worker
log_interval: 25
verbose: 1
do_eval: True

use_meta_target: False
teach_behavior_policy: null

policy_kwargs:
layer_norm: True
layers: [64, 64, 64]
feature_extraction: mlp # Can be mlp or cnn

env_params:
shuffle_order: True
render_q_quiver: True
41 changes: 41 additions & 0 deletions cfg/sawyer_lift/base/train_dropout.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
env_id: SawyerLift
learner_type: Dropout DDPG
render: False
render_eval: False
normalize_returns: False
normalize_observations: False
seed: next
critic_l2_reg: 0.0
action_l2: 0.1
tau: 0.001
batch_size: 128 # per MPI worker
actor_lr: 0.0001
critic_lr: 0.001
enable_popart: False
gamma: 0.99
reward_scale: 1
clip_norm: null
noise_type: normal_0.2 # choices are adaptive-param_xx, ou_xx, normal_xx, none
load_path: null

memory_limit: 1000000
nb_train_steps: 50 # per epoch cycle and MPI worker
nb_rollout_steps: 1000 #200 # per epoch cycle and MPI worker
num_timesteps: 500000
nb_eval_steps: 1000 #100 # per epoch cycle and MPI worker
log_interval: 25
verbose: 1
do_eval: True

use_meta_target: False
teach_behavior_policy: null

dropout_tau: 10.0
include_mc_stats: True

policy_kwargs:
dropout_keep_prob: 0.9
layers: [64, 64, 64]
mc_samples: 50
layer_norm: True
feature_extraction: mlp # Can be mlp or cnn
15 changes: 15 additions & 0 deletions cfg/sawyer_lift/eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml

env_id: SawyerLift
render: True
render_eval: True
noise_type: normal_0.0000001 # can try null here, may need a bit of debugging

load_path: "/Users/ajaymandlekar/Desktop/Google Drive/Stanford/ccr/ac-teach/logs/SawyerLift/efficiency_no_teachers_ddpg/seed_0/model.pkl"
nb_train_steps: 0 # per epoch cycle and MPI worker
nb_rollout_steps: 1000 #200 # per epoch cycle and MPI worker
nb_eval_steps: 1000 #200 # per epoch cycle and MPI worker
log_interval: 1
verbose: 1
do_eval: True
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/critic.yaml

experiment_name: efficiency_1full_optimal_ddpgcritic
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/dqn.yaml

experiment_name: efficiency_1full_optimal_dqn
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/iris.yaml

experiment_name: efficiency_1full_optimal_ours
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/iris_dqn.yaml

experiment_name: efficiency_1full_optimal_iris_dqn
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/iris_ep.yaml

experiment_name: efficiency_1full_optimal_iris_ep
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/acteach.yaml

experiment_name: efficiency_1full_optimal_acteach
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_optimal.yaml
- cfg/behavior_policies/random.yaml

experiment_name: efficiency_1full_optimal_random
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train.yaml
- cfg/sawyer_lift/teachers/full_suboptimal.yaml
- cfg/behavior_policies/critic.yaml

experiment_name: efficiency_1full_suboptimal_ddpgcritic
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include:
- cfg/sawyer_lift/base/train_dropout.yaml
- cfg/sawyer_lift/teachers/full_suboptimal.yaml
- cfg/behavior_policies/dqn.yaml

experiment_name: efficiency_1full_suboptimal_dqn
Loading