Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Eval checkpoint metrics #5

Open
wants to merge 16 commits into
base: kushal/invisible_dataset
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 57 additions & 10 deletions scripts/eval_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import random
from tqdm import tqdm
import json
from xskill.utility.observation_indices import ACTION_INDICES

def create_policy_nets(cfg):
"""
Expand Down Expand Up @@ -81,6 +82,21 @@ def create_policy_nets(cfg):

return nets

def get_attempted_unspecified_tasks(task_list, initial_obs, final_obs, epsilon=1e-3):
    """Count tasks that were attempted even though they were not specified.

    A task counts as "attempted" when any of the observation entries it
    controls (per ``ACTION_INDICES``) changed by more than ``epsilon``
    between the initial and final observation.

    Args:
        task_list: Tasks that were explicitly specified; never counted.
        initial_obs: Observation vector at the start of the episode.
        final_obs: Observation vector at the end of the episode.
        epsilon: Minimum absolute per-entry change to count as movement.

    Returns:
        int: Number of unspecified tasks whose observation entries moved.
    """
    obs_diff = np.abs(np.asarray(final_obs) - np.asarray(initial_obs))
    # Drop alias tasks whose name minus a 5-char prefix ("full ") is itself a
    # known task: "full slide cabinet" etc. share indices with their base task,
    # so keeping both would double-count one physical movement.
    # NOTE(review): this also drops "lift kettle" (prefix "lift "), which
    # shares its indices with "kettle" — so no double counting there either.
    unspecified_tasks = [
        task
        for task in ACTION_INDICES
        if task not in task_list
        and (len(task) < 5 or task[5:] not in ACTION_INDICES)
    ]
    # A task was "attempted" if any of its observation entries moved.
    return sum(
        1
        for task in unspecified_tasks
        if np.any(obs_diff[ACTION_INDICES[task]] > epsilon)
    )



@hydra.main(
version_base=None,
config_path="../config/simulation",
Expand Down Expand Up @@ -125,10 +141,16 @@ def main(cfg: DictConfig):
eval_eps = np.arange(len(eval_mask))[eval_mask]

result_dict = {
'robot':{f'{ckpt_num}': {} for ckpt_num in cfg.checkpoint_list},
'human':{f'{ckpt_num}': {} for ckpt_num in cfg.checkpoint_list}
'robot':{
f'{ckpt_num}': {f'{speed}': {} for speed in cfg.robot_speeds} for ckpt_num in cfg.checkpoint_list
},
'human':{
f'{ckpt_num}': {f'{speed}': {} for speed in cfg.human_speeds} for ckpt_num in cfg.checkpoint_list
}
}



speeds = {
'robot': cfg.robot_speeds,
'human': cfg.human_speeds
Expand All @@ -139,39 +161,64 @@ def main(cfg: DictConfig):
for demo_type in ['robot', 'human']:
cfg.eval_cfg.demo_type = demo_type
for speed in speeds[demo_type]:
task_list=["slide cabinet", "light switch", "kettle", "microwave"]
eval_callback.task_progess_ratio = speed
tasks_completed = 0
all_correct_count = 0
num_unspecified_tasks = 0
for seed in eval_eps:
cfg.eval_cfg.demo_item = seed.item()
num_completed, _ = eval_callback.eval(
num_completed, _, initial_obs, final_obs = eval_callback.eval(
nets,
noise_scheduler,
stats,
cfg.eval_cfg,
save_dir,
seed,
epoch_num=None
epoch_num=None,
task_list=task_list
)
tasks_completed += num_completed
if num_completed == len(task_list):
all_correct_count += 1

num_unspecified_tasks += get_attempted_unspecified_tasks(task_list, initial_obs, final_obs)

result_dict[demo_type][f'{ckpt_num}'][f'{speed}'] = tasks_completed / (4*len(eval_eps))

result_dict[demo_type][f'{ckpt_num}'][f'{speed}']['share-of-tasks'] = tasks_completed / (4*len(eval_eps))
result_dict[demo_type][f'{ckpt_num}'][f'{speed}']['all-tasks'] = all_correct_count / len(eval_eps)
result_dict[demo_type][f'{ckpt_num}'][f'{speed}']['num-unspecified'] = num_unspecified_tasks / len(eval_eps)
print(result_dict)

with open(os.path.join(save_dir, "policy_results.json"), "w") as outfile:
json.dump(result_dict, outfile)

averages = {"robot": {f'{speed}': 0 for speed in speeds['robot']}, "human": {f'{speed}': 0 for speed in speeds['human']}}
counts = {"robot": {f'{speed}': 0 for speed in speeds['robot']}, "human": {f'{speed}': 0 for speed in speeds['human']}}
averages = {
"robot": {
f'{speed}': {'share-of-tasks': 0, 'all-tasks': 0, 'num-unspecified': 0} for speed in speeds['robot']
},
"human": {
f'{speed}': {'share-of-tasks': 0, 'all-tasks': 0, 'num-unspecified': 0} for speed in speeds['human']
}
}
counts = {
"robot": {f'{speed}': 0 for speed in speeds['robot']},
"human": {f'{speed}': 0 for speed in speeds['human']}
}

for demo_type, values in result_dict.items():
for ckpt_num, acc_dicts in values.items():
for exec_speed, acc in acc_dicts.items():
averages[demo_type][exec_speed] += acc
averages[demo_type][exec_speed]['share-of-tasks'] += acc['share-of-tasks']
averages[demo_type][exec_speed]['all-tasks'] += acc['all-tasks']
averages[demo_type][exec_speed]['num-unspecified'] += acc['num-unspecified']
counts[demo_type][exec_speed] += 1

for demo_type, values in averages.items():
for exec_speed, summed_acc in values.items():
averages[demo_type][exec_speed] /= counts[demo_type][exec_speed]
averages[demo_type][exec_speed]['share-of-tasks'] /= counts[demo_type][exec_speed]
averages[demo_type][exec_speed]['all-tasks'] /= counts[demo_type][exec_speed]
averages[demo_type][exec_speed]['num-unspecified'] /= counts[demo_type][exec_speed]

with open(os.path.join(save_dir, "policy_results_avg.json"), "w") as outfile:
json.dump(averages, outfile)
Expand Down
10 changes: 6 additions & 4 deletions xskill/utility/diffusion_bc_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def create_env(self):
env = KitchenAllV0(use_abs_action=True)
return env

def eval(self, nets, noise_scheduler, stats, eval_cfg, save_path, seed, epoch_num=None):
def eval(self, nets, noise_scheduler, stats, eval_cfg, save_path, seed, epoch_num=None, task_list=["slide cabinet", "light switch", "kettle", "microwave"]):
"""
pretrain resize doesn't matter here.
use bc_resize to resize the env input to desired size
Expand Down Expand Up @@ -276,6 +276,7 @@ def eval(self, nets, noise_scheduler, stats, eval_cfg, save_path, seed, epoch_nu
# get first observation
max_steps = eval_cfg.max_steps
obs = self.env.reset()
initial_obs = obs
# keep a queue of last 2 steps of observations
obs_horizon = eval_cfg.obs_horizon
img_obs_deque = collections.deque(
Expand All @@ -295,8 +296,7 @@ def eval(self, nets, noise_scheduler, stats, eval_cfg, save_path, seed, epoch_nu
B = 1

# track completion order
task_stack = deque(
["slide cabinet", "light switch", "kettle", "microwave"])
task_stack = deque(task_list)
complete_queue = queue.Queue()
predict_protos = []
while not done:
Expand Down Expand Up @@ -402,6 +402,8 @@ def eval(self, nets, noise_scheduler, stats, eval_cfg, save_path, seed, epoch_nu
if step_idx > max_steps:
done = True

final_obs = obs

predict_protos = np.array(predict_protos)
eval_save_path = os.path.join(save_path, "evaluation")
if epoch_num is not None:
Expand Down Expand Up @@ -436,4 +438,4 @@ def eval(self, nets, noise_scheduler, stats, eval_cfg, save_path, seed, epoch_nu
if task == task_stack[-1]:
task_stack.pop()
order_task_completed_reward += 1
return len(total_task_completed), order_task_completed_reward
return len(total_task_completed), order_task_completed_reward, initial_obs, final_obs
35 changes: 35 additions & 0 deletions xskill/utility/observation_indices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Observation indices and goal values for kitchen-environment tasks."""
import numpy as np

# Indices into the environment observation vector controlled by each task.
# "full ..." aliases reuse the indices of their base task — they differ only
# in the goal value the object must reach (see ACTION_GOALS below).
ACTION_INDICES = {
    "bottom burner": np.array([11, 12]),
    "top burner": np.array([15, 16]),
    "light switch": np.array([17, 18]),
    "slide cabinet": np.array([19]),
    "full slide cabinet": np.array([19]),
    "hinge cabinet": np.array([20, 21]),
    "full hinge cabinet": np.array([20, 21]),
    "microwave": np.array([22]),
    "full microwave": np.array([22]),
    "kettle": np.array([23, 24, 25, 26, 27, 28, 29]),
    "lift kettle": np.array([23, 24, 25, 26, 27, 28, 29]),
}

# Target values (at the indices above) that define task completion.
# "full ..." variants require a larger displacement than the base task;
# "lift kettle" is a sequence of waypoints rather than a single goal.
ACTION_GOALS = {
    "bottom burner": [np.array([-0.88, 0])],
    "top burner": [np.array([-0.92, 0])],
    "light switch": [np.array([-0.69, -0.05])],
    "slide cabinet": [np.array([0.37])],
    "full slide cabinet": [np.array([0.5])],
    "hinge cabinet": [np.array([0.0, 1.45])],
    "full hinge cabinet": [np.array([0.0, 1.7])],
    "microwave": [np.array([-0.75])],
    "full microwave": [np.array([-1.5])],
    "kettle": [np.array([-0.23, 0.75, 1.62, 0.99, 0.0, 0.0, -0.06])],
    "lift kettle": [
        np.array([-0.26, 0.3, 1.9, 0.99, 0.0, 0.0, -0.06]),
        np.array([-0.26, 0.65, 1.8, 0.99, 0.0, 0.0, -0.06]),
        np.array([-0.23, 0.75, 1.62, 0.99, 0.0, 0.0, -0.06]),
    ],
}

# Initial kettle state (7 entries matching the "kettle" indices above);
# presumably xyz position + orientation quaternion — TODO confirm against env.
KETTLE_INIT = np.array([-0.269, 0.35, 1.62, 0.99, 0.0, 0.0, 0.0])