From a8e37cb9218cdd3c20eedd91e4e46021f1f3a81c Mon Sep 17 00:00:00 2001 From: DelinQu Date: Thu, 4 Jul 2024 21:18:28 +0800 Subject: [PATCH 1/9] openvla intergration --- scripts/openvla_bridge.sh | 50 +++++ scripts/openvla_drawer_variant_agg.sh | 82 +++++++ scripts/openvla_drawer_visual_matching.sh | 132 ++++++++++++ scripts/openvla_move_near_variant_agg.sh | 138 ++++++++++++ scripts/openvla_move_near_visual_matching.sh | 33 +++ scripts/openvla_pick_coke_can_variant_agg.sh | 182 ++++++++++++++++ .../openvla_pick_coke_can_visual_matching.sh | 40 ++++ scripts/openvla_put_in_drawer_variant_agg.sh | 78 +++++++ .../openvla_put_in_drawer_visual_matching.sh | 65 ++++++ simpler_env/main_inference.py | 10 +- simpler_env/policies/openvla/openvla_model.py | 203 ++++++++++++++++++ 11 files changed, 1012 insertions(+), 1 deletion(-) create mode 100644 scripts/openvla_bridge.sh create mode 100644 scripts/openvla_drawer_variant_agg.sh create mode 100644 scripts/openvla_drawer_visual_matching.sh create mode 100644 scripts/openvla_move_near_variant_agg.sh create mode 100644 scripts/openvla_move_near_visual_matching.sh create mode 100644 scripts/openvla_pick_coke_can_variant_agg.sh create mode 100644 scripts/openvla_pick_coke_can_visual_matching.sh create mode 100644 scripts/openvla_put_in_drawer_variant_agg.sh create mode 100644 scripts/openvla_put_in_drawer_visual_matching.sh create mode 100644 simpler_env/policies/openvla/openvla_model.py diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh new file mode 100644 index 0000000..e8c8289 --- /dev/null +++ b/scripts/openvla_bridge.sh @@ -0,0 +1,50 @@ +gpu_id=0 +policy_model=openvla +ckpt_path="openvla/openvla-7b" + +scene_name=bridge_table_1_v1 +robot=widowx +rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png +robot_init_x=0.147 +robot_init_y=0.028 +export DISPLAY=:1.0 +# VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json +# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +# --robot ${robot} --policy-setup widowx_bridge \ +# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ +# --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ +# --rgb-overlay-path ${rgb_overlay_path} \ +# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ +# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; +# +# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +# --robot ${robot} --policy-setup widowx_bridge \ +# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ +# --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ +# --rgb-overlay-path ${rgb_overlay_path} \ +# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ +# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; +# +# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +# --robot ${robot} --policy-setup widowx_bridge \ +# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ +# --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ +# --rgb-overlay-path ${rgb_overlay_path} \ +# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode 
--obj-episode-range 0 24 \ +# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + + +scene_name=bridge_table_1_v2 +robot=widowx_sink_camera_setup +rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png +robot_init_x=0.127 +robot_init_y=0.06 + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + diff --git a/scripts/openvla_drawer_variant_agg.sh b/scripts/openvla_drawer_variant_agg.sh new file mode 100644 index 0000000..07f6460 --- /dev/null +++ b/scripts/openvla_drawer_variant_agg.sh @@ -0,0 +1,82 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +declare -a ckpt_paths=( +"openvla/openvla-7b" +) + +declare -a env_names=( +OpenTopDrawerCustomInScene-v0 +OpenMiddleDrawerCustomInScene-v0 +OpenBottomDrawerCustomInScene-v0 +CloseTopDrawerCustomInScene-v0 +CloseMiddleDrawerCustomInScene-v0 +CloseBottomDrawerCustomInScene-v0 +) + +EXTRA_ARGS="--enable-raytracing" + + +# base setup +scene_name=frl_apartment_stage_simple + +EvalSim() { + echo ${ckpt_path} ${env_name} + + python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.65 0.85 3 --robot-init-y -0.2 0.2 3 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalSim + done +done + + +# backgrounds + +declare -a scene_names=( +"modern_bedroom_no_roof" +"modern_office_no_roof" +) + +for scene_name in "${scene_names[@]}"; do + for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt" + EvalSim + done + done +done + + +# lightings +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker" + EvalSim + done +done + + +# new cabinets +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3" + EvalSim + done +done diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh new file mode 100644 index 0000000..a021660 --- /dev/null +++ b/scripts/openvla_drawer_visual_matching.sh @@ -0,0 +1,132 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as 
policies often rely on shadows to infer depth +declare -a ckpt_paths=( +"openvla/openvla-7b" +) + +declare -a env_names=( +OpenTopDrawerCustomInScene-v0 +OpenMiddleDrawerCustomInScene-v0 +OpenBottomDrawerCustomInScene-v0 +CloseTopDrawerCustomInScene-v0 +CloseMiddleDrawerCustomInScene-v0 +CloseBottomDrawerCustomInScene-v0 +) + +# URDF variations +declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) + +for urdf_version in "${urdf_version_arr[@]}"; do + +EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" + +EvalOverlay() { +# A0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ + ${EXTRA_ARGS} + +# A1 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ + ${EXTRA_ARGS} + +# A2 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ + ${EXTRA_ARGS} + +# B0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ + ${EXTRA_ARGS} + +# B1 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ + ${EXTRA_ARGS} + +# B2 +python simpler_env/main_inference.py --policy-model openvla 
--ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ + ${EXTRA_ARGS} + +# C0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ + ${EXTRA_ARGS} + +# C1 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ + ${EXTRA_ARGS} + +# C2 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalOverlay + done +done + + + +done diff --git a/scripts/openvla_move_near_variant_agg.sh b/scripts/openvla_move_near_variant_agg.sh new file mode 100644 index 0000000..e5ad488 --- /dev/null +++ b/scripts/openvla_move_near_variant_agg.sh @@ -0,0 +1,138 @@ + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +# base setup + +env_name=MoveNearGoogleInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + + + +# distractor + +for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 
--robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs no_distractor=True; + +done + + +# backgrounds + +env_name=MoveNearGoogleInScene-v0 +declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ + "google_pick_coke_can_1_v4_alt_background_2") + +for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + +done + + + + + +# lighting + +env_name=MoveNearGoogleInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs slightly_darker_lighting=True; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs slightly_brighter_lighting=True; + +done + + + + + +# table textures + +env_name=MoveNearGoogleInScene-v0 +declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ + "Baked_sc1_staging_objaverse_cabinet2_h870") + +for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + +done + + + + +# camera orientations + +declare -a env_arr=("MoveNearAltGoogleCameraInScene-v0" \ + "MoveNearAltGoogleCamera2InScene-v0") +scene_name=google_pick_coke_can_1_v4 + +for env_name in "${env_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 
60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; + +done + +done diff --git a/scripts/openvla_move_near_visual_matching.sh b/scripts/openvla_move_near_visual_matching.sh new file mode 100644 index 0000000..7e334d8 --- /dev/null +++ b/scripts/openvla_move_near_visual_matching.sh @@ -0,0 +1,33 @@ + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") + +env_name=MoveNearGoogleBakedTexInScene-v0 +# env_name=MoveNearGoogleBakedTexInScene-v1 +scene_name=google_pick_coke_can_1_v4 +rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png + +# URDF variations +declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +for urdf_version in "${urdf_version_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs urdf_version=${urdf_version} \ + --additional-env-save-tags baked_except_bpb_orange; + +done + +done diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/scripts/openvla_pick_coke_can_variant_agg.sh new file mode 100644 index 0000000..72e9160 --- /dev/null +++ b/scripts/openvla_pick_coke_can_variant_agg.sh @@ -0,0 +1,182 @@ + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") + +# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically +declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +# base setup + +env_name=GraspSingleOpenedCokeCanInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + + + +# table textures + +env_name=GraspSingleOpenedCokeCanInScene-v0 + +declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ + "Baked_sc1_staging_objaverse_cabinet2_h870") + + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 
5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + +done + + + + +# distractors + +env_name=GraspSingleOpenedCokeCanDistractorInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option} distractor_config=more; + +done + +done + + + + +# backgrounds + +env_name=GraspSingleOpenedCokeCanInScene-v0 +declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ + "google_pick_coke_can_1_v4_alt_background_2") + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for scene_name in "${scene_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + +done + + + +# lightings + +env_name=GraspSingleOpenedCokeCanInScene-v0 +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + 
--additional-env-build-kwargs ${coke_can_option} slightly_brighter_lighting=True; + +done + +done + + + + +# camera orientations + +declare -a env_arr=("GraspSingleOpenedCokeCanAltGoogleCameraInScene-v0" \ + "GraspSingleOpenedCokeCanAltGoogleCamera2InScene-v0") +scene_name=google_pick_coke_can_1_v4 + +for coke_can_option in "${coke_can_options_arr[@]}"; + +do for env_name in "${env_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option}; + +done + +done + +done diff --git a/scripts/openvla_pick_coke_can_visual_matching.sh b/scripts/openvla_pick_coke_can_visual_matching.sh new file mode 100644 index 0000000..083a71e --- /dev/null +++ b/scripts/openvla_pick_coke_can_visual_matching.sh @@ -0,0 +1,40 @@ + + +gpu_id=0 + +declare -a arr=("openvla/openvla-7b") + +# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically +declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") + +# URDF variations +declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") + +env_name=GraspSingleOpenedCokeCanInScene-v0 +scene_name=google_pick_coke_can_1_v4 +rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + + +for urdf_version in "${urdf_version_arr[@]}"; + +do for coke_can_option in "${coke_can_options_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; + +done + +done + +done diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/scripts/openvla_put_in_drawer_variant_agg.sh new file mode 100644 index 0000000..a3c63a1 --- /dev/null +++ b/scripts/openvla_put_in_drawer_variant_agg.sh @@ -0,0 +1,78 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth + + + +declare -a arr=("openvla/openvla-7b") + +declare -a env_names=( +PlaceIntoClosedTopDrawerCustomInScene-v0 +) + +EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs model_ids=apple" + + +# base setup +scene_name=frl_apartment_stage_simple + +EvalSim() { + echo ${ckpt_path} ${env_name} + + python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name 
${env_name} --scene-name ${scene_name} \ + --robot-init-x 0.65 0.65 1 --robot-init-y -0.2 0.2 3 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalSim + done +done + + +# backgrounds + +declare -a scene_names=( +"modern_bedroom_no_roof" +"modern_office_no_roof" +) + +for scene_name in "${scene_names[@]}"; do + for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt model_ids=apple" + EvalSim + done + done +done + + +# lightings +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter model_ids=apple" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker model_ids=apple" + EvalSim + done +done + + +# new cabinets +scene_name=frl_apartment_stage_simple + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2 model_ids=apple" + EvalSim + EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3 model_ids=apple" + EvalSim + done +done diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/scripts/openvla_put_in_drawer_visual_matching.sh new file mode 100644 index 0000000..98539bf --- /dev/null +++ b/scripts/openvla_put_in_drawer_visual_matching.sh @@ -0,0 +1,65 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth + +declare -a arr=("openvla/openvla-7b") + + +declare -a env_names=( +PlaceIntoClosedTopDrawerCustomInScene-v0 +# PlaceIntoClosedMiddleDrawerCustomInScene-v0 +# PlaceIntoClosedBottomDrawerCustomInScene-v0 +) + + +# URDF variations +declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) + +for urdf_version in "${urdf_version_arr[@]}"; do + +EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version} model_ids=baked_apple_v2" + + +EvalOverlay() { +# A0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ + ${EXTRA_ARGS} + +# B0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + --rgb-overlay-path 
./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ + ${EXTRA_ARGS} + +# C0 +python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ + ${EXTRA_ARGS} +} + + +for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalOverlay + done +done + + + +done diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py index 320f268..7ebe20c 100644 --- a/simpler_env/main_inference.py +++ b/simpler_env/main_inference.py @@ -7,6 +7,7 @@ from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator from simpler_env.policies.octo.octo_server_model import OctoServerInference from simpler_env.policies.rt1.rt1_model import RT1Inference +from simpler_env.policies.openvla.openvla_model import OpenVALInference try: from simpler_env.policies.octo.octo_model import OctoInference @@ -28,7 +29,7 @@ gpus[0], [tf.config.LogicalDeviceConfiguration(memory_limit=args.tf_memory_limit)], ) - + print(f"**** {args.policy_model} ****") # policy model creation; update this if you are using a new policy model if args.policy_model == "rt1": assert args.ckpt_path is not None @@ -53,6 +54,13 @@ init_rng=args.octo_init_rng, action_scale=args.action_scale, ) + elif args.policy_model == "openvla": + assert args.ckpt_path is not None + model = OpenVALInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) else: raise NotImplementedError() diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py new file mode 100644 index 0000000..66bdf0d --- /dev/null +++ b/simpler_env/policies/openvla/openvla_model.py @@ -0,0 +1,203 @@ +from collections import deque +from typing import Optional, Sequence +import os +import matplotlib.pyplot as plt +import numpy as np +from transforms3d.euler import euler2axangle +from simpler_env.utils.action.action_ensemble import ActionEnsembler +from transformers import AutoModelForVision2Seq, AutoProcessor +from PIL import Image +import torch +import cv2 as cv + + +class OpenVALInference: + def __init__( + self, + saved_model_path: str = "openvla/openvla-7b", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + horizon: int = 2, + pred_action_horizon: int = 1, + exec_horizon: int = 1, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + ) -> None: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key + action_ensemble = True + action_ensemble_temp = 0.0 + self.sticky_gripper_num_repeat = 1 + elif policy_setup == "google_robot": + unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key + action_ensemble = True + action_ensemble_temp = 0.0 + self.sticky_gripper_num_repeat = 15 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." 
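+                # Descriptive note (added): only the widowx_bridge and google_robot setups are mapped to an
+                # unnorm_key above; unnorm_key picks which dataset's action statistics are used to un-normalize
+                # the predicted actions, and the available dataset names are listed in the checkpoint's config.json.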
+ ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.processor = AutoProcessor.from_pretrained(saved_model_path, trust_remote_code=True) + self.vla = AutoModelForVision2Seq.from_pretrained( + "openvla/openvla-7b", + attn_implementation="flash_attention_2", # [Optional] Requires `flash_attn` + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + ).cuda() + + self.image_size = image_size + self.action_scale = action_scale + self.horizon = horizon + self.pred_action_horizon = pred_action_horizon + self.exec_horizon = exec_horizon + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.task = None + self.task_description = None + self.image_history = deque(maxlen=self.horizon) + if self.action_ensemble: + self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp) + else: + self.action_ensembler = None + self.num_image_history = 0 + + def _add_image_to_history(self, image: np.ndarray) -> None: + self.image_history.append(image) + self.num_image_history = min(self.num_image_history + 1, self.horizon) + + def reset(self, task_description: str) -> None: + self.task_description = task_description + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.num_image_history = 0 + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + self._add_image_to_history(self._resize_image(image)) + + image: Image.Image = Image.fromarray(image) + prompt = task_description + + # predict action (7-dof; un-normalize for bridgev2) + inputs = self.processor(prompt, image).to("cuda:0", dtype=torch.bfloat16) + raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None] + # print(f"*** raw actions {raw_actions} ***") + + if self.action_ensemble: + raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array(raw_actions[0, 6:7]), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = 
raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False: + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) From 7bed3d4bce61ef2bf58ccbaf8f81ccff306883b9 Mon Sep 17 00:00:00 2001 From: DelinQu Date: Fri, 5 Jul 2024 18:23:37 +0800 Subject: [PATCH 2/9] intergrate openvla policy and PR --- scripts/openvla_bridge.sh | 49 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh index e8c8289..80e733e 100644 --- a/scripts/openvla_bridge.sh +++ b/scripts/openvla_bridge.sh @@ -7,31 +7,30 @@ robot=widowx rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png robot_init_x=0.147 robot_init_y=0.028 -export DISPLAY=:1.0 -# VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json -# python simpler_env/main_inference.py --policy-model 
${policy_model} --ckpt-path ${ckpt_path} \ -# --robot ${robot} --policy-setup widowx_bridge \ -# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ -# --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; -# -# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ -# --robot ${robot} --policy-setup widowx_bridge \ -# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ -# --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; -# -# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ -# --robot ${robot} --policy-setup widowx_bridge \ -# --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ -# --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; scene_name=bridge_table_1_v2 From f0674e7701750e6262879b26a5a0a0954726479a Mon Sep 17 00:00:00 2001 From: DelinQu Date: Sat, 6 Jul 2024 21:50:17 +0800 Subject: [PATCH 3/9] openvla 
policy intergration pull request --- simpler_env/main_inference.py | 4 ++-- simpler_env/policies/openvla/openvla_model.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py index 7ebe20c..699c00a 100644 --- a/simpler_env/main_inference.py +++ b/simpler_env/main_inference.py @@ -7,7 +7,7 @@ from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator from simpler_env.policies.octo.octo_server_model import OctoServerInference from simpler_env.policies.rt1.rt1_model import RT1Inference -from simpler_env.policies.openvla.openvla_model import OpenVALInference +from simpler_env.policies.openvla.openvla_model import OpenVLAInference try: from simpler_env.policies.octo.octo_model import OctoInference @@ -56,7 +56,7 @@ ) elif args.policy_model == "openvla": assert args.ckpt_path is not None - model = OpenVALInference( + model = OpenVLAInference( saved_model_path=args.ckpt_path, policy_setup=args.policy_setup, action_scale=args.action_scale, diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py index 66bdf0d..b21f137 100644 --- a/simpler_env/policies/openvla/openvla_model.py +++ b/simpler_env/policies/openvla/openvla_model.py @@ -11,7 +11,7 @@ import cv2 as cv -class OpenVALInference: +class OpenVLAInference: def __init__( self, saved_model_path: str = "openvla/openvla-7b", @@ -144,7 +144,7 @@ def step( relative_gripper_action = self.previous_gripper_action - current_gripper_action self.previous_gripper_action = current_gripper_action - if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False: + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): self.sticky_action_is_on = True self.sticky_gripper_action = relative_gripper_action From 29053792b9e3103c30e27b69a0268b4d3da8a710 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 28 Jul 2024 09:50:03 -0700 Subject: [PATCH 4/9] update openvla inference scripts --- scripts/octo_bridge.sh | 0 scripts/octo_drawer_variant_agg.sh | 0 scripts/octo_drawer_visual_matching.sh | 0 scripts/octo_move_near_variant_agg.sh | 0 scripts/octo_move_near_visual_matching.sh | 0 scripts/octo_pick_coke_can_variant_agg.sh | 0 scripts/octo_pick_coke_can_visual_matching.sh | 0 scripts/octo_put_in_drawer_variant_agg.sh | 0 scripts/octo_put_in_drawer_visual_matching.sh | 0 scripts/openvla_bridge.sh | 0 scripts/openvla_drawer_variant_agg.sh | 0 scripts/openvla_drawer_visual_matching.sh | 0 scripts/openvla_move_near_variant_agg.sh | 0 scripts/openvla_move_near_visual_matching.sh | 0 scripts/openvla_pick_coke_can_variant_agg.sh | 0 .../openvla_pick_coke_can_visual_matching.sh | 0 scripts/openvla_put_in_drawer_variant_agg.sh | 2 +- .../openvla_put_in_drawer_visual_matching.sh | 2 +- scripts/rt1_drawer_variant_agg.sh | 0 scripts/rt1_drawer_visual_matching.sh | 0 scripts/rt1_move_near_variant_agg.sh | 0 scripts/rt1_move_near_visual_matching.sh | 0 scripts/rt1_pick_coke_can_variant_agg.sh | 0 scripts/rt1_pick_coke_can_visual_matching.sh | 0 scripts/rt1_put_in_drawer_variant_agg.sh | 0 scripts/rt1_put_in_drawer_visual_matching.sh | 0 scripts/rt1x_bridge.sh | 0 simpler_env/policies/openvla/openvla_model.py | 26 ++----------------- 28 files changed, 4 insertions(+), 26 deletions(-) mode change 100644 => 100755 scripts/octo_bridge.sh mode change 100644 => 100755 scripts/octo_drawer_variant_agg.sh mode change 100644 => 100755 scripts/octo_drawer_visual_matching.sh mode change 100644 => 100755 
scripts/octo_move_near_variant_agg.sh mode change 100644 => 100755 scripts/octo_move_near_visual_matching.sh mode change 100644 => 100755 scripts/octo_pick_coke_can_variant_agg.sh mode change 100644 => 100755 scripts/octo_pick_coke_can_visual_matching.sh mode change 100644 => 100755 scripts/octo_put_in_drawer_variant_agg.sh mode change 100644 => 100755 scripts/octo_put_in_drawer_visual_matching.sh mode change 100644 => 100755 scripts/openvla_bridge.sh mode change 100644 => 100755 scripts/openvla_drawer_variant_agg.sh mode change 100644 => 100755 scripts/openvla_drawer_visual_matching.sh mode change 100644 => 100755 scripts/openvla_move_near_variant_agg.sh mode change 100644 => 100755 scripts/openvla_move_near_visual_matching.sh mode change 100644 => 100755 scripts/openvla_pick_coke_can_variant_agg.sh mode change 100644 => 100755 scripts/openvla_pick_coke_can_visual_matching.sh mode change 100644 => 100755 scripts/openvla_put_in_drawer_variant_agg.sh mode change 100644 => 100755 scripts/openvla_put_in_drawer_visual_matching.sh mode change 100644 => 100755 scripts/rt1_drawer_variant_agg.sh mode change 100644 => 100755 scripts/rt1_drawer_visual_matching.sh mode change 100644 => 100755 scripts/rt1_move_near_variant_agg.sh mode change 100644 => 100755 scripts/rt1_move_near_visual_matching.sh mode change 100644 => 100755 scripts/rt1_pick_coke_can_variant_agg.sh mode change 100644 => 100755 scripts/rt1_pick_coke_can_visual_matching.sh mode change 100644 => 100755 scripts/rt1_put_in_drawer_variant_agg.sh mode change 100644 => 100755 scripts/rt1_put_in_drawer_visual_matching.sh mode change 100644 => 100755 scripts/rt1x_bridge.sh diff --git a/scripts/octo_bridge.sh b/scripts/octo_bridge.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_drawer_variant_agg.sh b/scripts/octo_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_drawer_visual_matching.sh b/scripts/octo_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_move_near_variant_agg.sh b/scripts/octo_move_near_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_move_near_visual_matching.sh b/scripts/octo_move_near_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_pick_coke_can_variant_agg.sh b/scripts/octo_pick_coke_can_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_pick_coke_can_visual_matching.sh b/scripts/octo_pick_coke_can_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_put_in_drawer_variant_agg.sh b/scripts/octo_put_in_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/octo_put_in_drawer_visual_matching.sh b/scripts/octo_put_in_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_drawer_variant_agg.sh b/scripts/openvla_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_move_near_variant_agg.sh b/scripts/openvla_move_near_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_move_near_visual_matching.sh b/scripts/openvla_move_near_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/scripts/openvla_pick_coke_can_variant_agg.sh old mode 100644 new mode 100755 diff --git 
a/scripts/openvla_pick_coke_can_visual_matching.sh b/scripts/openvla_pick_coke_can_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/scripts/openvla_put_in_drawer_variant_agg.sh old mode 100644 new mode 100755 index a3c63a1..fb8ce51 --- a/scripts/openvla_put_in_drawer_variant_agg.sh +++ b/scripts/openvla_put_in_drawer_variant_agg.sh @@ -2,7 +2,7 @@ -declare -a arr=("openvla/openvla-7b") +declare -a ckpt_paths=("openvla/openvla-7b") declare -a env_names=( PlaceIntoClosedTopDrawerCustomInScene-v0 diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/scripts/openvla_put_in_drawer_visual_matching.sh old mode 100644 new mode 100755 index 98539bf..14dd46a --- a/scripts/openvla_put_in_drawer_visual_matching.sh +++ b/scripts/openvla_put_in_drawer_visual_matching.sh @@ -1,6 +1,6 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth -declare -a arr=("openvla/openvla-7b") +declare -a ckpt_paths=("openvla/openvla-7b") declare -a env_names=( diff --git a/scripts/rt1_drawer_variant_agg.sh b/scripts/rt1_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_drawer_visual_matching.sh b/scripts/rt1_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_move_near_variant_agg.sh b/scripts/rt1_move_near_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_move_near_visual_matching.sh b/scripts/rt1_move_near_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_pick_coke_can_variant_agg.sh b/scripts/rt1_pick_coke_can_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_pick_coke_can_visual_matching.sh b/scripts/rt1_pick_coke_can_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_put_in_drawer_variant_agg.sh b/scripts/rt1_put_in_drawer_variant_agg.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1_put_in_drawer_visual_matching.sh b/scripts/rt1_put_in_drawer_visual_matching.sh old mode 100644 new mode 100755 diff --git a/scripts/rt1x_bridge.sh b/scripts/rt1x_bridge.sh old mode 100644 new mode 100755 diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py index b21f137..ae43ef0 100644 --- a/simpler_env/policies/openvla/openvla_model.py +++ b/simpler_env/policies/openvla/openvla_model.py @@ -1,10 +1,8 @@ -from collections import deque from typing import Optional, Sequence import os import matplotlib.pyplot as plt import numpy as np from transforms3d.euler import euler2axangle -from simpler_env.utils.action.action_ensemble import ActionEnsembler from transformers import AutoModelForVision2Seq, AutoProcessor from PIL import Image import torch @@ -17,7 +15,7 @@ def __init__( saved_model_path: str = "openvla/openvla-7b", unnorm_key: Optional[str] = None, policy_setup: str = "widowx_bridge", - horizon: int = 2, + horizon: int = 1, pred_action_horizon: int = 1, exec_horizon: int = 1, image_size: list[int] = [224, 224], @@ -26,13 +24,9 @@ def __init__( os.environ["TOKENIZERS_PARALLELISM"] = "false" if policy_setup == "widowx_bridge": unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key - action_ensemble = True - action_ensemble_temp = 0.0 self.sticky_gripper_num_repeat = 1 elif policy_setup == "google_robot": unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key - action_ensemble = True - action_ensemble_temp = 0.0 
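+            # Descriptive note (added): sticky gripper — once step() detects a large change in the gripper
+            # command (|delta| > 0.5), the same relative gripper action is repeated for this many control
+            # steps before the latch resets.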
self.sticky_gripper_num_repeat = 15 else: raise NotImplementedError( @@ -56,8 +50,6 @@ def __init__( self.horizon = horizon self.pred_action_horizon = pred_action_horizon self.exec_horizon = exec_horizon - self.action_ensemble = action_ensemble - self.action_ensemble_temp = action_ensemble_temp self.sticky_action_is_on = False self.gripper_action_repeat = 0 @@ -66,22 +58,10 @@ def __init__( self.task = None self.task_description = None - self.image_history = deque(maxlen=self.horizon) - if self.action_ensemble: - self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp) - else: - self.action_ensembler = None self.num_image_history = 0 - def _add_image_to_history(self, image: np.ndarray) -> None: - self.image_history.append(image) - self.num_image_history = min(self.num_image_history + 1, self.horizon) - def reset(self, task_description: str) -> None: self.task_description = task_description - self.image_history.clear() - if self.action_ensemble: - self.action_ensembler.reset() self.num_image_history = 0 self.sticky_action_is_on = False @@ -109,7 +89,7 @@ def step( self.reset(task_description) assert image.dtype == np.uint8 - self._add_image_to_history(self._resize_image(image)) + image = self._resize_image(image) image: Image.Image = Image.fromarray(image) prompt = task_description @@ -119,8 +99,6 @@ def step( raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None] # print(f"*** raw actions {raw_actions} ***") - if self.action_ensemble: - raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] raw_action = { "world_vector": np.array(raw_actions[0, :3]), "rotation_delta": np.array(raw_actions[0, 3:6]), From 52727f9f29ab4306e946fa595c9d0392b95f5b62 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 28 Jul 2024 09:52:19 -0700 Subject: [PATCH 5/9] update readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index f232c0d..ee813be 100644 --- a/README.md +++ b/README.md @@ -289,6 +289,13 @@ If you are using CUDA 12, then to use GPU for Octo inference, you need CUDA vers `PATH=/usr/local/cuda-12.3/bin:$PATH LD_LIBRARY_PATH=/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH bash scripts/octo_xxx_script.sh` +### OpenVLA Inference Setup + +``` +pip install torch==2.3.1 torchvision==0.18.1 timm==0.9.10 tokenizers==0.15.2 accelerate==0.32.1 +pip install flash-attn==2.6.1 --no-build-isolation +``` + ## Troubleshooting 1. 
If you encounter issues such as From 37d8fa8de5cd0f7b2dda472fa8ebd2b48c6994f0 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sat, 17 Aug 2024 23:40:26 -0700 Subject: [PATCH 6/9] Add OpenVLA metrics --- simpler_env/utils/metrics.py | 82 +++++++++++++++++++++++++ tools/calc_metrics_evaluation_videos.py | 15 +++++ 2 files changed, 97 insertions(+) diff --git a/simpler_env/utils/metrics.py b/simpler_env/utils/metrics.py index 39a251c..019a5a2 100644 --- a/simpler_env/utils/metrics.py +++ b/simpler_env/utils/metrics.py @@ -5,6 +5,33 @@ import numpy as np REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy] + "google_robot_pick_coke_can_horizontal": { + "rt-2-x": 0.920, + "rt-1-converged": 0.960, + "rt-1-15pct": 1.000, + "rt-1-x": 0.880, + "rt-1-begin": 0.200, + "octo-base": 0.440, + "openvla-7b": 0.640, + }, + "google_robot_pick_coke_can_vertical": { + "rt-2-x": 0.800, + "rt-1-converged": 0.880, + "rt-1-15pct": 0.960, + "rt-1-x": 0.560, + "rt-1-begin": 0.000, + "octo-base": 0.200, + "openvla-7b": 0.280, + }, + "google_robot_pick_coke_can_standing": { + "rt-2-x": 1.000, + "rt-1-converged": 0.720, + "rt-1-15pct": 0.800, + "rt-1-x": 0.840, + "rt-1-begin": 0.200, + "octo-base": 0.240, + "openvla-7b": 0.360, + }, "google_robot_pick_coke_can": { "rt-2-x": 0.907, "rt-1-converged": 0.853, @@ -12,6 +39,7 @@ "rt-1-x": 0.760, "rt-1-begin": 0.133, "octo-base": 0.293, + "openvla-7b": 0.427, }, "google_robot_move_near": { "rt-2-x": 0.733, @@ -20,6 +48,7 @@ "rt-1-x": 0.450, "rt-1-begin": 0.017, "octo-base": 0.350, + "openvla-7b": 0.667, }, "google_robot_open_drawer": { "rt-2-x": 0.333, @@ -28,6 +57,7 @@ "rt-1-x": 0.519, "rt-1-begin": 0.000, "octo-base": 0.148, + "openvla-7b": 0.111, }, "google_robot_close_drawer": { "rt-2-x": 0.630, @@ -36,6 +66,16 @@ "rt-1-x": 0.741, "rt-1-begin": 0.000, "octo-base": 0.519, + "openvla-7b": 0.148, + }, + "google_robot_drawer": { + "rt-2-x": 0.481, + "rt-1-converged": 0.870, + "rt-1-15pct": 0.796, + "rt-1-x": 0.630, + "rt-1-begin": 0.000, + "octo-base": 0.333, + "openvla-7b": 0.130, }, "google_robot_place_apple_in_closed_top_drawer": { "rt-2-x": 0.074, @@ -44,6 +84,7 @@ "rt-1-x": 0.407, "rt-1-begin": 0.000, "octo-base": 0.000, + "openvla-7b": 0.000, }, "widowx_spoon_on_towel": { "rt-1-x": 0.000, @@ -69,6 +110,33 @@ SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy] + "google_robot_pick_coke_can_horizontal": { + "rt-2-x": 0.740, + "rt-1-converged": 0.960, + "rt-1-15pct": 0.860, + "rt-1-x": 0.820, + "rt-1-begin": 0.050, + "octo-base": 0.210, + "openvla-7b": 0.310, + }, + "google_robot_pick_coke_can_vertical": { + "rt-2-x": 0.740, + "rt-1-converged": 0.900, + "rt-1-15pct": 0.790, + "rt-1-x": 0.330, + "rt-1-begin": 0.000, + "octo-base": 0.210, + "openvla-7b": 0.030, + }, + "google_robot_pick_coke_can_standing": { + "rt-2-x": 0.880, + "rt-1-converged": 0.710, + "rt-1-15pct": 0.480, + "rt-1-x": 0.550, + "rt-1-begin": 0.030, + "octo-base": 0.090, + "openvla-7b": 0.190, + }, "google_robot_pick_coke_can": { "rt-2-x": 0.787, "rt-1-converged": 0.857, @@ -76,6 +144,7 @@ "rt-1-x": 0.567, "rt-1-begin": 0.027, "octo-base": 0.170, + "openvla-7b": 0.177, }, "google_robot_move_near": { "rt-2-x": 0.779, @@ -84,6 +153,7 @@ "rt-1-x": 0.317, "rt-1-begin": 0.050, "octo-base": 0.042, + "openvla-7b": 0.492, }, "google_robot_open_drawer": { "rt-2-x": 0.157, @@ -92,6 +162,7 @@ "rt-1-x": 0.296, "rt-1-begin": 0.000, "octo-base": 0.009, + "openvla-7b": 0.250, }, "google_robot_close_drawer": { "rt-2-x": 0.343, @@ -100,6 +171,16 
@@ "rt-1-x": 0.891, "rt-1-begin": 0.278, "octo-base": 0.444, + "openvla-7b": 0.574, + }, + "google_robot_drawer": { + "rt-2-x": 0.250, + "rt-1-converged": 0.730, + "rt-1-15pct": 0.565, + "rt-1-x": 0.597, + "rt-1-begin": 0.139, + "octo-base": 0.227, + "openvla-7b": 0.412, }, "google_robot_place_apple_in_closed_top_drawer": { "rt-2-x": 0.037, @@ -108,6 +189,7 @@ "rt-1-x": 0.213, "rt-1-begin": 0.000, "octo-base": 0.000, + "openvla-7b": 0.000, }, "widowx_spoon_on_towel": { "rt-1-x": 0.000, diff --git a/tools/calc_metrics_evaluation_videos.py b/tools/calc_metrics_evaluation_videos.py index 1981ad2..3ea1eff 100644 --- a/tools/calc_metrics_evaluation_videos.py +++ b/tools/calc_metrics_evaluation_videos.py @@ -27,6 +27,7 @@ def calc_pick_coke_can_stats(root_result_dir): "rt-1-x": 0.88, "rt-1-begin": 0.20, "octo-base": 0.44, + "openvla-7b": 0.64, }, "vertical": { "rt-2-x": 0.80, @@ -35,6 +36,7 @@ def calc_pick_coke_can_stats(root_result_dir): "rt-1-x": 0.56, "rt-1-begin": 0.00, "octo-base": 0.20, + "openvla-7b": 0.28, }, "standing": { "rt-2-x": 1.00, @@ -43,6 +45,7 @@ def calc_pick_coke_can_stats(root_result_dir): "rt-1-x": 0.84, "rt-1-begin": 0.20, "octo-base": 0.24, + "openvla-7b": 0.36, }, } @@ -282,6 +285,7 @@ def calc_move_near_stats(root_result_dir): "rt-1-x": 0.45, "rt-1-begin": 0.017, "octo-base": 0.35, + "openvla-7b": 0.667, } ckpt_alias_keys = list(move_near_real_success.keys()) @@ -413,6 +417,7 @@ def calc_drawer_stats(root_result_dir): "rt-1-x": 0.519, "rt-1-begin": 0.000, "octo-base": 0.148, + "openvla-7b": 0.111, }, "close": { "rt-2-x": 0.630, @@ -421,6 +426,7 @@ def calc_drawer_stats(root_result_dir): "rt-1-x": 0.741, "rt-1-begin": 0.000, "octo-base": 0.519, + "openvla-7b": 0.148, }, } @@ -642,6 +648,7 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): "rt-1-x": 0.407, "rt-1-begin": 0.000, "octo-base": 0.000, + "openvla-7b": 0.000, }, } @@ -855,21 +862,25 @@ def calc_bridge_put_on_env_stats(root_result_dir): "rt-1-x": 0.042, "octo-base": 0.500, "octo-small": 0.542, + "openvla-7b": 0.10, }, "put_carrot_on_plate": { "rt-1-x": 0.167, "octo-base": 0.500, "octo-small": 0.208, + "openvla-7b": 0.10, }, "stack_green_block_on_yellow_block": { "rt-1-x": 0.000, "octo-base": 0.292, "octo-small": 0.583, + "openvla-7b": 0.10, }, "put_eggplant_in_basket": { "rt-1-x": 0.000, "octo-base": 0.400, "octo-small": 0.600, + "openvla-7b": 0.10, }, } real_success_dict = { @@ -877,17 +888,20 @@ def calc_bridge_put_on_env_stats(root_result_dir): "rt-1-x": 0.000, "octo-base": 0.333, "octo-small": 0.417, + "openvla-7b": 0.10, }, "put_carrot_on_plate": {"rt-1-x": 0.00, "octo-base": 0.25, "octo-small": 0.083}, "stack_green_block_on_yellow_block": { "rt-1-x": 0.000, "octo-base": 0.000, "octo-small": 0.125, + "openvla-7b": 0.10, }, "put_eggplant_in_basket": { "rt-1-x": 0.000, "octo-base": 0.250, "octo-small": 0.400, + "openvla-7b": 0.10, }, } @@ -1023,6 +1037,7 @@ def calc_bridge_put_on_env_stats(root_result_dir): "octo-base": "octo-base", "octo-small": "octo-small", "octo-server": "octo-server", + "openvla-7b": "openvla-7b", } parser = argparse.ArgumentParser() From 3667e65213d9f0d826081b1f9d720297b4e70fae Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sat, 17 Aug 2024 23:55:38 -0700 Subject: [PATCH 7/9] update readme --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ee813be..c998a09 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,10 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts. 
- [Code Structure](#code-structure) - [Adding New Policies](#adding-new-policies) - [Adding New Real-to-Sim Evaluation Environments and Robots](#adding-new-real-to-sim-evaluation-environments-and-robots) - - [Full Installation (RT-1 and Octo Inference, Env Building)](#full-installation-rt-1-and-octo-inference-env-building) + - [Full Installation (RT-1, Octo, OpenVLA Inference, Env Building)](#full-installation-rt-1-octo-openvla-inference-env-building) - [RT-1 Inference Setup](#rt-1-inference-setup) - [Octo Inference Setup](#octo-inference-setup) + - [OpenVLA Inference Setup](#openvla-inference-setup) - [Troubleshooting](#troubleshooting) - [Citation](#citation) @@ -97,7 +98,7 @@ cd {this_repo} pip install -e . ``` -**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-and-octo-inference-env-building).** +**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo, OpenVLA), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-octo-openvla-inference-env-building).** ## Examples @@ -105,7 +106,7 @@ pip install -e . - Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py). - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb). - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py) -- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, and Octo policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts). +- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts). - Real-to-sim evaluation videos from running `scripts/*.sh`: see [this link](https://huggingface.co/datasets/xuanlinli17/simpler-env-eval-example-videos/tree/main). ## Current Environments @@ -219,7 +220,7 @@ If you want to use existing environments for evaluating new policies, you can ke We provide a step-by-step guide to add new real-to-sim evaluation environments and robots in [this README](ADDING_NEW_ENVS_ROBOTS.md) -## Full Installation (RT-1 and Octo Inference, Env Building) +## Full Installation (RT-1, Octo, OpenVLA Inference, Env Building) If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please follow the full installation instructions below. 
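
The `OpenVLA Inference Setup` section added to the README above installs the dependencies that the `OpenVLAInference` wrapper relies on. As a quick post-install sanity check, the checkpoint can be queried directly with the same `AutoProcessor` / `AutoModelForVision2Seq` calls the wrapper uses (a single forward pass per frame, with no action ensembling, matching the simplified `openvla_model.py` earlier in this series). The snippet below is a minimal sketch; the dtype/device choices and the dummy frame are illustrative assumptions, not part of this patch set.

```python
# Minimal post-install sanity check: load openvla/openvla-7b and predict one action.
# The dtype/device choices and the dummy frame are assumptions for illustration only.
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    # attn_implementation="flash_attention_2",  # optional; flash-attn is installed by the setup above
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to("cuda:0")

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # stand-in for a resized 224x224 camera frame
prompt = "pick coke can"  # the wrapper passes the task description string directly as the prompt

inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
# unnorm_key selects the dataset statistics used to un-normalize actions, as in the wrapper:
# "bridge_orig" for widowx_bridge, "fractal20220817_data" for google_robot.
raw_action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
print(raw_action.shape)  # expected (7,): [dx, dy, dz, droll, dpitch, dyaw, gripper]
```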
From 2b0ecc8ec6793ec2268e630b1710939feb998407 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 18 Aug 2024 09:27:27 -0700 Subject: [PATCH 8/9] add openvla simple inference --- README.md | 2 +- ...imple_inference_visual_matching_prepackaged_envs.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c998a09..8323958 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ pip install -e . ## Examples -- Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py). +- Simple RT-1, Octo, and OpenVLA evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py). - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb). - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py) - Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts). 
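
The example script referenced above (and patched below) dispatches `--policy openvla/openvla-7b` to the new `OpenVLAInference` class. For readers who want to drive the wrapper directly on a prepackaged environment, the following is a rough sketch of the rollout loop; `simpler_env.make`, `get_image_from_maniskill2_obs_dict`, the task name, and the `(raw_action, action)` return convention are assumed to follow the existing RT-1/Octo wrappers rather than being guaranteed by this patch.

```python
# Rough sketch of a rollout with the OpenVLA wrapper on a prepackaged environment.
# Environment helpers and the step() return convention are assumed to match the
# existing RT-1/Octo wrappers; treat this as illustrative, not a verbatim script.
import numpy as np
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.openvla.openvla_model import OpenVLAInference

env = simpler_env.make("google_robot_pick_coke_can")  # assumed prepackaged task name
model = OpenVLAInference(saved_model_path="openvla/openvla-7b", policy_setup="google_robot")

obs, reset_info = env.reset()
instruction = env.get_language_instruction()
model.reset(instruction)

done, truncated = False, False
while not (done or truncated):
    image = get_image_from_maniskill2_obs_dict(env, obs)  # uint8 RGB frame
    raw_action, action = model.step(image, instruction)   # assumed (raw_action, action) return
    env_action = np.concatenate([action["world_vector"], action["rot_axangle"], action["gripper"]])
    obs, reward, done, truncated, info = env.step(env_action)

print("final info:", info)
```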
diff --git a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py index 8bf23b8..2e09cd2 100644 --- a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py +++ b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py @@ -6,6 +6,8 @@ --ckpt-path ./checkpoints/rt_1_tf_trained_for_000400120 --task google_robot_pick_coke_can --logging-root ./results_simple_eval/ --n-trajs 10 python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy octo-small \ --ckpt-path None --task widowx_spoon_on_towel --logging-root ./results_simple_eval/ --n-trajs 10 + python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy openvla/openvla-7b \ + --ckpt-path None --task google_robot_move_near_v1 --logging-root ./results_simple_eval/ --n-trajs 10 """ import argparse @@ -21,7 +23,7 @@ parser = argparse.ArgumentParser() -parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small"]) +parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small", "openvla/openvla-7b"]) parser.add_argument( "--ckpt-path", type=str, @@ -37,7 +39,7 @@ parser.add_argument("--n-trajs", type=int, default=10) args = parser.parse_args() -if args.policy in ["octo-base", "octo-small"]: +if args.policy in ["octo-base", "octo-small", "openvla/openvla-7b"]: if args.ckpt_path in [None, "None"] or "rt_1_x" in args.ckpt_path: args.ckpt_path = args.policy if args.ckpt_path[-1] == "/": @@ -75,6 +77,10 @@ from simpler_env.policies.octo.octo_model import OctoInference model = OctoInference(model_type=args.ckpt_path, policy_setup=policy_setup, init_rng=0) +elif "openvla" in args.policy: + from simpler_env.policies.openvla.openvla_model import OpenVLAInference + + model = OpenVLAInference(saved_model_path=args.ckpt_path, policy_setup=policy_setup) else: raise NotImplementedError() From 2cd7aaea91a7ed19d81c7652b4c9f01e157e6444 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 18 Aug 2024 09:44:30 -0700 Subject: [PATCH 9/9] minor readme modification --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8323958..c46c40d 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,7 @@ simpler_env/ policies/: policy implementations rt1/: RT-1 policy implementation octo/: Octo policy implementation + openvla/: OpenVLA policy implementation utils/: env/: environment building and observation utilities debug/: debugging tools for policies and robots @@ -206,7 +207,7 @@ scripts/: example bash scripts for policy inference under our variant aggregatio If you want to use existing environments for evaluating new policies, you can keep `./ManiSkill2_real2sim` as is. -1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`) and Octo (`simpler_env/policies/octo`) policies. +1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`), Octo (`simpler_env/policies/octo`), and OpenVLA (`simpler_env/policies/openvla`) policies. 2. You can now use `simpler_env/simple_inference_visual_matching_prepackaged_envs.py` to perform policy evaluations in simulation. - If the policy behaviors deviate a lot from those in the real-world, you can write similar scripts as in `simpler_env/utils/debug/{policy_name}_inference_real_video.py` to debug the policy behaviors. 
The debugging script performs policy inference by feeding real eval video frames into the policy. If the policy behavior still deviates significantly from real, this may suggest that policy actions are processed incorrectly into the simulation environments. Please double check action orderings and action spaces. 3. If you'd like to perform customized evaluations,