From 7f88c2d887fa42eb3fdc6dfe385d67516ec1eaa9 Mon Sep 17 00:00:00 2001
From: Guang Yang <42389959+guangy10@users.noreply.github.com>
Date: Wed, 17 Apr 2024 15:16:46 -0700
Subject: [PATCH] Script to add validation for dtypes (#241)

---
 .ci/scripts/validate.sh | 244 ++++++++++++++++++++--------------------
 1 file changed, 124 insertions(+), 120 deletions(-)

diff --git a/.ci/scripts/validate.sh b/.ci/scripts/validate.sh
index 971965305..ff97d99b0 100644
--- a/.ci/scripts/validate.sh
+++ b/.ci/scripts/validate.sh
@@ -25,69 +25,71 @@ function generate_compiled_model_output() {
     local MODEL_DIR="${CHECKPOINT_PATH%/*}"
     local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
 
-    echo ""############### Run inference with torch.compile "###############"
-    echo ""
-    echo "******************************************"
-    echo "************** non-quantized *************"
-    echo "******************************************"
-    python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "******************************************"
-    echo "******* Emb: channel-wise quantized ******"
-    echo "******************************************"
-    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "******************************************"
-    echo "******** Emb: group-wise quantized *******"
-    echo "******************************************"
-    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "***********************************************"
-    echo "******* Emb: 4bit channel-wise quantized ******"
-    echo "***********************************************"
-    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "***********************************************"
-    echo "******** Emb: 4bit group-wise quantized *******"
-    echo "***********************************************"
-    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "******************************************"
-    echo "******* INT8 channel-wise quantized ******"
-    echo "******************************************"
-    python -W ignore generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "******************************************"
-    echo "******** INT8 group-wise quantized *******"
-    echo "******************************************"
-    python -W ignore generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
-
-    echo "******************************************"
-    echo "******** INT4 group-wise quantized *******"
-    echo "******************************************"
-    python -W ignore generate.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
-    cat "$MODEL_DIR/output_eager"
-    python -W ignore generate.py --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
-    cat "$MODEL_DIR/output_compiled"
+    for DTYPE in float32 bfloat16; do
+        echo ""############### Run inference with torch.compile for dtype $DTYPE "###############"
+        echo ""
+        echo "******************************************"
+        echo "************** non-quantized *************"
+        echo "******************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "******************************************"
+        echo "******* Emb: channel-wise quantized ******"
+        echo "******************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "******************************************"
+        echo "******** Emb: group-wise quantized *******"
+        echo "******************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "***********************************************"
+        echo "******* Emb: 4bit channel-wise quantized ******"
+        echo "***********************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "***********************************************"
+        echo "******** Emb: 4bit group-wise quantized *******"
+        echo "***********************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "******************************************"
+        echo "******* INT8 channel-wise quantized ******"
+        echo "******************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "******************************************"
+        echo "******** INT8 group-wise quantized *******"
+        echo "******************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+
+        echo "******************************************"
+        echo "******** INT4 group-wise quantized *******"
+        echo "******************************************"
+        python -W ignore generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+        cat "$MODEL_DIR/output_eager"
+        python -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+        cat "$MODEL_DIR/output_compiled"
+    done
 }
 
 function generate_aoti_model_output() {
@@ -96,63 +98,65 @@ function generate_aoti_model_output() {
     local MODEL_DIR="${CHECKPOINT_PATH%/*}"
     local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
 
-    echo ""############### Run inference with AOT Inductor "###############"
-    echo ""
-    echo "******************************************"
-    echo "************** non-quantized *************"
-    echo "******************************************"
-    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "******************************************"
-    echo "******* Emb: channel-wise quantized ******"
-    echo "******************************************"
-    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "******************************************"
-    echo "******** Emb: group-wise quantized *******"
-    echo "******************************************"
-    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "***********************************************"
-    echo "******* Emb: 4bit channel-wise quantized ******"
-    echo "***********************************************"
-    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "***********************************************"
-    echo "******** Emb: 4bit group-wise quantized *******"
-    echo "***********************************************"
-    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "******************************************"
-    echo "******* INT8 channel-wise quantized ******"
-    echo "******************************************"
-    python -W ignore export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "******************************************"
-    echo "******** INT8 group-wise quantized *******"
-    echo "******************************************"
-    python -W ignore export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
-
-    echo "******************************************"
-    echo "******** INT4 group-wise quantized *******"
-    echo "******************************************"
-    python -W ignore export.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
-    cat "$MODEL_DIR/output_aoti"
+    for DTYPE in float32 bfloat16; do
+        echo ""############### Run inference with AOT Inductor  for dtype $DTYPE "###############"
+        echo ""
+        echo "******************************************"
+        echo "************** non-quantized *************"
+        echo "******************************************"
+        python -W ignore export.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        echo "******************************************"
+        echo "******* Emb: channel-wise quantized ******"
+        echo "******************************************"
+        python -W ignore export.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        echo "******************************************"
+        echo "******** Emb: group-wise quantized *******"
+        echo "******************************************"
+        python -W ignore export.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        echo "***********************************************"
+        echo "******* Emb: 4bit channel-wise quantized ******"
+        echo "***********************************************"
+        python -W ignore export.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        echo "***********************************************"
+        echo "******** Emb: 4bit group-wise quantized *******"
+        echo "***********************************************"
+        python -W ignore export.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        echo "******************************************"
+        echo "******* INT8 channel-wise quantized ******"
+        echo "******************************************"
+        python -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        echo "******************************************"
+        echo "******** INT8 group-wise quantized *******"
+        echo "******************************************"
+        python -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        cat "$MODEL_DIR/output_aoti"
+
+        # echo "******************************************"
+        # echo "******** INT4 group-wise quantized *******"
+        # echo "******************************************"
+        # python -W ignore export.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
+        # python -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+        # cat "$MODEL_DIR/output_aoti"
+    done
 }
 
 function generate_executorch_model_output() {