From fff94acd516b23f804ef7ebb84c94eb724fb99e5 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Sun, 11 May 2025 21:21:18 -0400
Subject: [PATCH 1/7] refact cuda ut

Signed-off-by: n1ck-guo
---
 auto_round/testing_utils.py           | 174 ++++++++++++++++++++++++++
 test_cuda/requirements.txt            |  18 +++
 test_cuda/test_2_3bits.py             |   4 +
 test_cuda/test_auto_round_format.py   |  44 ++-----
 test_cuda/test_conv1d.py              |   3 +-
 test_cuda/test_exllamav2_backend.py   |   4 +
 test_cuda/test_export.py              |   7 +-
 test_cuda/test_gguf.py                | 100 +++++++++++++++
 test_cuda/test_gguf_format.py         | 131 -------------------
 test_cuda/test_main_func.py           |   5 +-
 test_cuda/test_multiple_card.py       |  19 ++-
 test_cuda/test_multiple_card_calib.py |  17 ++-
 test_cuda/test_qbits.py               |  14 ++-
 test_cuda/test_support_vlms.py        |  11 +-
 test_cuda/test_triton_backend.py      |  11 +-
 test_cuda/test_vlms.py                |  13 +-
 16 files changed, 383 insertions(+), 192 deletions(-)
 create mode 100644 auto_round/testing_utils.py
 create mode 100644 test_cuda/requirements.txt
 delete mode 100644 test_cuda/test_gguf_format.py

diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py
new file mode 100644
index 00000000..03d2e709
--- /dev/null
+++ b/auto_round/testing_utils.py
@@ -0,0 +1,174 @@
+import unittest
+import importlib.util
+
+import torch
+
+from transformers.utils.versions import require_version
+
+def is_gguf_available():
+    return importlib.util.find_spec("gguf") is not None
+
+def is_autogptq_available():
+    return importlib.util.find_spec("auto-gptq") is not None
+
+def is_awq_available():
+    return importlib.util.find_spec("autoawq") is not None
+
+def is_optimum_available():
+    return importlib.util.find_spec("optimum") is not None
+
+def is_ipex_available():
+    try:
+        require_version("intel-extension-for-pytorch>=2.5")
+        return True
+    except ImportError:
+        return False
+
+def is_itrex_available():
+    return importlib.util.find_spec("intel-extension-for-transformers") is not None
+
+def is_flash_attn_avaliable():
+    return importlib.util.find_spec("flash-attn") is not None
+
+def is_gptqmodel_available():
+    try:
+        require_version("gptqmodel>=2.0")
+        return True
+    except ImportError:
+        return False
+
+def is_new_version():
+    try:
+        require_version("auto-round>=0.5.0")
+        return True
+    except ImportError:
+        return False
+
+
+def require_gguf(test_case):
+    """
+    Decorator marking a test that requires gguf.
+
+    These tests are skipped when gguf isn't installed.
+
+    """
+    return unittest.skipUnless(is_gguf_available(), "test requires gguf")(test_case)
+
+
+def require_autogptq(test_case):
+    """
+    Decorator marking a test that requires auto-gptq.
+
+    These tests are skipped when auto-gptq isn't installed.
+
+    """
+    return unittest.skipUnless(is_autogptq_available(), "test requires auto-gptq")(test_case)
+
+
+def require_gptqmodel(test_case):
+    """
+    Decorator marking a test that requires gptqmodel.
+
+    These tests are skipped when gptqmodel isn't installed.
+
+    """
+    return unittest.skipUnless(is_autogptq_available(), "test requires gptqmodel>=2.0")(test_case)
+
+
+def require_awq(test_case):
+    """
+    Decorator marking a test that requires autoawq.
+
+    These tests are skipped when autoawq isn't installed.
+
+    """
+    return unittest.skipUnless(is_awq_available(), "test requires autoawq")(test_case)
+
+
+def require_ipex(test_case):
+    """
+    Decorator marking a test that requires intel-extension-for-pytorch.
+
+    These tests are skipped when intel-extension-for-pytorch isn't installed.
+
+    """
+    return unittest.skipUnless(is_ipex_available(), "test requires intel-extension-for-pytorch>=2.5")(test_case)
+
+
+def require_itrex(test_case):
+    """
+    Decorator marking a test that requires intel-extension-for-transformers.
+
+    These tests are skipped when intel-extension-for-transformers isn't installed.
+
+    """
+    return unittest.skipUnless(is_itrex_available(), "test requires intel-extension-for-transformers")(test_case)
+
+def require_optimum(test_case):
+    """
+    Decorator marking a test that requires optimum.
+
+    These tests are skipped when optimum isn't installed.
+
+    """
+    return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)
+
+
+def require_new_version(test_case):
+    """
+    Decorator marking a test that requires auto-round>=0.5.0.
+
+    These tests are skipped when auto-round<0.5.0.
+
+    """
+    return unittest.skipUnless(is_new_version(), "test requires auto-round>=0.5.0")(test_case)
+
+
+def multi_card(test_case):
+    """
+    Decorator marking a test that requires multiple cards.
+
+    These tests are skipped when only one card or the CPU is available.
+
+    """
+    return unittest.skipUnless(
+        torch.cuda.is_available() and torch.cuda.device_count() > 1, "test requires multiple cards.")(test_case)
+
+
+def require_old_version(test_case):
+    """
+    Decorator marking a test that requires an old version of transformers and torch.
+
+    These tests are skipped when the matching old versions are not installed.
+
+    """
+    env_check = True
+    try:
+        require_version("torch<2.7.0")
+        env_check &= True
+    except ImportError:
+        env_check &= False
+    return unittest.skipUnless(env_check, "Environment is not satisfactory")(test_case)
+
+
+def require_vlm_env(test_case):
+    """
+    Decorator marking a test that requires some special env to load vlm model.
+
+    These tests are skipped when not meet the environment requirments.
+
+    """
+
+    env_check = True
+    # pip install flash-attn --no-build-isolation
+    env_check &= is_flash_attn_avaliable()
+
+    # git clone https://github.com/haotian-liu/LLaVA.git && cd LLaVA && pip install -e .
+ env_check &= importlib.util.find_spec("llava") is not None + + return unittest.skipUnless(env_check, "Environment is not satisfactory")(test_case) + + + + + \ No newline at end of file diff --git a/test_cuda/requirements.txt b/test_cuda/requirements.txt new file mode 100644 index 00000000..00ae08a8 --- /dev/null +++ b/test_cuda/requirements.txt @@ -0,0 +1,18 @@ +accelerate +autoawq +auto-gptq +datasets +einops +gptqmodel>=2.0 +intel-extension-for-pytorch>=2.5 +intel-extension-for-transformers +lm-eval>=0.4.2,<0.5 +numpy < 2.0 +optimum +pandas +pillow +py-cpuinfo +torch +torchvision +tqdm +transformers diff --git a/test_cuda/test_2_3bits.py b/test_cuda/test_2_3bits.py index c595bc5f..82a84b12 100644 --- a/test_cuda/test_2_3bits.py +++ b/test_cuda/test_2_3bits.py @@ -12,6 +12,7 @@ from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate from lm_eval.utils import make_table # pylint: disable=E0401 +from auto_round.testing_utils import require_autogptq, require_new_version def get_accuracy(data): @@ -35,6 +36,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_autogptq def test_3bits_autoround(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -55,6 +57,7 @@ def test_3bits_autoround(self): assert accuracy > 0.3 shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_norm_bias_tuning(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -73,6 +76,7 @@ def test_norm_bias_tuning(self): assert accuracy > 0.18 shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_2bits_autoround(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_auto_round_format.py b/test_cuda/test_auto_round_format.py index 9cc2b0a0..7b995e36 100644 --- a/test_cuda/test_auto_round_format.py +++ b/test_cuda/test_auto_round_format.py @@ -5,6 +5,7 @@ sys.path.insert(0, "..") from auto_round.eval.evaluation import simple_evaluate_user_model +from auto_round.testing_utils import require_new_version, require_autogptq, require_awq, require_ipex import torch import transformers @@ -75,8 +76,9 @@ def tearDownClass(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_new_version def test_autoround_asym(self): - for bits in [2, 4, 8]: + for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, False @@ -94,7 +96,7 @@ def test_autoround_asym(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," @@ -104,6 +106,7 @@ def test_autoround_asym(self): assert ("!!!" 
not in res) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_autogptq def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -138,6 +141,7 @@ def test_mixed_precision(self): print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.32) + @require_awq def test_awq_backend(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -180,8 +184,9 @@ def test_awq_backend(self): self.model_infer(model, tokenizer) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_new_version def test_tritonv2_bf16(self): - model_name = "/data5/wenhuach/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( model_name, @@ -195,6 +200,7 @@ def test_tritonv2_bf16(self): torch.cuda.empty_cache() + @require_ipex def test_autoround_gptq_sym_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -244,6 +250,7 @@ def test_autoround_gptq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) + @require_awq def test_autoround_awq_sym_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -281,6 +288,7 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_autoround_sym(self): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -310,6 +318,7 @@ def test_autoround_sym(self): assert ("!!!" not in res) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_new_version def test_load_gptq_model_3bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() @@ -319,35 +328,6 @@ def test_load_gptq_model_3bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - def test_autoround_asym(self): - for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - bits, group_size, sym = bits, 128, False - autoround = AutoRound( - model, - tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_folder - - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", - trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) - print(res) - assert ("!!!" 
not in res) - shutil.rmtree(self.save_folder, ignore_errors=True) - if __name__ == "__main__": unittest.main() diff --git a/test_cuda/test_conv1d.py b/test_cuda/test_conv1d.py index 941e1324..79dad6d6 100644 --- a/test_cuda/test_conv1d.py +++ b/test_cuda/test_conv1d.py @@ -8,6 +8,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from auto_round.testing_utils import require_gptqmodel from _test_helpers import model_infer class LLMDataLoader: def __init__(self): @@ -30,7 +31,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_quant(self): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True diff --git a/test_cuda/test_exllamav2_backend.py b/test_cuda/test_exllamav2_backend.py index 66846c68..de243219 100644 --- a/test_cuda/test_exllamav2_backend.py +++ b/test_cuda/test_exllamav2_backend.py @@ -11,6 +11,7 @@ from auto_round import AutoRound from auto_round import AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from auto_round.testing_utils import require_autogptq, require_gptqmodel class LLMDataLoader: @@ -63,6 +64,7 @@ def tearDownClass(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_gptqmodel def test_gptqmodel_exllmav2_4bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -110,6 +112,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + @require_autogptq def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -143,6 +146,7 @@ def test_gptq_exllamav2_4bits_sym(self): torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) + @require_autogptq def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") diff --git a/test_cuda/test_export.py b/test_cuda/test_export.py index db68fa0d..3ac32e6c 100644 --- a/test_cuda/test_export.py +++ b/test_cuda/test_export.py @@ -9,6 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from auto_round.testing_utils import require_awq, require_optimum class LLMDataLoader: @@ -32,6 +33,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_optimum def test_autogptq_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -60,6 +62,7 @@ def test_autogptq_format(self): "she is a good friend of mine, she is") shutil.rmtree("./saved", ignore_errors=True) + @require_optimum def test_autogptq_format_fp_layers(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -173,8 +176,7 @@ def 
test_autoround_format(self): "she is a great artist, she is a great artist, she is a great artist, she is") shutil.rmtree("./saved", ignore_errors=True) - - # + @require_awq def test_autoawq_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -205,6 +207,7 @@ def test_autoawq_format(self): "I just think it's funny that people are downvoting") shutil.rmtree("./saved", ignore_errors=True) + @require_optimum def test_autoawq_format_fp_qsave_layers(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, diff --git a/test_cuda/test_gguf.py b/test_cuda/test_gguf.py index 3d75967c..f31de413 100644 --- a/test_cuda/test_gguf.py +++ b/test_cuda/test_gguf.py @@ -9,6 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from auto_round.testing_utils import require_gguf class LLMDataLoader: def __init__(self): @@ -32,6 +33,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_gguf def test_gguf_format(self): bits, group_size, sym = 4, 32, False autoround = AutoRound( @@ -70,6 +72,104 @@ def test_gguf_format(self): output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) + @require_gguf + def test_q2_k_export(self): + bits, group_size, sym = 2, 16, False + model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + autoround = AutoRound( + model, + tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=1, + seqlen=1, + dataset=self.llm_dataloader, + data_type="int_asym_dq" + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q2_k_s") + gguf_file = "Qwen2.5-1.5B-Instruct-1.5B-Q2_K_S.gguf" + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") + text = "There is a girl who likes adventure," + inputs = self.tokenizer(text, return_tensors="pt").to(model.device) + result = self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) + print(result) + + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") + self.assertGreater(result['results']['piqa']['acc,none'], 0.45) + + shutil.rmtree("./saved", ignore_errors=True) + + @require_gguf + def test_basic_usage(self): + python_path = sys.executable + res = os.system( + f"cd .. 
&& {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" + f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" + ) + if res > 0 or res == -1: + assert False, "cmd line test fail, please have a check" + shutil.rmtree("./saved", ignore_errors=True) + + @require_gguf + def test_q4_0(self): + bits, group_size, sym = 4, 32, True + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=1, + data_type="int" + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") + gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_0.gguf" + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") + text = "There is a girl who likes adventure," + inputs = self.tokenizer(text, return_tensors="pt").to(model.device) + print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) + + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") + self.assertGreater(result['results']['piqa']['acc,none'], 0.55) + shutil.rmtree("./saved", ignore_errors=True) + @require_gguf + def test_q4_1(self): + bits, group_size, sym = 4, 32, False + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=1, + data_type="int" + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") + gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_1.gguf" + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") + text = "There is a girl who likes adventure," + inputs = self.tokenizer(text, return_tensors="pt").to(model.device) + print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) + + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") + self.assertGreater(result['results']['piqa']['acc,none'], 0.55) + shutil.rmtree("./saved", ignore_errors=True) + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/test_cuda/test_gguf_format.py b/test_cuda/test_gguf_format.py deleted file mode 100644 index 8e0bc9d5..00000000 --- a/test_cuda/test_gguf_format.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import sys -import unittest -import shutil -sys.path.insert(0, "..") - -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from auto_round import AutoRound - - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestGGUF(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "Qwen/Qwen2.5-0.5B-Instruct" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_q2_k_export(self): - bits, group_size, sym = 2, 16, False - model_name = "Qwen/Qwen2.5-1.5B-Instruct" - 
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - autoround = AutoRound( - model, - tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=1, - seqlen=1, - dataset=self.llm_dataloader, - data_type="int_asym_dq" - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q2_k_s") - gguf_file = "Qwen2.5-1.5B-Instruct-1.5B-Q2_K_S.gguf" - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - text = "There is a girl who likes adventure," - inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - result = self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) - print(result) - - from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result['results']['piqa']['acc,none'], 0.45) - - shutil.rmtree("./saved", ignore_errors=True) - - def test_basic_usage(self): - python_path = sys.executable - res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" - f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("./saved", ignore_errors=True) - - def test_q4_0(self): - bits, group_size, sym = 4, 32, True - autoround = AutoRound( - self.model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=1, - data_type="int" - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") - gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_0.gguf" - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - text = "There is a girl who likes adventure," - inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - - from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result['results']['piqa']['acc,none'], 0.55) - shutil.rmtree("./saved", ignore_errors=True) - - def test_q4_1(self): - bits, group_size, sym = 4, 32, False - autoround = AutoRound( - self.model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=1, - data_type="int" - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") - gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_1.gguf" - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - text = "There is a girl who likes adventure," - inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - - from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result['results']['piqa']['acc,none'], 0.55) - shutil.rmtree("./saved", ignore_errors=True) - -if 
__name__ == "__main__": - unittest.main() diff --git a/test_cuda/test_main_func.py b/test_cuda/test_main_func.py index 0b1c2b73..f879fbe8 100644 --- a/test_cuda/test_main_func.py +++ b/test_cuda/test_main_func.py @@ -11,6 +11,7 @@ from auto_round import AutoRound, AutoRoundAdam from auto_round.eval.evaluation import simple_evaluate +from auto_round.testing_utils import require_gptqmodel from lm_eval.utils import make_table # pylint: disable=E0401 @@ -35,6 +36,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_gptqmodel def test_backend(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -76,6 +78,7 @@ def test_backend(self): shutil.rmtree("./saved", ignore_errors=True) @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @require_gptqmodel def test_fp_layers(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -119,7 +122,7 @@ def test_undivided_group_size_tuning(self): autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) autoround.quantize() - + @require_gptqmodel def test_adam(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_multiple_card.py b/test_cuda/test_multiple_card.py index 2adffbaa..d908606a 100644 --- a/test_cuda/test_multiple_card.py +++ b/test_cuda/test_multiple_card.py @@ -10,6 +10,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate +from auto_round.testing_utils import multi_card, require_new_version, require_gptqmodel def get_accuracy(data): @@ -35,6 +36,7 @@ def tearDownClass(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @multi_card def test_device_map(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -43,6 +45,8 @@ def test_device_map(self): autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7,seqlen=32) autoround.quantize() + @multi_card + @require_gptqmodel def test_device_map_str(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -61,6 +65,7 @@ def test_device_map_str(self): assert accuracy > 0.45 ##0.4786 shutil.rmtree("./saved", ignore_errors=True) + @multi_card def test_layer_norm(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -70,7 +75,7 @@ def test_layer_norm(self): enable_norm_bias_tuning=True) autoround.quantize() - + @multi_card def test_rms_norm(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -80,6 +85,7 @@ def test_rms_norm(self): enable_norm_bias_tuning=True) autoround.quantize() + @multi_card def test_act_quantization(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -88,6 +94,7 @@ def test_act_quantization(self): autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, 
nsamples=7,seqlen=32,act_bits=4,act_dynamic=False) autoround.quantize() + @multi_card def test_lm_head(self): model_name = "/models/Qwen2.5-7B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -98,9 +105,10 @@ def test_lm_head(self): enable_norm_bias_tuning=True,layer_config=layer_config) autoround.quantize() + @multi_card def test_device_map(self): from transformers import AutoModelForCausalLM, AutoTokenizer - model_name = "/data5/wenhuach/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" device_map = {} for i in range(0, 32): @@ -192,9 +200,11 @@ def test_device_map(self): del model torch.cuda.empty_cache() + @multi_card + @require_new_version def test_device_map_for_triton(self): from transformers import AutoModelForCausalLM, AutoTokenizer - model_name = "/data5/wenhuach/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" device_map = {} for i in range(0, 32): @@ -289,4 +299,5 @@ def test_device_map_for_triton(self): del model torch.cuda.empty_cache() - +if __name__ == "__main__": + unittest.main() diff --git a/test_cuda/test_multiple_card_calib.py b/test_cuda/test_multiple_card_calib.py index a56fe1b2..2d7ff712 100644 --- a/test_cuda/test_multiple_card_calib.py +++ b/test_cuda/test_multiple_card_calib.py @@ -1,18 +1,12 @@ -import copy -import shutil +import os +import re import sys +import shutil import unittest -import re sys.path.insert(0, "..") -import torch -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round.eval.evaluation import simple_evaluate -from lm_eval.utils import make_table # pylint: disable=E0401 -import os +from auto_round.testing_utils import multi_card def get_accuracy(data): match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) @@ -35,12 +29,15 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @multi_card def test_multiple_card_calib(self): python_path = sys.executable ##test llm script res = os.system( f"cd .. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") + if res > 0 or res == -1: + assert False, "cmd line test fail, please have a check" if __name__ == "__main__": diff --git a/test_cuda/test_qbits.py b/test_cuda/test_qbits.py index 3f1f1d3f..d39f0562 100644 --- a/test_cuda/test_qbits.py +++ b/test_cuda/test_qbits.py @@ -7,9 +7,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRoundConfig, AutoRound +from auto_round.testing_utils import require_ipex, require_itrex, require_gptqmodel, require_old_version class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "/models/opt-125m" + self.save_folder = "./saved" + def model_infer(self, model, tokenizer): prompts = [ "Hello,my name is", @@ -43,6 +49,8 @@ def tearDownClass(self): ## require torch 2.6 + @require_itrex + @require_old_version def test_load_gptq_model_8bits(self): model_name = "acloudfan/opt-125m-gptq-8bit" quantization_config = AutoRoundConfig() @@ -52,7 +60,8 @@ def test_load_gptq_model_8bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - + @require_itrex + @require_old_version def test_load_gptq_model_2bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() @@ -62,7 +71,7 @@ def test_load_gptq_model_2bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - + @require_ipex def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -99,6 +108,7 @@ def test_mixed_precision(self): assert ("!!!" 
not in res) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_gptqmodel def test_autoround_sym(self): for bits in [4]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) diff --git a/test_cuda/test_support_vlms.py b/test_cuda/test_support_vlms.py index 4b76314d..008ca0d4 100644 --- a/test_cuda/test_support_vlms.py +++ b/test_cuda/test_support_vlms.py @@ -6,6 +6,7 @@ sys.path.insert(0, '..') from auto_round import AutoRoundConfig ## must import for auto-round format +from auto_round.testing_utils import require_gptqmodel, require_vlm_env import requests from PIL import Image @@ -13,7 +14,8 @@ class TestSupportVLMS(unittest.TestCase): @classmethod def setUpClass(self): - self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved") + # self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved") + self.save_dir = os.path.join(os.path.dirname("/data5/hengguo"), "ut_saved") self.python_path = sys.executable self.device = 0 @@ -21,6 +23,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree(self.save_dir, ignore_errors=True) + @require_gptqmodel def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune @@ -76,6 +79,7 @@ def test_qwen2(self): print(output_text[0]) shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_vlm_env def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune @@ -132,7 +136,8 @@ def test_phi3(self): clean_up_tokenization_spaces=False)[0] print(response) shutil.rmtree(quantized_model_path, ignore_errors=True) - + + @require_vlm_env def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune @@ -195,6 +200,7 @@ def test_phi3_vision_awq(self): print(response) shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_vlm_env def test_llava(self): model_path = "/models/llava-v1.5-7b/" ## test tune @@ -231,6 +237,7 @@ class DataArgs: print(tokenizer.batch_decode(output)) shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_gptqmodel def test_llama(self): model_path = "/models/Llama-3.2-11B-Vision-Instruct/" ## test tune diff --git a/test_cuda/test_triton_backend.py b/test_cuda/test_triton_backend.py index f87884f7..2db292b6 100644 --- a/test_cuda/test_triton_backend.py +++ b/test_cuda/test_triton_backend.py @@ -9,8 +9,8 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round import AutoRoundConfig +from auto_round import AutoRound, AutoRoundConfig +from auto_round.testing_utils import require_new_version class LLMDataLoader: @@ -62,6 +62,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_new_version def test_tritonv2_4bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -109,6 +110,7 @@ def test_tritonv2_4bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_tritonv2_2bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -153,6 +155,7 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + 
@require_new_version def test_tritonv2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -199,8 +202,9 @@ def test_tritonv2_4bits_sym(self): self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.26) torch.cuda.empty_cache() - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_tritonv2_8bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -247,6 +251,7 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_tritonv2_2bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) diff --git a/test_cuda/test_vlms.py b/test_cuda/test_vlms.py index c32a061a..9faddb05 100644 --- a/test_cuda/test_vlms.py +++ b/test_cuda/test_vlms.py @@ -1,15 +1,16 @@ +import re +import os +import sys import copy import shutil -import sys import unittest -import re -import os +import requests sys.path.insert(0, "..") from PIL import Image from auto_round import AutoRoundConfig -import requests +from auto_round.testing_utils import require_gptqmodel, require_vlm_env class TestAutoRound(unittest.TestCase): @@ -87,6 +88,7 @@ def qwen_inference(self, quantized_model_dir): ) print(output_text[0]) + @require_gptqmodel def test_vlm_tune(self): from auto_round import AutoRoundMLLM from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer @@ -161,6 +163,7 @@ def phi3_infernece(self, quantized_model_dir): print(response) + @require_vlm_env def test_quant_not_text(self): from auto_round import AutoRoundMLLM from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer @@ -183,6 +186,7 @@ def test_quant_not_text(self): self.phi3_infernece("./saved") shutil.rmtree("./saved", ignore_errors=True) + @require_vlm_env def test_quant_not_text_fp_layers(self): import os python_path = sys.executable @@ -194,6 +198,7 @@ def test_quant_not_text_fp_layers(self): self.phi3_infernece(absolute_path) shutil.rmtree(absolute_path, ignore_errors=True) + @require_vlm_env def test_mm_block_name(self): from auto_round.utils import get_block_names From 62415700126ccc3610844102d69b7cdc7c5ba0c3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 01:22:49 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/testing_utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py index 03d2e709..b6cb3461 100644 --- a/auto_round/testing_utils.py +++ b/auto_round/testing_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import unittest import importlib.util @@ -155,7 +169,7 @@ def require_vlm_env(test_case): """ Decorator marking a test that requires some special env to load vlm model. - These tests are skipped when not meet the environment requirments. + These tests are skipped when not meet the environment requirements. """ From ffcba1f5ba5b1c2ac862c515eaf68f5f01714581 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 12 May 2025 22:55:33 -0400 Subject: [PATCH 3/7] update Signed-off-by: n1ck-guo --- auto_round/script/llm.py | 10 +- auto_round/testing_utils.py | 23 +- docs/step_by_step.md | 2 +- test/{ => test_cpu}/_test_helpers.py | 0 test/{ => test_cpu}/conftest.py | 0 test/{ => test_cpu}/requirements.txt | 0 test/{ => test_cpu}/test_act_quantization.py | 0 .../test_auto_round_hpu_only.py | 0 test/{ => test_cpu}/test_autoopt.py | 0 test/{ => test_cpu}/test_autoround.py | 0 test/{ => test_cpu}/test_autoround_acc.py | 0 .../test_autoround_export_to_itrex.py | 0 test/{ => test_cpu}/test_basic_usage.py | 0 test/{ => test_cpu}/test_block_names.py | 0 test/{ => test_cpu}/test_calib_dataset.py | 0 test/{ => test_cpu}/test_conv1d.py | 0 test/{ => test_cpu}/test_export.py | 0 test/{ => test_cpu}/test_generation.py | 0 test/{ => test_cpu}/test_gguf_format.py | 0 test/{ => test_cpu}/test_hpu.py | 0 test/{ => test_cpu}/test_load_awq_gptq.py | 0 test/{ => test_cpu}/test_low_cpu_mem.py | 0 test/{ => test_cpu}/test_mllm.py | 0 test/test_cpu/test_script.py | 18 ++ test/{ => test_cpu}/test_utils.py | 0 test/{ => test_cpu}/test_woq_linear.py | 0 .../test_cuda}/_test_helpers.py | 0 .../test_cuda}/requirements.txt | 4 +- test/test_cuda/requirements_vlm.txt | 23 ++ {test_cuda => test/test_cuda}/test_2_3bits.py | 6 +- .../test_cuda}/test_auto_round_format.py | 11 +- .../test_cuda}/test_calib_dataset.py | 0 {test_cuda => test/test_cuda}/test_conv1d.py | 0 .../test_cuda}/test_exllamav2_backend.py | 0 {test_cuda => test/test_cuda}/test_export.py | 5 +- .../test_cuda}/test_get_block_name.py | 0 {test_cuda => test/test_cuda}/test_gguf.py | 0 .../test_cuda}/test_main_func.py | 5 +- .../test_cuda}/test_marlin_backend.py | 0 .../test_cuda}/test_multiple_card.py | 4 +- .../test_cuda}/test_multiple_card_calib.py | 0 {test_cuda => test/test_cuda}/test_qbits.py | 6 +- .../test_cuda}/test_support_vlms.py | 200 +++++++++--------- .../test_cuda}/test_triton_backend.py | 12 +- {test_cuda => test/test_cuda}/test_vlms.py | 3 +- {test_xpu => test/test_xpu}/test_autoround.py | 0 46 files changed, 192 insertions(+), 140 deletions(-) rename test/{ => test_cpu}/_test_helpers.py (100%) rename test/{ => test_cpu}/conftest.py (100%) rename test/{ => test_cpu}/requirements.txt (100%) rename test/{ => test_cpu}/test_act_quantization.py (100%) rename test/{ => test_cpu}/test_auto_round_hpu_only.py (100%) rename test/{ => test_cpu}/test_autoopt.py (100%) rename test/{ => test_cpu}/test_autoround.py (100%) rename test/{ => test_cpu}/test_autoround_acc.py (100%) rename test/{ => test_cpu}/test_autoround_export_to_itrex.py (100%) rename test/{ => test_cpu}/test_basic_usage.py (100%) rename test/{ => test_cpu}/test_block_names.py (100%) rename 
test/{ => test_cpu}/test_calib_dataset.py (100%) rename test/{ => test_cpu}/test_conv1d.py (100%) rename test/{ => test_cpu}/test_export.py (100%) rename test/{ => test_cpu}/test_generation.py (100%) rename test/{ => test_cpu}/test_gguf_format.py (100%) rename test/{ => test_cpu}/test_hpu.py (100%) rename test/{ => test_cpu}/test_load_awq_gptq.py (100%) rename test/{ => test_cpu}/test_low_cpu_mem.py (100%) rename test/{ => test_cpu}/test_mllm.py (100%) create mode 100644 test/test_cpu/test_script.py rename test/{ => test_cpu}/test_utils.py (100%) rename test/{ => test_cpu}/test_woq_linear.py (100%) rename {test_cuda => test/test_cuda}/_test_helpers.py (100%) rename {test_cuda => test/test_cuda}/requirements.txt (80%) create mode 100644 test/test_cuda/requirements_vlm.txt rename {test_cuda => test/test_cuda}/test_2_3bits.py (96%) rename {test_cuda => test/test_cuda}/test_auto_round_format.py (98%) rename {test_cuda => test/test_cuda}/test_calib_dataset.py (100%) rename {test_cuda => test/test_cuda}/test_conv1d.py (100%) rename {test_cuda => test/test_cuda}/test_exllamav2_backend.py (100%) rename {test_cuda => test/test_cuda}/test_export.py (98%) rename {test_cuda => test/test_cuda}/test_get_block_name.py (100%) rename {test_cuda => test/test_cuda}/test_gguf.py (100%) rename {test_cuda => test/test_cuda}/test_main_func.py (97%) rename {test_cuda => test/test_cuda}/test_marlin_backend.py (100%) rename {test_cuda => test/test_cuda}/test_multiple_card.py (98%) rename {test_cuda => test/test_cuda}/test_multiple_card_calib.py (100%) rename {test_cuda => test/test_cuda}/test_qbits.py (96%) rename {test_cuda => test/test_cuda}/test_support_vlms.py (73%) rename {test_cuda => test/test_cuda}/test_triton_backend.py (98%) rename {test_cuda => test/test_cuda}/test_vlms.py (99%) rename {test_xpu => test/test_xpu}/test_autoround.py (100%) diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 5a9594f6..01504914 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -370,7 +370,7 @@ def tune(args): if args.enable_torch_compile: logger.info("`torch.compile` is enabled to reduce tuning costs. 
" - "If it causes issues, you can disable it by remove `--enable_torch_compile` argument.") + "If it causes issues, you can disable it by removing `--enable_torch_compile` argument.") model_name = args.model if model_name[-1] == "/": @@ -582,7 +582,7 @@ def tune(args): device=device_str, eval_model_dtype=eval_model_dtype) print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) else: if args.eval_task_by_task: eval_task_by_task( @@ -599,7 +599,7 @@ def tune(args): res = simple_evaluate( model="hf", model_args=model_args, tasks=tasks, device=device_str, batch_size=args.eval_bs) print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype="auto"): @@ -657,14 +657,14 @@ def eval(args): res = simple_evaluate_user_model( model, tokenizer, tasks=tasks, batch_size=batch_size, device=device_str) print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) else: st = time.time() res = simple_evaluate( model="hf", model_args=model_args, tasks=tasks, device=device_str, batch_size=args.eval_bs) from lm_eval.utils import make_table # pylint: disable=E0401 print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) def eval_task_by_task( diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py index b6cb3461..b1ace9a4 100644 --- a/auto_round/testing_utils.py +++ b/auto_round/testing_utils.py @@ -23,10 +23,10 @@ def is_gguf_available(): return importlib.util.find_spec("gguf") is not None def is_autogptq_available(): - return importlib.util.find_spec("auto-gptq") is not None + return importlib.util.find_spec("auto_gptq") is not None def is_awq_available(): - return importlib.util.find_spec("autoawq") is not None + return importlib.util.find_spec("awq") is not None def is_optimum_available(): return importlib.util.find_spec("optimum") is not None @@ -39,10 +39,10 @@ def is_ipex_available(): return False def is_itrex_available(): - return importlib.util.find_spec("intel-extension-for-transformers") is not None + return importlib.util.find_spec("intel_extension_for_transformers") is not None def is_flash_attn_avaliable(): - return importlib.util.find_spec("flash-attn") is not None + return importlib.util.find_spec("flash_attn") is not None def is_gptqmodel_available(): try: @@ -51,7 +51,7 @@ def is_gptqmodel_available(): except ImportError: return False -def is_new_version(): +def greater_than_050(): try: require_version("auto-round>=0.5.0") return True @@ -86,7 +86,7 @@ def require_gptqmodel(test_case): These tests are skipped when gptqmodel isn't installed. """ - return unittest.skipUnless(is_autogptq_available(), "test requires gptqmodel>=2.0")(test_case) + return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel>=2.0")(test_case) def require_awq(test_case): @@ -128,14 +128,14 @@ def require_optimum(test_case): return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case) -def require_new_version(test_case): +def require_greater_than_050(test_case): """ Decorator marking a test that requires auto-round>=0.5.0. These tests are skipped when auto-round<0.5.0. 
""" - return unittest.skipUnless(is_new_version(), "test requires auto-round>=0.5.0")(test_case) + return unittest.skipUnless(greater_than_050(), "test requires auto-round>=0.5.0")(test_case) def multi_card(test_case): @@ -177,9 +177,14 @@ def require_vlm_env(test_case): # pip install flash-attn --no-build-isolation env_check &= is_flash_attn_avaliable() - # git clone https://github.com/haotian-liu/LLaVA.git && cd LLaVA && pip install -e . + # pip install git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 env_check &= importlib.util.find_spec("llava") is not None + # pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git + env_check &= importlib.util.find_spec("deepseek_vl2") is not None + + env_check &= importlib.util.find_spec("xformers") is not None + return unittest.skipUnless(env_check, "Environment is not satisfactory")(test_case) diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 2be87282..eb981bb3 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -101,7 +101,7 @@ CPU, Intel GPU, HPU,and CUDA for both quantization and inference. This setting provides the best accuracy in most scenarios but is 4–5× slower than the standard AutoRound recipe. It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available. ```bash - auto-round-best --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round" + auto-round-best --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round" ``` - **Light Settings:** diff --git a/test/_test_helpers.py b/test/test_cpu/_test_helpers.py similarity index 100% rename from test/_test_helpers.py rename to test/test_cpu/_test_helpers.py diff --git a/test/conftest.py b/test/test_cpu/conftest.py similarity index 100% rename from test/conftest.py rename to test/test_cpu/conftest.py diff --git a/test/requirements.txt b/test/test_cpu/requirements.txt similarity index 100% rename from test/requirements.txt rename to test/test_cpu/requirements.txt diff --git a/test/test_act_quantization.py b/test/test_cpu/test_act_quantization.py similarity index 100% rename from test/test_act_quantization.py rename to test/test_cpu/test_act_quantization.py diff --git a/test/test_auto_round_hpu_only.py b/test/test_cpu/test_auto_round_hpu_only.py similarity index 100% rename from test/test_auto_round_hpu_only.py rename to test/test_cpu/test_auto_round_hpu_only.py diff --git a/test/test_autoopt.py b/test/test_cpu/test_autoopt.py similarity index 100% rename from test/test_autoopt.py rename to test/test_cpu/test_autoopt.py diff --git a/test/test_autoround.py b/test/test_cpu/test_autoround.py similarity index 100% rename from test/test_autoround.py rename to test/test_cpu/test_autoround.py diff --git a/test/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py similarity index 100% rename from test/test_autoround_acc.py rename to test/test_cpu/test_autoround_acc.py diff --git a/test/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py similarity index 100% rename from test/test_autoround_export_to_itrex.py rename to test/test_cpu/test_autoround_export_to_itrex.py diff --git a/test/test_basic_usage.py b/test/test_cpu/test_basic_usage.py similarity index 100% rename from test/test_basic_usage.py rename to test/test_cpu/test_basic_usage.py diff --git a/test/test_block_names.py b/test/test_cpu/test_block_names.py similarity index 100% rename from test/test_block_names.py rename to test/test_cpu/test_block_names.py 
diff --git a/test/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py similarity index 100% rename from test/test_calib_dataset.py rename to test/test_cpu/test_calib_dataset.py diff --git a/test/test_conv1d.py b/test/test_cpu/test_conv1d.py similarity index 100% rename from test/test_conv1d.py rename to test/test_cpu/test_conv1d.py diff --git a/test/test_export.py b/test/test_cpu/test_export.py similarity index 100% rename from test/test_export.py rename to test/test_cpu/test_export.py diff --git a/test/test_generation.py b/test/test_cpu/test_generation.py similarity index 100% rename from test/test_generation.py rename to test/test_cpu/test_generation.py diff --git a/test/test_gguf_format.py b/test/test_cpu/test_gguf_format.py similarity index 100% rename from test/test_gguf_format.py rename to test/test_cpu/test_gguf_format.py diff --git a/test/test_hpu.py b/test/test_cpu/test_hpu.py similarity index 100% rename from test/test_hpu.py rename to test/test_cpu/test_hpu.py diff --git a/test/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py similarity index 100% rename from test/test_load_awq_gptq.py rename to test/test_cpu/test_load_awq_gptq.py diff --git a/test/test_low_cpu_mem.py b/test/test_cpu/test_low_cpu_mem.py similarity index 100% rename from test/test_low_cpu_mem.py rename to test/test_cpu/test_low_cpu_mem.py diff --git a/test/test_mllm.py b/test/test_cpu/test_mllm.py similarity index 100% rename from test/test_mllm.py rename to test/test_cpu/test_mllm.py diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py new file mode 100644 index 00000000..32662134 --- /dev/null +++ b/test/test_cpu/test_script.py @@ -0,0 +1,18 @@ +import os +import sys +import unittest + +sys.path.insert(0, "..") + + +class TestScript(unittest.TestCase): + def test_default(self): + os.system(''' + cd .. && + python -m auto_round + --iters 2 + --deployment_device fake + --output_dir ./tmp_script_test''') + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/test_utils.py b/test/test_cpu/test_utils.py similarity index 100% rename from test/test_utils.py rename to test/test_cpu/test_utils.py diff --git a/test/test_woq_linear.py b/test/test_cpu/test_woq_linear.py similarity index 100% rename from test/test_woq_linear.py rename to test/test_cpu/test_woq_linear.py diff --git a/test_cuda/_test_helpers.py b/test/test_cuda/_test_helpers.py similarity index 100% rename from test_cuda/_test_helpers.py rename to test/test_cuda/_test_helpers.py diff --git a/test_cuda/requirements.txt b/test/test_cuda/requirements.txt similarity index 80% rename from test_cuda/requirements.txt rename to test/test_cuda/requirements.txt index 00ae08a8..fb008694 100644 --- a/test_cuda/requirements.txt +++ b/test/test_cuda/requirements.txt @@ -4,7 +4,7 @@ auto-gptq datasets einops gptqmodel>=2.0 -intel-extension-for-pytorch>=2.5 +intel-extension-for-pytorch==2.6.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 numpy < 2.0 @@ -12,7 +12,7 @@ optimum pandas pillow py-cpuinfo -torch +torch<2.7.0 torchvision tqdm transformers diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt new file mode 100644 index 00000000..46d8220d --- /dev/null +++ b/test/test_cuda/requirements_vlm.txt @@ -0,0 +1,23 @@ +# git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 +# git clone https://github.com/deepseek-ai/DeepSeek-VL2.git && cd DeepSeek-VL2 && pip install -e . 
+accelerate +autoawq +auto-gptq +bitsandbytes==0.44.0 +datasets +einops +flash-attn==2.6.1 +gptqmodel>=2.0 +intel-extension-for-pytorch==2.6.0 +intel-extension-for-transformers +lm-eval>=0.4.2,<0.5 +numpy < 2.0 +optimum +pandas +pillow +py-cpuinfo +torch==2.0.1 +torchvision==0.15.2 +triton==2.0.0 +tqdm +transformers==4.45.0 diff --git a/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py similarity index 96% rename from test_cuda/test_2_3bits.py rename to test/test_cuda/test_2_3bits.py index 82a84b12..5f49c254 100644 --- a/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -12,7 +12,7 @@ from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate from lm_eval.utils import make_table # pylint: disable=E0401 -from auto_round.testing_utils import require_autogptq, require_new_version +from auto_round.testing_utils import require_autogptq, require_greater_than_050 def get_accuracy(data): @@ -57,7 +57,7 @@ def test_3bits_autoround(self): assert accuracy > 0.3 shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_norm_bias_tuning(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -76,7 +76,7 @@ def test_norm_bias_tuning(self): assert accuracy > 0.18 shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_2bits_autoround(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py similarity index 98% rename from test_cuda/test_auto_round_format.py rename to test/test_cuda/test_auto_round_format.py index 7b995e36..5fabe1bd 100644 --- a/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -5,7 +5,7 @@ sys.path.insert(0, "..") from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_new_version, require_autogptq, require_awq, require_ipex +from auto_round.testing_utils import require_greater_than_050, require_autogptq, require_awq, require_ipex import torch import transformers @@ -76,7 +76,7 @@ def tearDownClass(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_autoround_asym(self): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -184,7 +184,7 @@ def test_awq_backend(self): self.model_infer(model, tokenizer) shutil.rmtree(self.save_folder, ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_bf16(self): model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="tritonv2") @@ -251,6 +251,7 @@ def test_autoround_gptq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_awq + @require_ipex def test_autoround_awq_sym_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -288,7 +289,7 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_autoround_sym(self): for bits in 
[2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -318,7 +319,7 @@ def test_autoround_sym(self): assert ("!!!" not in res) shutil.rmtree(self.save_folder, ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_load_gptq_model_3bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() diff --git a/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py similarity index 100% rename from test_cuda/test_calib_dataset.py rename to test/test_cuda/test_calib_dataset.py diff --git a/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py similarity index 100% rename from test_cuda/test_conv1d.py rename to test/test_cuda/test_conv1d.py diff --git a/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py similarity index 100% rename from test_cuda/test_exllamav2_backend.py rename to test/test_cuda/test_exllamav2_backend.py diff --git a/test_cuda/test_export.py b/test/test_cuda/test_export.py similarity index 98% rename from test_cuda/test_export.py rename to test/test_cuda/test_export.py index 3ac32e6c..62e75225 100644 --- a/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -208,6 +208,7 @@ def test_autoawq_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_optimum + @require_awq def test_autoawq_format_fp_qsave_layers(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, @@ -225,8 +226,8 @@ def test_autoawq_format_fp_qsave_layers(self): dataset=self.llm_dataloader, layer_config=layer_config ) - quantized_model_path = "/data5/wenhuach/test_export" - autoround.qsave(output_dir=quantized_model_path, + quantized_model_path = "./saved/test_export" + autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") from auto_round import AutoRoundConfig model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", diff --git a/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py similarity index 100% rename from test_cuda/test_get_block_name.py rename to test/test_cuda/test_get_block_name.py diff --git a/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py similarity index 100% rename from test_cuda/test_gguf.py rename to test/test_cuda/test_gguf.py diff --git a/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py similarity index 97% rename from test_cuda/test_main_func.py rename to test/test_cuda/test_main_func.py index f879fbe8..be07692b 100644 --- a/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -11,7 +11,7 @@ from auto_round import AutoRound, AutoRoundAdam from auto_round.eval.evaluation import simple_evaluate -from auto_round.testing_utils import require_gptqmodel +from auto_round.testing_utils import require_gptqmodel, require_optimum, require_awq from lm_eval.utils import make_table # pylint: disable=E0401 @@ -37,6 +37,8 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel + @require_optimum + @require_awq def test_backend(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -79,6 +81,7 @@ def test_backend(self): @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") @require_gptqmodel + @require_awq def 
test_fp_layers(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py similarity index 100% rename from test_cuda/test_marlin_backend.py rename to test/test_cuda/test_marlin_backend.py diff --git a/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py similarity index 98% rename from test_cuda/test_multiple_card.py rename to test/test_cuda/test_multiple_card.py index d908606a..8c0e5e7f 100644 --- a/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -10,7 +10,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate -from auto_round.testing_utils import multi_card, require_new_version, require_gptqmodel +from auto_round.testing_utils import multi_card, require_greater_than_050, require_gptqmodel def get_accuracy(data): @@ -201,7 +201,7 @@ def test_device_map(self): torch.cuda.empty_cache() @multi_card - @require_new_version + @require_greater_than_050 def test_device_map_for_triton(self): from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" diff --git a/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py similarity index 100% rename from test_cuda/test_multiple_card_calib.py rename to test/test_cuda/test_multiple_card_calib.py diff --git a/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py similarity index 96% rename from test_cuda/test_qbits.py rename to test/test_cuda/test_qbits.py index d39f0562..f391f323 100644 --- a/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -7,7 +7,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRoundConfig, AutoRound -from auto_round.testing_utils import require_ipex, require_itrex, require_gptqmodel, require_old_version +from auto_round.testing_utils import require_itrex, require_gptqmodel class TestAutoRound(unittest.TestCase): @@ -50,7 +50,6 @@ def tearDownClass(self): ## require torch 2.6 @require_itrex - @require_old_version def test_load_gptq_model_8bits(self): model_name = "acloudfan/opt-125m-gptq-8bit" quantization_config = AutoRoundConfig() @@ -61,7 +60,6 @@ def test_load_gptq_model_8bits(self): self.model_infer(model, tokenizer) @require_itrex - @require_old_version def test_load_gptq_model_2bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() @@ -71,7 +69,7 @@ def test_load_gptq_model_2bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - @require_ipex + @require_itrex def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) diff --git a/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py similarity index 73% rename from test_cuda/test_support_vlms.py rename to test/test_cuda/test_support_vlms.py index 008ca0d4..54fe86a0 100644 --- a/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -24,60 +24,60 @@ def tearDownClass(self): shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel - def test_qwen2(self): - model_path = "/models/Qwen2-VL-2B-Instruct/" 
- # test tune - res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " - f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") - - # test infer - quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128-auto_round") + # def test_qwen2(self): + # model_path = "/models/Qwen2-VL-2B-Instruct/" + # # test tune + # res = os.system( + # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") + # self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + + # # test infer + # quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128-auto_round") - from transformers import Qwen2VLForConditionalGeneration, AutoProcessor - model = Qwen2VLForConditionalGeneration.from_pretrained( - quantized_model_path, - torch_dtype="float16", - device_map=f"cuda:{self.device}", - ) - processor = AutoProcessor.from_pretrained(quantized_model_path) - image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - messages = [ - { - "role": "user", - "content": [ - { - "type": "image", - "image": image_url, - }, - {"type": "text", "text": "Describe this image."}, - ], - } - ] - - # Preparation for inference - text = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - image_inputs = Image.open(requests.get(image_url, stream=True).raw) - inputs = processor( - text=[text], - images=image_inputs, - padding=True, - return_tensors="pt", - ) - inputs = inputs.to(model.device) - - generated_ids = model.generate(**inputs, max_new_tokens=128) - generated_ids_trimmed = [ - out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - print(output_text[0]) - shutil.rmtree(quantized_model_path, ignore_errors=True) + # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor + # model = Qwen2VLForConditionalGeneration.from_pretrained( + # quantized_model_path, + # torch_dtype="float16", + # device_map=f"cuda:{self.device}", + # ) + # processor = AutoProcessor.from_pretrained(quantized_model_path) + # image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" + # messages = [ + # { + # "role": "user", + # "content": [ + # { + # "type": "image", + # "image": image_url, + # }, + # {"type": "text", "text": "Describe this image."}, + # ], + # } + # ] + + # # Preparation for inference + # text = processor.apply_chat_template( + # messages, tokenize=False, add_generation_prompt=True + # ) + # image_inputs = Image.open(requests.get(image_url, stream=True).raw) + # inputs = processor( + # text=[text], + # images=image_inputs, + # padding=True, + # return_tensors="pt", + # ) + # inputs = inputs.to(model.device) + + # generated_ids = model.generate(**inputs, max_new_tokens=128) + # generated_ids_trimmed = [ + # out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + # ] + # output_text = processor.batch_decode( + # generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + # ) + # print(output_text[0]) + # shutil.rmtree(quantized_model_path, ignore_errors=True) @require_vlm_env def test_phi3(self): @@ -238,45 +238,46 @@ class DataArgs: shutil.rmtree(quantized_model_path, 
ignore_errors=True) @require_gptqmodel - def test_llama(self): - model_path = "/models/Llama-3.2-11B-Vision-Instruct/" - ## test tune - res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " - f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") - self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail") - - ## test infer - from transformers import MllamaForConditionalGeneration, AutoProcessor - quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128-auto_round") - model = MllamaForConditionalGeneration.from_pretrained( - quantized_model_path, - torch_dtype="float16", - device_map=f"cuda:{self.device}", - ) - processor = AutoProcessor.from_pretrained(quantized_model_path) - image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - messages = [ - {"role": "user", "content": [ - {"type": "image"}, - {"type": "text", "text": "Please write a haiku for this one, it would be: "} - ]} - ] - - # Preparation for inference - image = Image.open(requests.get(image_url, stream=True).raw) - input_text = processor.apply_chat_template(messages, add_generation_prompt=True) - inputs = processor( - image, - input_text, - add_special_tokens=False, - return_tensors="pt" - ).to(model.device) - - output = model.generate(**inputs, max_new_tokens=50) - print(processor.decode(output[0])) - shutil.rmtree(quantized_model_path, ignore_errors=True) + # def test_llama(self): + # model_path = "/models/Llama-3.2-11B-Vision-Instruct/" + # ## test tune + # res = os.system( + # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") + # self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail") + + # ## test infer + # from transformers import MllamaForConditionalGeneration, AutoProcessor + # quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128-auto_round") + # model = MllamaForConditionalGeneration.from_pretrained( + # quantized_model_path, + # torch_dtype="float16", + # device_map=f"cuda:{self.device}", + # ) + # processor = AutoProcessor.from_pretrained(quantized_model_path) + # image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + # messages = [ + # {"role": "user", "content": [ + # {"type": "image"}, + # {"type": "text", "text": "Please write a haiku for this one, it would be: "} + # ]} + # ] + + # # Preparation for inference + # image = Image.open(requests.get(image_url, stream=True).raw) + # input_text = processor.apply_chat_template(messages, add_generation_prompt=True) + # inputs = processor( + # image, + # input_text, + # add_special_tokens=False, + # return_tensors="pt" + # ).to(model.device) + + # output = model.generate(**inputs, max_new_tokens=50) + # print(processor.decode(output[0])) + # shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_vlm_env def test_cogvlm(self): model_path = "/models/cogvlm2-llama3-chat-19B/" ## test tune @@ -331,15 +332,16 @@ def test_cogvlm(self): print(response) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_72b(self): - model_path = "/models/Qwen2-VL-72B-Instruct/" - res = os.system( - f"cd .. 
&& {self.python_path} -m auto_round --mllm " - f"--model {model_path} --iter 1 --nsamples 1 --bs 1 --output_dir {self.save_dir} --device {self.device}" - ) - self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail") - shutil.rmtree(self.save_dir, ignore_errors=True) + # def test_72b(self): + # model_path = "/models/Qwen2-VL-72B-Instruct/" + # res = os.system( + # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"--model {model_path} --iter 1 --nsamples 1 --bs 1 --output_dir {self.save_dir} --device {self.device}" + # ) + # self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail") + # shutil.rmtree(self.save_dir, ignore_errors=True) + @require_vlm_env def test_deepseek_vl2(self): model_path = "/models/deepseek-vl2-tiny" res = os.system( diff --git a/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py similarity index 98% rename from test_cuda/test_triton_backend.py rename to test/test_cuda/test_triton_backend.py index 2db292b6..bac83243 100644 --- a/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -10,7 +10,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig -from auto_round.testing_utils import require_new_version +from auto_round.testing_utils import require_greater_than_050 class LLMDataLoader: @@ -62,7 +62,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_4bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -110,7 +110,7 @@ def test_tritonv2_4bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_2bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -155,7 +155,7 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -204,7 +204,7 @@ def test_tritonv2_4bits_sym(self): shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_8bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -251,7 +251,7 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_2bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) diff --git a/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py similarity index 99% rename from test_cuda/test_vlms.py rename to test/test_cuda/test_vlms.py index 9faddb05..7917ffc5 100644 --- 
a/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -10,7 +10,7 @@ from PIL import Image from auto_round import AutoRoundConfig -from auto_round.testing_utils import require_gptqmodel, require_vlm_env +from auto_round.testing_utils import require_gptqmodel, require_vlm_env, require_optimum class TestAutoRound(unittest.TestCase): @@ -89,6 +89,7 @@ def qwen_inference(self, quantized_model_dir): print(output_text[0]) @require_gptqmodel + @require_optimum def test_vlm_tune(self): from auto_round import AutoRoundMLLM from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer diff --git a/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py similarity index 100% rename from test_xpu/test_autoround.py rename to test/test_xpu/test_autoround.py From 94a88196cc167b43a89dd924c9651c3535375728 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 13 May 2025 01:21:13 -0400 Subject: [PATCH 4/7] fix Signed-off-by: n1ck-guo --- test/test_cpu/test_act_quantization.py | 2 +- test/test_cpu/test_autoopt.py | 2 +- test/test_cpu/test_autoround.py | 2 +- test/test_cpu/test_autoround_acc.py | 2 +- .../test_autoround_export_to_itrex.py | 2 +- test/test_cpu/test_basic_usage.py | 20 +++++++++---------- test/test_cpu/test_block_names.py | 4 ++-- test/test_cpu/test_calib_dataset.py | 2 +- test/test_cpu/test_conv1d.py | 2 +- test/test_cpu/test_export.py | 2 +- test/test_cpu/test_generation.py | 2 +- test/test_cpu/test_gguf_format.py | 6 +++--- test/test_cpu/test_hpu.py | 2 +- test/test_cpu/test_load_awq_gptq.py | 2 +- test/test_cpu/test_low_cpu_mem.py | 2 +- test/test_cpu/test_mllm.py | 2 +- test/test_cpu/test_script.py | 4 ++-- test/test_cpu/test_utils.py | 2 ++ test/test_cpu/test_woq_linear.py | 3 ++- test/test_cuda/requirements_vlm.txt | 4 +--- test/test_cuda/test_2_3bits.py | 2 +- test/test_cuda/test_auto_round_format.py | 2 +- test/test_cuda/test_calib_dataset.py | 2 +- test/test_cuda/test_conv1d.py | 2 +- test/test_cuda/test_exllamav2_backend.py | 2 +- test/test_cuda/test_export.py | 2 +- test/test_cuda/test_get_block_name.py | 2 +- test/test_cuda/test_gguf.py | 6 +++--- test/test_cuda/test_main_func.py | 2 +- test/test_cuda/test_marlin_backend.py | 2 +- test/test_cuda/test_multiple_card.py | 2 +- test/test_cuda/test_multiple_card_calib.py | 4 ++-- test/test_cuda/test_qbits.py | 2 +- test/test_cuda/test_support_vlms.py | 18 ++++++++--------- test/test_cuda/test_triton_backend.py | 2 +- test/test_cuda/test_vlms.py | 2 +- test/test_xpu/test_autoround.py | 2 +- 37 files changed, 63 insertions(+), 62 deletions(-) diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index a4ada07d..e72c9931 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index 3ea4565d..6a986706 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 77ad0dac..02352c00 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -5,7 +5,7 @@ 
from auto_round.eval.evaluation import simple_evaluate_user_model -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 37229b1c..1158e9be 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -3,7 +3,7 @@ import shutil import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from math import isclose diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py index ba5424fd..7e894b63 100644 --- a/test/test_cpu/test_autoround_export_to_itrex.py +++ b/test/test_cpu/test_autoround_export_to_itrex.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_basic_usage.py b/test/test_cpu/test_basic_usage.py index 695271be..eeedbcd6 100644 --- a/test/test_cpu/test_basic_usage.py +++ b/test/test_cpu/test_basic_usage.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, '..') +sys.path.insert(0, '../..') class TestAutoRoundCmd(unittest.TestCase): @@ -21,52 +21,52 @@ def test_auto_round_cmd(self): ##test llm script # res = os.system( - # f"cd .. && {python_path} -m auto_round -h") + # f"cd ../.. && {python_path} -m auto_round -h") # if res > 0 or res == -1: # assert False, "cmd line test fail, please have a check" # res = os.system( - f"cd .. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa") + f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai") + f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test mllm script # test auto_round_mllm help res = os.system( - f"cd .. && {python_path} -m auto_round --mllm -h") + f"cd ../.. && {python_path} -m auto_round --mllm -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --eval help res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --eval -h") + f"cd ../.. && {python_path} -m auto_round --mllm --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --lmms -h") + f"cd ../.. 
&& {python_path} -m auto_round --mllm --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 32 --format auto_round --output_dir ./saved") + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 32 --format auto_round --output_dir ./saved") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 256 --format auto_round" + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 256 --format auto_round" " --quant_nontext_module --output_dir ./saved ") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 7c618b68..2e26565f 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -3,7 +3,7 @@ import sys import unittest sys.path.insert(0, ".") -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import torch.nn as nn from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig @@ -203,7 +203,7 @@ def test_moe(self): ##tokenizer = AutoTokenizer.from_pretrained(model_name) # python_path = sys.executable # res = os.system( - # f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iter 1 --nsamples 1 --format auto_round --output_dir test/saved --disable_eval") + # f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iter 1 --nsamples 1 --format auto_round --output_dir test/saved --disable_eval") # if res > 0 or res == -1: # assert False, "cmd line test fail, please have a check" # diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index 57990d9b..5dea5bf0 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import json import torch diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index 4f45fcbd..cfcac0bd 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 55cecd37..fd751c3c 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index e2a14097..0ca6390e 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c85eded5..b353232a 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -2,7 +2,7 @@ import sys import unittest import shutil -sys.path.insert(0, "..") +sys.path.insert(0, "../..") 
import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -35,7 +35,7 @@ def tearDownClass(self): def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" + f"cd ../.. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" f" --tasks piqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -43,7 +43,7 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name}" + f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_hpu.py b/test/test_cpu/test_hpu.py index f7c87f0d..629a9321 100644 --- a/test/test_cpu/test_hpu.py +++ b/test/test_cpu/test_hpu.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 6db06c03..08eb8d75 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_low_cpu_mem.py b/test/test_cpu/test_low_cpu_mem.py index 737fc164..2c4378f0 100644 --- a/test/test_cpu/test_low_cpu_mem.py +++ b/test/test_cpu/test_low_cpu_mem.py @@ -2,7 +2,7 @@ import sys import os import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index fd6d996b..16005241 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -1,7 +1,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round import AutoRoundMLLM diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py index 32662134..069a59ec 100644 --- a/test/test_cpu/test_script.py +++ b/test/test_cpu/test_script.py @@ -2,13 +2,13 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") class TestScript(unittest.TestCase): def test_default(self): os.system(''' - cd .. && + cd ../.. 
&& python -m auto_round --iters 2 --deployment_device fake diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index e9faedbe..eff324e4 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,4 +1,6 @@ from unittest.mock import patch +import sys +sys.path.insert(0, "../..") import auto_round.utils as auto_round_utils class TestPackingWithNumba: diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index f049890e..1f48e230 100644 --- a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,6 +1,7 @@ import pytest import torch - +import sys +sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt index 46d8220d..707b14ea 100644 --- a/test/test_cuda/requirements_vlm.txt +++ b/test/test_cuda/requirements_vlm.txt @@ -1,16 +1,14 @@ # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 # git clone https://github.com/deepseek-ai/DeepSeek-VL2.git && cd DeepSeek-VL2 && pip install -e . accelerate -autoawq -auto-gptq bitsandbytes==0.44.0 datasets einops flash-attn==2.6.1 -gptqmodel>=2.0 intel-extension-for-pytorch==2.6.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 +nvidia-cudnn-cu12==8.9.7.29 numpy < 2.0 optimum pandas diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 5f49c254..51eec8b7 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -4,7 +4,7 @@ import unittest import re -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 5fabe1bd..817b5087 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050, require_autogptq, require_awq, require_ipex diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index a4d8a73e..69479da1 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import json import torch diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index 79dad6d6..bf0daba3 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index de243219..151cf690 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -2,7 +2,7 @@ import sys import unittest import pytest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 62e75225..bbd0b5f5 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch 
import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index f077a65d..fd89e2aa 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoModelForVision2Seq, \ diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index f31de413..43044672 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -60,7 +60,7 @@ def test_gguf_format(self): save_dir = os.path.join(os.path.dirname(__file__), "saved") model_path = "Qwen/Qwen2.5-0.5B-Instruct" res = os.system( - f"cd .. && {sys.executable} -m auto_round --model {model_path} --iter 2 " + f"cd ../.. && {sys.executable} -m auto_round --model {model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) @@ -110,7 +110,7 @@ def test_q2_k_export(self): def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" + f"cd ../.. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index be07692b..a3453338 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -4,7 +4,7 @@ import unittest import re -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index a800eb00..cb3516ca 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -3,7 +3,7 @@ import unittest import pytest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 8c0e5e7f..7f2dcd11 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -2,7 +2,7 @@ import sys import unittest import shutil -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 2d7ff712..63a494f7 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -4,7 +4,7 @@ import shutil import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round.testing_utils import multi_card @@ -35,7 +35,7 @@ def test_multiple_card_calib(self): ##test llm script res = os.system( - f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") + f"cd ../.. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index f391f323..8b79d97b 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 54fe86a0..fe424d20 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -3,7 +3,7 @@ import shutil import unittest -sys.path.insert(0, '..') +sys.path.insert(0, '../..') from auto_round import AutoRoundConfig ## must import for auto-round format from auto_round.testing_utils import require_gptqmodel, require_vlm_env @@ -28,7 +28,7 @@ def tearDownClass(self): # model_path = "/models/Qwen2-VL-2B-Instruct/" # # test tune # res = os.system( - # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"cd ../.. && {self.python_path} -m auto_round --mllm " # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") # self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") @@ -84,7 +84,7 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") @@ -142,7 +142,7 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}") @@ -205,7 +205,7 @@ def test_llava(self): model_path = "/models/llava-v1.5-7b/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") self.assertFalse(res > 0 or res == -1, msg="llava-v1.5-7b tuning fail") @@ -242,7 +242,7 @@ class DataArgs: # model_path = "/models/Llama-3.2-11B-Vision-Instruct/" # ## test tune # res = os.system( - # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"cd ../.. && {self.python_path} -m auto_round --mllm " # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") # self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail") @@ -282,7 +282,7 @@ def test_cogvlm(self): model_path = "/models/cogvlm2-llama3-chat-19B/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") self.assertFalse(res > 0 or res == -1, msg="cogvlm2 tuning fail") @@ -335,7 +335,7 @@ def test_cogvlm(self): # def test_72b(self): # model_path = "/models/Qwen2-VL-72B-Instruct/" # res = os.system( - # f"cd .. 
&& {self.python_path} -m auto_round --mllm " + # f"cd ../.. && {self.python_path} -m auto_round --mllm " # f"--model {model_path} --iter 1 --nsamples 1 --bs 1 --output_dir {self.save_dir} --device {self.device}" # ) # self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail") @@ -345,7 +345,7 @@ def test_cogvlm(self): def test_deepseek_vl2(self): model_path = "/models/deepseek-vl2-tiny" res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 3 --nsamples 10 --bs 4 --output_dir {self.save_dir} --device auto --group_size 32 " f"--fp_layers language.model.layers.4,language.model.layers.6" ) diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index bac83243..ddb24eca 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round.eval.evaluation import simple_evaluate_user_model import torch diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index 7917ffc5..ccc24a4f 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -6,7 +6,7 @@ import unittest import requests -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from PIL import Image from auto_round import AutoRoundConfig diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index d0ab1dd9..caad1663 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer From ee2c0b5510f3807c67e26021cdfb96ac2d9ff56f Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 13 May 2025 14:52:10 +0800 Subject: [PATCH 5/7] fix ci path Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_ut.sh | 13 +++++++------ .azure-pipelines/scripts/ut/run_ut_hpu.sh | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh index 91509b25..b482f29b 100644 --- a/.azure-pipelines/scripts/ut/run_ut.sh +++ b/.azure-pipelines/scripts/ut/run_ut.sh @@ -2,16 +2,17 @@ set -xe # install requirements -echo "set up UT env..." +echo "##[group]set up UT env..." export TQDM_MININTERVAL=60 -export TQDM_POSITION=-1 pip install pytest-cov pytest-html -pip install -r /auto-round/test/requirements.txt -pip list +pip install -r /auto-round/test/test_cpu/requirements.txt + # install latest gguf for ut test git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install . +echo "##[endgroup]" +pip list -cd /auto-round/test || exit 1 +cd /auto-round/test/test_cpu || exit 1 find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + export LD_LIBRARY_PATH=${HOME}/.local/lib/:$LD_LIBRARY_PATH @@ -31,7 +32,7 @@ cp report.html ${LOG_DIR}/ cp coverage.xml ${LOG_DIR}/ if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then - echo "Find errors in pytest case, please check the output..." + echo "##[error]Find errors in pytest case, please check the output..." 
exit 1 fi diff --git a/.azure-pipelines/scripts/ut/run_ut_hpu.sh b/.azure-pipelines/scripts/ut/run_ut_hpu.sh index 750562c2..ec2ad0d2 100644 --- a/.azure-pipelines/scripts/ut/run_ut_hpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_hpu.sh @@ -6,7 +6,7 @@ echo "set up UT env..." pip install pytest-cov pytest-html pip list -cd /auto-round/test || exit 1 +cd /auto-round/test/test_cpu || exit 1 find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH @@ -31,7 +31,7 @@ cp report.html ${LOG_DIR}/ cp coverage.xml ${LOG_DIR}/ if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then - echo "Find errors in pytest case, please check the output..." + echo "##[error]Find errors in pytest case, please check the output..." exit 1 fi From bf1d495fd5cadc78f85bad70ccb38ddb11057c9d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 13 May 2025 21:26:48 -0400 Subject: [PATCH 6/7] update requirements for vlm Signed-off-by: n1ck-guo --- test/test_cuda/requirements_vlm.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt index 707b14ea..ac0034e4 100644 --- a/test/test_cuda/requirements_vlm.txt +++ b/test/test_cuda/requirements_vlm.txt @@ -8,14 +8,13 @@ flash-attn==2.6.1 intel-extension-for-pytorch==2.6.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 -nvidia-cudnn-cu12==8.9.7.29 numpy < 2.0 optimum pandas pillow py-cpuinfo -torch==2.0.1 -torchvision==0.15.2 -triton==2.0.0 +torch==2.3.0 +torchvision +triton==2.3.0 tqdm transformers==4.45.0 From 2b85ac258329ee09bb5b598180dc9b226e6ef9ed Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 May 2025 01:17:58 -0400 Subject: [PATCH 7/7] update; Signed-off-by: n1ck-guo --- test/test_cuda/requirements_vlm.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt index ac0034e4..bd3cafeb 100644 --- a/test/test_cuda/requirements_vlm.txt +++ b/test/test_cuda/requirements_vlm.txt @@ -1,11 +1,11 @@ # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 -# git clone https://github.com/deepseek-ai/DeepSeek-VL2.git && cd DeepSeek-VL2 && pip install -e . +# pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git accelerate bitsandbytes==0.44.0 datasets einops flash-attn==2.6.1 -intel-extension-for-pytorch==2.6.0 +intel-extension-for-pytorch==2.3.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 numpy < 2.0 @@ -18,3 +18,4 @@ torchvision triton==2.3.0 tqdm transformers==4.45.0 +xformers
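
Several hunks above change the test imports from require_new_version to require_greater_than_050, but the matching definition in auto_round/testing_utils.py is not shown in this excerpt (it lands in an earlier patch of the series). A minimal sketch of one way such a version-gated skip decorator could be written, using unittest.skipUnless together with transformers' require_version, is given below; the helper name is_greater_than_050 and the exact message are assumptions, so the real definition may differ.

import unittest

from transformers.utils.versions import require_version


def is_greater_than_050():
    # Assumed helper: True only when auto-round >= 0.5.0 is installed.
    try:
        require_version("auto-round>=0.5.0")
        return True
    except ImportError:
        return False


def require_greater_than_050(test_case):
    """Skip the wrapped test unless auto-round >= 0.5.0 is available."""
    return unittest.skipUnless(
        is_greater_than_050(), "test requires auto-round>=0.5.0"
    )(test_case)

With a definition along these lines, decorated tests such as test_tritonv2_4bits_asym are reported as skipped rather than failed when an older auto-round is installed.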
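
The last two patches repin the VLM test stack in test/test_cuda/requirements_vlm.txt (torch 2.3.0, triton 2.3.0, transformers 4.45.0, plus xformers). A small optional check like the following can confirm the installed versions match those pins before running the CUDA suite; the snippet itself is not part of this patch, and the package subset was chosen only for illustration.

from importlib.metadata import PackageNotFoundError, version

# Pins taken from test/test_cuda/requirements_vlm.txt as updated in this series.
expected = {
    "torch": "2.3.0",
    "triton": "2.3.0",
    "transformers": "4.45.0",
}

for pkg, want in expected.items():
    try:
        have = version(pkg)
    except PackageNotFoundError:
        have = "not installed"
    status = "ok" if have == want else "mismatch"
    print(f"{pkg}: installed={have}, pinned={want} ({status})")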