From fff94acd516b23f804ef7ebb84c94eb724fb99e5 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Sun, 11 May 2025 21:21:18 -0400
Subject: [PATCH 1/7] refact cuda ut

Signed-off-by: n1ck-guo
---
 auto_round/testing_utils.py           | 174 ++++++++++++++++++++++++++
 test_cuda/requirements.txt            |  18 +++
 test_cuda/test_2_3bits.py             |   4 +
 test_cuda/test_auto_round_format.py   |  44 ++-----
 test_cuda/test_conv1d.py              |   3 +-
 test_cuda/test_exllamav2_backend.py   |   4 +
 test_cuda/test_export.py              |   7 +-
 test_cuda/test_gguf.py                | 100 +++++++++++++++
 test_cuda/test_gguf_format.py         | 131 -------------------
 test_cuda/test_main_func.py           |   5 +-
 test_cuda/test_multiple_card.py       |  19 ++-
 test_cuda/test_multiple_card_calib.py |  17 ++-
 test_cuda/test_qbits.py               |  14 ++-
 test_cuda/test_support_vlms.py        |  11 +-
 test_cuda/test_triton_backend.py      |  11 +-
 test_cuda/test_vlms.py                |  13 +-
 16 files changed, 383 insertions(+), 192 deletions(-)
 create mode 100644 auto_round/testing_utils.py
 create mode 100644 test_cuda/requirements.txt
 delete mode 100644 test_cuda/test_gguf_format.py

diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py
new file mode 100644
index 00000000..03d2e709
--- /dev/null
+++ b/auto_round/testing_utils.py
@@ -0,0 +1,174 @@
+import unittest
+import importlib.util
+
+import torch
+
+from transformers.utils.versions import require_version
+
+def is_gguf_available():
+    return importlib.util.find_spec("gguf") is not None
+
+def is_autogptq_available():
+    return importlib.util.find_spec("auto-gptq") is not None
+
+def is_awq_available():
+    return importlib.util.find_spec("autoawq") is not None
+
+def is_optimum_available():
+    return importlib.util.find_spec("optimum") is not None
+
+def is_ipex_available():
+    try:
+        require_version("intel-extension-for-pytorch>=2.5")
+        return True
+    except ImportError:
+        return False
+
+def is_itrex_available():
+    return importlib.util.find_spec("intel-extension-for-transformers") is not None
+
+def is_flash_attn_avaliable():
+    return importlib.util.find_spec("flash-attn") is not None
+
+def is_gptqmodel_available():
+    try:
+        require_version("gptqmodel>=2.0")
+        return True
+    except ImportError:
+        return False
+
+def is_new_version():
+    try:
+        require_version("auto-round>=0.5.0")
+        return True
+    except ImportError:
+        return False
+
+
+def require_gguf(test_case):
+    """
+    Decorator marking a test that requires gguf.
+
+    These tests are skipped when gguf isn't installed.
+
+    """
+    return unittest.skipUnless(is_gguf_available(), "test requires gguf")(test_case)
+
+
+def require_autogptq(test_case):
+    """
+    Decorator marking a test that requires auto-gptq.
+
+    These tests are skipped when auto-gptq isn't installed.
+
+    """
+    return unittest.skipUnless(is_autogptq_available(), "test requires auto-gptq")(test_case)
+
+
+def require_gptqmodel(test_case):
+    """
+    Decorator marking a test that requires gptqmodel.
+
+    These tests are skipped when gptqmodel isn't installed.
+
+    """
+    return unittest.skipUnless(is_autogptq_available(), "test requires gptqmodel>=2.0")(test_case)
+
+
+def require_awq(test_case):
+    """
+    Decorator marking a test that requires autoawq.
+
+    These tests are skipped when autoawq isn't installed.
+
+    """
+    return unittest.skipUnless(is_awq_available(), "test requires autoawq")(test_case)
+
+
+def require_ipex(test_case):
+    """
+    Decorator marking a test that requires intel-extension-for-pytorch.
+
+    These tests are skipped when intel-extension-for-pytorch isn't installed.
+
+    """
+    return unittest.skipUnless(is_ipex_available(), "test requires intel-extension-for-pytorch>=2.5")(test_case)
+
+
+def require_itrex(test_case):
+    """
+    Decorator marking a test that requires intel-extension-for-transformers.
+
+    These tests are skipped when intel-extension-for-transformers isn't installed.
+
+    """
+    return unittest.skipUnless(is_itrex_available(), "test requires intel-extension-for-transformers")(test_case)
+
+def require_optimum(test_case):
+    """
+    Decorator marking a test that requires optimum.
+
+    These tests are skipped when optimum isn't installed.
+
+    """
+    return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)
+
+
+def require_new_version(test_case):
+    """
+    Decorator marking a test that requires auto-round>=0.5.0.
+
+    These tests are skipped when auto-round<0.5.0.
+
+    """
+    return unittest.skipUnless(is_new_version(), "test requires auto-round>=0.5.0")(test_case)
+
+
+def multi_card(test_case):
+    """
+    Decorator marking a test that requires multiple cards.
+
+    These tests are skipped when only one card or the CPU is available.
+
+    """
+    return unittest.skipUnless(
+        torch.cuda.is_available() and torch.cuda.device_count() > 1, "test requires multiple cards.")(test_case)
+
+
+def require_old_version(test_case):
+    """
+    Decorator marking a test that requires an old version of transformers and torch.
+
+    These tests are skipped when the matching old versions are not installed.
+
+    """
+    env_check = True
+    try:
+        require_version("torch<2.7.0")
+        env_check &= True
+    except ImportError:
+        env_check &= False
+    return unittest.skipUnless(env_check, "Environment is not satisfactory")(test_case)
+
+
+def require_vlm_env(test_case):
+    """
+    Decorator marking a test that requires some special env to load vlm model.
+
+    These tests are skipped when not meet the environment requirments.
+
+    """
+
+    env_check = True
+    # pip install flash-attn --no-build-isolation
+    env_check &= is_flash_attn_avaliable()
+
+    # git clone https://github.com/haotian-liu/LLaVA.git && cd LLaVA && pip install -e .
+ env_check &= importlib.util.find_spec("llava") is not None + + return unittest.skipUnless(env_check, "Environment is not satisfactory")(test_case) + + + + + \ No newline at end of file diff --git a/test_cuda/requirements.txt b/test_cuda/requirements.txt new file mode 100644 index 00000000..00ae08a8 --- /dev/null +++ b/test_cuda/requirements.txt @@ -0,0 +1,18 @@ +accelerate +autoawq +auto-gptq +datasets +einops +gptqmodel>=2.0 +intel-extension-for-pytorch>=2.5 +intel-extension-for-transformers +lm-eval>=0.4.2,<0.5 +numpy < 2.0 +optimum +pandas +pillow +py-cpuinfo +torch +torchvision +tqdm +transformers diff --git a/test_cuda/test_2_3bits.py b/test_cuda/test_2_3bits.py index c595bc5f..82a84b12 100644 --- a/test_cuda/test_2_3bits.py +++ b/test_cuda/test_2_3bits.py @@ -12,6 +12,7 @@ from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate from lm_eval.utils import make_table # pylint: disable=E0401 +from auto_round.testing_utils import require_autogptq, require_new_version def get_accuracy(data): @@ -35,6 +36,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_autogptq def test_3bits_autoround(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -55,6 +57,7 @@ def test_3bits_autoround(self): assert accuracy > 0.3 shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_norm_bias_tuning(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -73,6 +76,7 @@ def test_norm_bias_tuning(self): assert accuracy > 0.18 shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_2bits_autoround(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_auto_round_format.py b/test_cuda/test_auto_round_format.py index 9cc2b0a0..7b995e36 100644 --- a/test_cuda/test_auto_round_format.py +++ b/test_cuda/test_auto_round_format.py @@ -5,6 +5,7 @@ sys.path.insert(0, "..") from auto_round.eval.evaluation import simple_evaluate_user_model +from auto_round.testing_utils import require_new_version, require_autogptq, require_awq, require_ipex import torch import transformers @@ -75,8 +76,9 @@ def tearDownClass(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_new_version def test_autoround_asym(self): - for bits in [2, 4, 8]: + for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, False @@ -94,7 +96,7 @@ def test_autoround_asym(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," @@ -104,6 +106,7 @@ def test_autoround_asym(self): assert ("!!!" 
not in res) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_autogptq def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -138,6 +141,7 @@ def test_mixed_precision(self): print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.32) + @require_awq def test_awq_backend(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -180,8 +184,9 @@ def test_awq_backend(self): self.model_infer(model, tokenizer) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_new_version def test_tritonv2_bf16(self): - model_name = "/data5/wenhuach/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( model_name, @@ -195,6 +200,7 @@ def test_tritonv2_bf16(self): torch.cuda.empty_cache() + @require_ipex def test_autoround_gptq_sym_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -244,6 +250,7 @@ def test_autoround_gptq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) + @require_awq def test_autoround_awq_sym_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -281,6 +288,7 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_autoround_sym(self): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -310,6 +318,7 @@ def test_autoround_sym(self): assert ("!!!" not in res) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_new_version def test_load_gptq_model_3bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() @@ -319,35 +328,6 @@ def test_load_gptq_model_3bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - def test_autoround_asym(self): - for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - bits, group_size, sym = bits, 128, False - autoround = AutoRound( - model, - tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_folder - - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", - trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) - print(res) - assert ("!!!" 
not in res) - shutil.rmtree(self.save_folder, ignore_errors=True) - if __name__ == "__main__": unittest.main() diff --git a/test_cuda/test_conv1d.py b/test_cuda/test_conv1d.py index 941e1324..79dad6d6 100644 --- a/test_cuda/test_conv1d.py +++ b/test_cuda/test_conv1d.py @@ -8,6 +8,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from auto_round.testing_utils import require_gptqmodel from _test_helpers import model_infer class LLMDataLoader: def __init__(self): @@ -30,7 +31,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_quant(self): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True diff --git a/test_cuda/test_exllamav2_backend.py b/test_cuda/test_exllamav2_backend.py index 66846c68..de243219 100644 --- a/test_cuda/test_exllamav2_backend.py +++ b/test_cuda/test_exllamav2_backend.py @@ -11,6 +11,7 @@ from auto_round import AutoRound from auto_round import AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from auto_round.testing_utils import require_autogptq, require_gptqmodel class LLMDataLoader: @@ -63,6 +64,7 @@ def tearDownClass(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_gptqmodel def test_gptqmodel_exllmav2_4bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -110,6 +112,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + @require_autogptq def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -143,6 +146,7 @@ def test_gptq_exllamav2_4bits_sym(self): torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) + @require_autogptq def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") diff --git a/test_cuda/test_export.py b/test_cuda/test_export.py index db68fa0d..3ac32e6c 100644 --- a/test_cuda/test_export.py +++ b/test_cuda/test_export.py @@ -9,6 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from auto_round.testing_utils import require_awq, require_optimum class LLMDataLoader: @@ -32,6 +33,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_optimum def test_autogptq_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -60,6 +62,7 @@ def test_autogptq_format(self): "she is a good friend of mine, she is") shutil.rmtree("./saved", ignore_errors=True) + @require_optimum def test_autogptq_format_fp_layers(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -173,8 +176,7 @@ def 
test_autoround_format(self): "she is a great artist, she is a great artist, she is a great artist, she is") shutil.rmtree("./saved", ignore_errors=True) - - # + @require_awq def test_autoawq_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -205,6 +207,7 @@ def test_autoawq_format(self): "I just think it's funny that people are downvoting") shutil.rmtree("./saved", ignore_errors=True) + @require_optimum def test_autoawq_format_fp_qsave_layers(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, diff --git a/test_cuda/test_gguf.py b/test_cuda/test_gguf.py index 3d75967c..f31de413 100644 --- a/test_cuda/test_gguf.py +++ b/test_cuda/test_gguf.py @@ -9,6 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from auto_round.testing_utils import require_gguf class LLMDataLoader: def __init__(self): @@ -32,6 +33,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_gguf def test_gguf_format(self): bits, group_size, sym = 4, 32, False autoround = AutoRound( @@ -70,6 +72,104 @@ def test_gguf_format(self): output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) + @require_gguf + def test_q2_k_export(self): + bits, group_size, sym = 2, 16, False + model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + autoround = AutoRound( + model, + tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=1, + seqlen=1, + dataset=self.llm_dataloader, + data_type="int_asym_dq" + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q2_k_s") + gguf_file = "Qwen2.5-1.5B-Instruct-1.5B-Q2_K_S.gguf" + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") + text = "There is a girl who likes adventure," + inputs = self.tokenizer(text, return_tensors="pt").to(model.device) + result = self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) + print(result) + + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") + self.assertGreater(result['results']['piqa']['acc,none'], 0.45) + + shutil.rmtree("./saved", ignore_errors=True) + + @require_gguf + def test_basic_usage(self): + python_path = sys.executable + res = os.system( + f"cd .. 
&& {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" + f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" + ) + if res > 0 or res == -1: + assert False, "cmd line test fail, please have a check" + shutil.rmtree("./saved", ignore_errors=True) + + @require_gguf + def test_q4_0(self): + bits, group_size, sym = 4, 32, True + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=1, + data_type="int" + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") + gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_0.gguf" + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") + text = "There is a girl who likes adventure," + inputs = self.tokenizer(text, return_tensors="pt").to(model.device) + print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) + + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") + self.assertGreater(result['results']['piqa']['acc,none'], 0.55) + shutil.rmtree("./saved", ignore_errors=True) + @require_gguf + def test_q4_1(self): + bits, group_size, sym = 4, 32, False + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=1, + data_type="int" + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") + gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_1.gguf" + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") + text = "There is a girl who likes adventure," + inputs = self.tokenizer(text, return_tensors="pt").to(model.device) + print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) + + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") + self.assertGreater(result['results']['piqa']['acc,none'], 0.55) + shutil.rmtree("./saved", ignore_errors=True) + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/test_cuda/test_gguf_format.py b/test_cuda/test_gguf_format.py deleted file mode 100644 index 8e0bc9d5..00000000 --- a/test_cuda/test_gguf_format.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import sys -import unittest -import shutil -sys.path.insert(0, "..") - -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from auto_round import AutoRound - - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestGGUF(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "Qwen/Qwen2.5-0.5B-Instruct" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_q2_k_export(self): - bits, group_size, sym = 2, 16, False - model_name = "Qwen/Qwen2.5-1.5B-Instruct" - 
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - autoround = AutoRound( - model, - tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=1, - seqlen=1, - dataset=self.llm_dataloader, - data_type="int_asym_dq" - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q2_k_s") - gguf_file = "Qwen2.5-1.5B-Instruct-1.5B-Q2_K_S.gguf" - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - text = "There is a girl who likes adventure," - inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - result = self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) - print(result) - - from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result['results']['piqa']['acc,none'], 0.45) - - shutil.rmtree("./saved", ignore_errors=True) - - def test_basic_usage(self): - python_path = sys.executable - res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" - f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("./saved", ignore_errors=True) - - def test_q4_0(self): - bits, group_size, sym = 4, 32, True - autoround = AutoRound( - self.model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=1, - data_type="int" - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") - gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_0.gguf" - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - text = "There is a girl who likes adventure," - inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - - from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result['results']['piqa']['acc,none'], 0.55) - shutil.rmtree("./saved", ignore_errors=True) - - def test_q4_1(self): - bits, group_size, sym = 4, 32, False - autoround = AutoRound( - self.model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=1, - data_type="int" - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") - gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_1.gguf" - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - text = "There is a girl who likes adventure," - inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - - from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result['results']['piqa']['acc,none'], 0.55) - shutil.rmtree("./saved", ignore_errors=True) - -if 
__name__ == "__main__": - unittest.main() diff --git a/test_cuda/test_main_func.py b/test_cuda/test_main_func.py index 0b1c2b73..f879fbe8 100644 --- a/test_cuda/test_main_func.py +++ b/test_cuda/test_main_func.py @@ -11,6 +11,7 @@ from auto_round import AutoRound, AutoRoundAdam from auto_round.eval.evaluation import simple_evaluate +from auto_round.testing_utils import require_gptqmodel from lm_eval.utils import make_table # pylint: disable=E0401 @@ -35,6 +36,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_gptqmodel def test_backend(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -76,6 +78,7 @@ def test_backend(self): shutil.rmtree("./saved", ignore_errors=True) @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @require_gptqmodel def test_fp_layers(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -119,7 +122,7 @@ def test_undivided_group_size_tuning(self): autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) autoround.quantize() - + @require_gptqmodel def test_adam(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_multiple_card.py b/test_cuda/test_multiple_card.py index 2adffbaa..d908606a 100644 --- a/test_cuda/test_multiple_card.py +++ b/test_cuda/test_multiple_card.py @@ -10,6 +10,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate +from auto_round.testing_utils import multi_card, require_new_version, require_gptqmodel def get_accuracy(data): @@ -35,6 +36,7 @@ def tearDownClass(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @multi_card def test_device_map(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -43,6 +45,8 @@ def test_device_map(self): autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7,seqlen=32) autoround.quantize() + @multi_card + @require_gptqmodel def test_device_map_str(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -61,6 +65,7 @@ def test_device_map_str(self): assert accuracy > 0.45 ##0.4786 shutil.rmtree("./saved", ignore_errors=True) + @multi_card def test_layer_norm(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -70,7 +75,7 @@ def test_layer_norm(self): enable_norm_bias_tuning=True) autoround.quantize() - + @multi_card def test_rms_norm(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -80,6 +85,7 @@ def test_rms_norm(self): enable_norm_bias_tuning=True) autoround.quantize() + @multi_card def test_act_quantization(self): model_name = "/models/Qwen2-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -88,6 +94,7 @@ def test_act_quantization(self): autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, 
nsamples=7,seqlen=32,act_bits=4,act_dynamic=False) autoround.quantize() + @multi_card def test_lm_head(self): model_name = "/models/Qwen2.5-7B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) @@ -98,9 +105,10 @@ def test_lm_head(self): enable_norm_bias_tuning=True,layer_config=layer_config) autoround.quantize() + @multi_card def test_device_map(self): from transformers import AutoModelForCausalLM, AutoTokenizer - model_name = "/data5/wenhuach/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" device_map = {} for i in range(0, 32): @@ -192,9 +200,11 @@ def test_device_map(self): del model torch.cuda.empty_cache() + @multi_card + @require_new_version def test_device_map_for_triton(self): from transformers import AutoModelForCausalLM, AutoTokenizer - model_name = "/data5/wenhuach/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" device_map = {} for i in range(0, 32): @@ -289,4 +299,5 @@ def test_device_map_for_triton(self): del model torch.cuda.empty_cache() - +if __name__ == "__main__": + unittest.main() diff --git a/test_cuda/test_multiple_card_calib.py b/test_cuda/test_multiple_card_calib.py index a56fe1b2..2d7ff712 100644 --- a/test_cuda/test_multiple_card_calib.py +++ b/test_cuda/test_multiple_card_calib.py @@ -1,18 +1,12 @@ -import copy -import shutil +import os +import re import sys +import shutil import unittest -import re sys.path.insert(0, "..") -import torch -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round.eval.evaluation import simple_evaluate -from lm_eval.utils import make_table # pylint: disable=E0401 -import os +from auto_round.testing_utils import multi_card def get_accuracy(data): match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) @@ -35,12 +29,15 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @multi_card def test_multiple_card_calib(self): python_path = sys.executable ##test llm script res = os.system( f"cd .. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") + if res > 0 or res == -1: + assert False, "cmd line test fail, please have a check" if __name__ == "__main__": diff --git a/test_cuda/test_qbits.py b/test_cuda/test_qbits.py index 3f1f1d3f..d39f0562 100644 --- a/test_cuda/test_qbits.py +++ b/test_cuda/test_qbits.py @@ -7,9 +7,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRoundConfig, AutoRound +from auto_round.testing_utils import require_ipex, require_itrex, require_gptqmodel, require_old_version class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "/models/opt-125m" + self.save_folder = "./saved" + def model_infer(self, model, tokenizer): prompts = [ "Hello,my name is", @@ -43,6 +49,8 @@ def tearDownClass(self): ## require torch 2.6 + @require_itrex + @require_old_version def test_load_gptq_model_8bits(self): model_name = "acloudfan/opt-125m-gptq-8bit" quantization_config = AutoRoundConfig() @@ -52,7 +60,8 @@ def test_load_gptq_model_8bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - + @require_itrex + @require_old_version def test_load_gptq_model_2bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() @@ -62,7 +71,7 @@ def test_load_gptq_model_2bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - + @require_ipex def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -99,6 +108,7 @@ def test_mixed_precision(self): assert ("!!!" 
not in res) shutil.rmtree(self.save_folder, ignore_errors=True) + @require_gptqmodel def test_autoround_sym(self): for bits in [4]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) diff --git a/test_cuda/test_support_vlms.py b/test_cuda/test_support_vlms.py index 4b76314d..008ca0d4 100644 --- a/test_cuda/test_support_vlms.py +++ b/test_cuda/test_support_vlms.py @@ -6,6 +6,7 @@ sys.path.insert(0, '..') from auto_round import AutoRoundConfig ## must import for auto-round format +from auto_round.testing_utils import require_gptqmodel, require_vlm_env import requests from PIL import Image @@ -13,7 +14,8 @@ class TestSupportVLMS(unittest.TestCase): @classmethod def setUpClass(self): - self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved") + # self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved") + self.save_dir = os.path.join(os.path.dirname("/data5/hengguo"), "ut_saved") self.python_path = sys.executable self.device = 0 @@ -21,6 +23,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree(self.save_dir, ignore_errors=True) + @require_gptqmodel def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune @@ -76,6 +79,7 @@ def test_qwen2(self): print(output_text[0]) shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_vlm_env def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune @@ -132,7 +136,8 @@ def test_phi3(self): clean_up_tokenization_spaces=False)[0] print(response) shutil.rmtree(quantized_model_path, ignore_errors=True) - + + @require_vlm_env def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune @@ -195,6 +200,7 @@ def test_phi3_vision_awq(self): print(response) shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_vlm_env def test_llava(self): model_path = "/models/llava-v1.5-7b/" ## test tune @@ -231,6 +237,7 @@ class DataArgs: print(tokenizer.batch_decode(output)) shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_gptqmodel def test_llama(self): model_path = "/models/Llama-3.2-11B-Vision-Instruct/" ## test tune diff --git a/test_cuda/test_triton_backend.py b/test_cuda/test_triton_backend.py index f87884f7..2db292b6 100644 --- a/test_cuda/test_triton_backend.py +++ b/test_cuda/test_triton_backend.py @@ -9,8 +9,8 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round import AutoRoundConfig +from auto_round import AutoRound, AutoRoundConfig +from auto_round.testing_utils import require_new_version class LLMDataLoader: @@ -62,6 +62,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + @require_new_version def test_tritonv2_4bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -109,6 +110,7 @@ def test_tritonv2_4bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_tritonv2_2bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -153,6 +155,7 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + 
@require_new_version def test_tritonv2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -199,8 +202,9 @@ def test_tritonv2_4bits_sym(self): self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.26) torch.cuda.empty_cache() - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_tritonv2_8bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -247,6 +251,7 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) + @require_new_version def test_tritonv2_2bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) diff --git a/test_cuda/test_vlms.py b/test_cuda/test_vlms.py index c32a061a..9faddb05 100644 --- a/test_cuda/test_vlms.py +++ b/test_cuda/test_vlms.py @@ -1,15 +1,16 @@ +import re +import os +import sys import copy import shutil -import sys import unittest -import re -import os +import requests sys.path.insert(0, "..") from PIL import Image from auto_round import AutoRoundConfig -import requests +from auto_round.testing_utils import require_gptqmodel, require_vlm_env class TestAutoRound(unittest.TestCase): @@ -87,6 +88,7 @@ def qwen_inference(self, quantized_model_dir): ) print(output_text[0]) + @require_gptqmodel def test_vlm_tune(self): from auto_round import AutoRoundMLLM from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer @@ -161,6 +163,7 @@ def phi3_infernece(self, quantized_model_dir): print(response) + @require_vlm_env def test_quant_not_text(self): from auto_round import AutoRoundMLLM from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer @@ -183,6 +186,7 @@ def test_quant_not_text(self): self.phi3_infernece("./saved") shutil.rmtree("./saved", ignore_errors=True) + @require_vlm_env def test_quant_not_text_fp_layers(self): import os python_path = sys.executable @@ -194,6 +198,7 @@ def test_quant_not_text_fp_layers(self): self.phi3_infernece(absolute_path) shutil.rmtree(absolute_path, ignore_errors=True) + @require_vlm_env def test_mm_block_name(self): from auto_round.utils import get_block_names From 62415700126ccc3610844102d69b7cdc7c5ba0c3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 01:22:49 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/testing_utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py index 03d2e709..b6cb3461 100644 --- a/auto_round/testing_utils.py +++ b/auto_round/testing_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import unittest import importlib.util @@ -155,7 +169,7 @@ def require_vlm_env(test_case): """ Decorator marking a test that requires some special env to load vlm model. - These tests are skipped when not meet the environment requirments. + These tests are skipped when not meet the environment requirements. """ From ffcba1f5ba5b1c2ac862c515eaf68f5f01714581 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 12 May 2025 22:55:33 -0400 Subject: [PATCH 3/7] update Signed-off-by: n1ck-guo --- auto_round/script/llm.py | 10 +- auto_round/testing_utils.py | 23 +- docs/step_by_step.md | 2 +- test/{ => test_cpu}/_test_helpers.py | 0 test/{ => test_cpu}/conftest.py | 0 test/{ => test_cpu}/requirements.txt | 0 test/{ => test_cpu}/test_act_quantization.py | 0 .../test_auto_round_hpu_only.py | 0 test/{ => test_cpu}/test_autoopt.py | 0 test/{ => test_cpu}/test_autoround.py | 0 test/{ => test_cpu}/test_autoround_acc.py | 0 .../test_autoround_export_to_itrex.py | 0 test/{ => test_cpu}/test_basic_usage.py | 0 test/{ => test_cpu}/test_block_names.py | 0 test/{ => test_cpu}/test_calib_dataset.py | 0 test/{ => test_cpu}/test_conv1d.py | 0 test/{ => test_cpu}/test_export.py | 0 test/{ => test_cpu}/test_generation.py | 0 test/{ => test_cpu}/test_gguf_format.py | 0 test/{ => test_cpu}/test_hpu.py | 0 test/{ => test_cpu}/test_load_awq_gptq.py | 0 test/{ => test_cpu}/test_low_cpu_mem.py | 0 test/{ => test_cpu}/test_mllm.py | 0 test/test_cpu/test_script.py | 18 ++ test/{ => test_cpu}/test_utils.py | 0 test/{ => test_cpu}/test_woq_linear.py | 0 .../test_cuda}/_test_helpers.py | 0 .../test_cuda}/requirements.txt | 4 +- test/test_cuda/requirements_vlm.txt | 23 ++ {test_cuda => test/test_cuda}/test_2_3bits.py | 6 +- .../test_cuda}/test_auto_round_format.py | 11 +- .../test_cuda}/test_calib_dataset.py | 0 {test_cuda => test/test_cuda}/test_conv1d.py | 0 .../test_cuda}/test_exllamav2_backend.py | 0 {test_cuda => test/test_cuda}/test_export.py | 5 +- .../test_cuda}/test_get_block_name.py | 0 {test_cuda => test/test_cuda}/test_gguf.py | 0 .../test_cuda}/test_main_func.py | 5 +- .../test_cuda}/test_marlin_backend.py | 0 .../test_cuda}/test_multiple_card.py | 4 +- .../test_cuda}/test_multiple_card_calib.py | 0 {test_cuda => test/test_cuda}/test_qbits.py | 6 +- .../test_cuda}/test_support_vlms.py | 200 +++++++++--------- .../test_cuda}/test_triton_backend.py | 12 +- {test_cuda => test/test_cuda}/test_vlms.py | 3 +- {test_xpu => test/test_xpu}/test_autoround.py | 0 46 files changed, 192 insertions(+), 140 deletions(-) rename test/{ => test_cpu}/_test_helpers.py (100%) rename test/{ => test_cpu}/conftest.py (100%) rename test/{ => test_cpu}/requirements.txt (100%) rename test/{ => test_cpu}/test_act_quantization.py (100%) rename test/{ => test_cpu}/test_auto_round_hpu_only.py (100%) rename test/{ => test_cpu}/test_autoopt.py (100%) rename test/{ => test_cpu}/test_autoround.py (100%) rename test/{ => test_cpu}/test_autoround_acc.py (100%) rename test/{ => test_cpu}/test_autoround_export_to_itrex.py (100%) rename test/{ => test_cpu}/test_basic_usage.py (100%) rename test/{ => test_cpu}/test_block_names.py (100%) rename 
test/{ => test_cpu}/test_calib_dataset.py (100%) rename test/{ => test_cpu}/test_conv1d.py (100%) rename test/{ => test_cpu}/test_export.py (100%) rename test/{ => test_cpu}/test_generation.py (100%) rename test/{ => test_cpu}/test_gguf_format.py (100%) rename test/{ => test_cpu}/test_hpu.py (100%) rename test/{ => test_cpu}/test_load_awq_gptq.py (100%) rename test/{ => test_cpu}/test_low_cpu_mem.py (100%) rename test/{ => test_cpu}/test_mllm.py (100%) create mode 100644 test/test_cpu/test_script.py rename test/{ => test_cpu}/test_utils.py (100%) rename test/{ => test_cpu}/test_woq_linear.py (100%) rename {test_cuda => test/test_cuda}/_test_helpers.py (100%) rename {test_cuda => test/test_cuda}/requirements.txt (80%) create mode 100644 test/test_cuda/requirements_vlm.txt rename {test_cuda => test/test_cuda}/test_2_3bits.py (96%) rename {test_cuda => test/test_cuda}/test_auto_round_format.py (98%) rename {test_cuda => test/test_cuda}/test_calib_dataset.py (100%) rename {test_cuda => test/test_cuda}/test_conv1d.py (100%) rename {test_cuda => test/test_cuda}/test_exllamav2_backend.py (100%) rename {test_cuda => test/test_cuda}/test_export.py (98%) rename {test_cuda => test/test_cuda}/test_get_block_name.py (100%) rename {test_cuda => test/test_cuda}/test_gguf.py (100%) rename {test_cuda => test/test_cuda}/test_main_func.py (97%) rename {test_cuda => test/test_cuda}/test_marlin_backend.py (100%) rename {test_cuda => test/test_cuda}/test_multiple_card.py (98%) rename {test_cuda => test/test_cuda}/test_multiple_card_calib.py (100%) rename {test_cuda => test/test_cuda}/test_qbits.py (96%) rename {test_cuda => test/test_cuda}/test_support_vlms.py (73%) rename {test_cuda => test/test_cuda}/test_triton_backend.py (98%) rename {test_cuda => test/test_cuda}/test_vlms.py (99%) rename {test_xpu => test/test_xpu}/test_autoround.py (100%) diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 5a9594f6..01504914 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -370,7 +370,7 @@ def tune(args): if args.enable_torch_compile: logger.info("`torch.compile` is enabled to reduce tuning costs. 
" - "If it causes issues, you can disable it by remove `--enable_torch_compile` argument.") + "If it causes issues, you can disable it by removing `--enable_torch_compile` argument.") model_name = args.model if model_name[-1] == "/": @@ -582,7 +582,7 @@ def tune(args): device=device_str, eval_model_dtype=eval_model_dtype) print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) else: if args.eval_task_by_task: eval_task_by_task( @@ -599,7 +599,7 @@ def tune(args): res = simple_evaluate( model="hf", model_args=model_args, tasks=tasks, device=device_str, batch_size=args.eval_bs) print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype="auto"): @@ -657,14 +657,14 @@ def eval(args): res = simple_evaluate_user_model( model, tokenizer, tasks=tasks, batch_size=batch_size, device=device_str) print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) else: st = time.time() res = simple_evaluate( model="hf", model_args=model_args, tasks=tasks, device=device_str, batch_size=args.eval_bs) from lm_eval.utils import make_table # pylint: disable=E0401 print(make_table(res)) - print("evaluation running time=", time.time() - st) + print("evaluation running time=%ds" % (time.time() - st)) def eval_task_by_task( diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py index b6cb3461..b1ace9a4 100644 --- a/auto_round/testing_utils.py +++ b/auto_round/testing_utils.py @@ -23,10 +23,10 @@ def is_gguf_available(): return importlib.util.find_spec("gguf") is not None def is_autogptq_available(): - return importlib.util.find_spec("auto-gptq") is not None + return importlib.util.find_spec("auto_gptq") is not None def is_awq_available(): - return importlib.util.find_spec("autoawq") is not None + return importlib.util.find_spec("awq") is not None def is_optimum_available(): return importlib.util.find_spec("optimum") is not None @@ -39,10 +39,10 @@ def is_ipex_available(): return False def is_itrex_available(): - return importlib.util.find_spec("intel-extension-for-transformers") is not None + return importlib.util.find_spec("intel_extension_for_transformers") is not None def is_flash_attn_avaliable(): - return importlib.util.find_spec("flash-attn") is not None + return importlib.util.find_spec("flash_attn") is not None def is_gptqmodel_available(): try: @@ -51,7 +51,7 @@ def is_gptqmodel_available(): except ImportError: return False -def is_new_version(): +def greater_than_050(): try: require_version("auto-round>=0.5.0") return True @@ -86,7 +86,7 @@ def require_gptqmodel(test_case): These tests are skipped when gptqmodel isn't installed. """ - return unittest.skipUnless(is_autogptq_available(), "test requires gptqmodel>=2.0")(test_case) + return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel>=2.0")(test_case) def require_awq(test_case): @@ -128,14 +128,14 @@ def require_optimum(test_case): return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case) -def require_new_version(test_case): +def require_greater_than_050(test_case): """ Decorator marking a test that requires auto-round>=0.5.0. These tests are skipped when auto-round<0.5.0. 
""" - return unittest.skipUnless(is_new_version(), "test requires auto-round>=0.5.0")(test_case) + return unittest.skipUnless(greater_than_050(), "test requires auto-round>=0.5.0")(test_case) def multi_card(test_case): @@ -177,9 +177,14 @@ def require_vlm_env(test_case): # pip install flash-attn --no-build-isolation env_check &= is_flash_attn_avaliable() - # git clone https://github.com/haotian-liu/LLaVA.git && cd LLaVA && pip install -e . + # pip install git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 env_check &= importlib.util.find_spec("llava") is not None + # pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git + env_check &= importlib.util.find_spec("deepseek_vl2") is not None + + env_check &= importlib.util.find_spec("xformers") is not None + return unittest.skipUnless(env_check, "Environment is not satisfactory")(test_case) diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 2be87282..eb981bb3 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -101,7 +101,7 @@ CPU, Intel GPU, HPU,and CUDA for both quantization and inference. This setting provides the best accuracy in most scenarios but is 4–5× slower than the standard AutoRound recipe. It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available. ```bash - auto-round-best --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round" + auto-round-best --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round" ``` - **Light Settings:** diff --git a/test/_test_helpers.py b/test/test_cpu/_test_helpers.py similarity index 100% rename from test/_test_helpers.py rename to test/test_cpu/_test_helpers.py diff --git a/test/conftest.py b/test/test_cpu/conftest.py similarity index 100% rename from test/conftest.py rename to test/test_cpu/conftest.py diff --git a/test/requirements.txt b/test/test_cpu/requirements.txt similarity index 100% rename from test/requirements.txt rename to test/test_cpu/requirements.txt diff --git a/test/test_act_quantization.py b/test/test_cpu/test_act_quantization.py similarity index 100% rename from test/test_act_quantization.py rename to test/test_cpu/test_act_quantization.py diff --git a/test/test_auto_round_hpu_only.py b/test/test_cpu/test_auto_round_hpu_only.py similarity index 100% rename from test/test_auto_round_hpu_only.py rename to test/test_cpu/test_auto_round_hpu_only.py diff --git a/test/test_autoopt.py b/test/test_cpu/test_autoopt.py similarity index 100% rename from test/test_autoopt.py rename to test/test_cpu/test_autoopt.py diff --git a/test/test_autoround.py b/test/test_cpu/test_autoround.py similarity index 100% rename from test/test_autoround.py rename to test/test_cpu/test_autoround.py diff --git a/test/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py similarity index 100% rename from test/test_autoround_acc.py rename to test/test_cpu/test_autoround_acc.py diff --git a/test/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py similarity index 100% rename from test/test_autoround_export_to_itrex.py rename to test/test_cpu/test_autoround_export_to_itrex.py diff --git a/test/test_basic_usage.py b/test/test_cpu/test_basic_usage.py similarity index 100% rename from test/test_basic_usage.py rename to test/test_cpu/test_basic_usage.py diff --git a/test/test_block_names.py b/test/test_cpu/test_block_names.py similarity index 100% rename from test/test_block_names.py rename to test/test_cpu/test_block_names.py 
diff --git a/test/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py similarity index 100% rename from test/test_calib_dataset.py rename to test/test_cpu/test_calib_dataset.py diff --git a/test/test_conv1d.py b/test/test_cpu/test_conv1d.py similarity index 100% rename from test/test_conv1d.py rename to test/test_cpu/test_conv1d.py diff --git a/test/test_export.py b/test/test_cpu/test_export.py similarity index 100% rename from test/test_export.py rename to test/test_cpu/test_export.py diff --git a/test/test_generation.py b/test/test_cpu/test_generation.py similarity index 100% rename from test/test_generation.py rename to test/test_cpu/test_generation.py diff --git a/test/test_gguf_format.py b/test/test_cpu/test_gguf_format.py similarity index 100% rename from test/test_gguf_format.py rename to test/test_cpu/test_gguf_format.py diff --git a/test/test_hpu.py b/test/test_cpu/test_hpu.py similarity index 100% rename from test/test_hpu.py rename to test/test_cpu/test_hpu.py diff --git a/test/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py similarity index 100% rename from test/test_load_awq_gptq.py rename to test/test_cpu/test_load_awq_gptq.py diff --git a/test/test_low_cpu_mem.py b/test/test_cpu/test_low_cpu_mem.py similarity index 100% rename from test/test_low_cpu_mem.py rename to test/test_cpu/test_low_cpu_mem.py diff --git a/test/test_mllm.py b/test/test_cpu/test_mllm.py similarity index 100% rename from test/test_mllm.py rename to test/test_cpu/test_mllm.py diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py new file mode 100644 index 00000000..32662134 --- /dev/null +++ b/test/test_cpu/test_script.py @@ -0,0 +1,18 @@ +import os +import sys +import unittest + +sys.path.insert(0, "..") + + +class TestScript(unittest.TestCase): + def test_default(self): + os.system(''' + cd .. && + python -m auto_round + --iters 2 + --deployment_device fake + --output_dir ./tmp_script_test''') + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/test_utils.py b/test/test_cpu/test_utils.py similarity index 100% rename from test/test_utils.py rename to test/test_cpu/test_utils.py diff --git a/test/test_woq_linear.py b/test/test_cpu/test_woq_linear.py similarity index 100% rename from test/test_woq_linear.py rename to test/test_cpu/test_woq_linear.py diff --git a/test_cuda/_test_helpers.py b/test/test_cuda/_test_helpers.py similarity index 100% rename from test_cuda/_test_helpers.py rename to test/test_cuda/_test_helpers.py diff --git a/test_cuda/requirements.txt b/test/test_cuda/requirements.txt similarity index 80% rename from test_cuda/requirements.txt rename to test/test_cuda/requirements.txt index 00ae08a8..fb008694 100644 --- a/test_cuda/requirements.txt +++ b/test/test_cuda/requirements.txt @@ -4,7 +4,7 @@ auto-gptq datasets einops gptqmodel>=2.0 -intel-extension-for-pytorch>=2.5 +intel-extension-for-pytorch==2.6.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 numpy < 2.0 @@ -12,7 +12,7 @@ optimum pandas pillow py-cpuinfo -torch +torch<2.7.0 torchvision tqdm transformers diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt new file mode 100644 index 00000000..46d8220d --- /dev/null +++ b/test/test_cuda/requirements_vlm.txt @@ -0,0 +1,23 @@ +# git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 +# git clone https://github.com/deepseek-ai/DeepSeek-VL2.git && cd DeepSeek-VL2 && pip install -e . 
+accelerate +autoawq +auto-gptq +bitsandbytes==0.44.0 +datasets +einops +flash-attn==2.6.1 +gptqmodel>=2.0 +intel-extension-for-pytorch==2.6.0 +intel-extension-for-transformers +lm-eval>=0.4.2,<0.5 +numpy < 2.0 +optimum +pandas +pillow +py-cpuinfo +torch==2.0.1 +torchvision==0.15.2 +triton==2.0.0 +tqdm +transformers==4.45.0 diff --git a/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py similarity index 96% rename from test_cuda/test_2_3bits.py rename to test/test_cuda/test_2_3bits.py index 82a84b12..5f49c254 100644 --- a/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -12,7 +12,7 @@ from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate from lm_eval.utils import make_table # pylint: disable=E0401 -from auto_round.testing_utils import require_autogptq, require_new_version +from auto_round.testing_utils import require_autogptq, require_greater_than_050 def get_accuracy(data): @@ -57,7 +57,7 @@ def test_3bits_autoround(self): assert accuracy > 0.3 shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_norm_bias_tuning(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -76,7 +76,7 @@ def test_norm_bias_tuning(self): assert accuracy > 0.18 shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_2bits_autoround(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py similarity index 98% rename from test_cuda/test_auto_round_format.py rename to test/test_cuda/test_auto_round_format.py index 7b995e36..5fabe1bd 100644 --- a/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -5,7 +5,7 @@ sys.path.insert(0, "..") from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_new_version, require_autogptq, require_awq, require_ipex +from auto_round.testing_utils import require_greater_than_050, require_autogptq, require_awq, require_ipex import torch import transformers @@ -76,7 +76,7 @@ def tearDownClass(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_autoround_asym(self): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -184,7 +184,7 @@ def test_awq_backend(self): self.model_infer(model, tokenizer) shutil.rmtree(self.save_folder, ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_bf16(self): model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="tritonv2") @@ -251,6 +251,7 @@ def test_autoround_gptq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_awq + @require_ipex def test_autoround_awq_sym_format(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -288,7 +289,7 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_autoround_sym(self): for bits in 
[2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -318,7 +319,7 @@ def test_autoround_sym(self): assert ("!!!" not in res) shutil.rmtree(self.save_folder, ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_load_gptq_model_3bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() diff --git a/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py similarity index 100% rename from test_cuda/test_calib_dataset.py rename to test/test_cuda/test_calib_dataset.py diff --git a/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py similarity index 100% rename from test_cuda/test_conv1d.py rename to test/test_cuda/test_conv1d.py diff --git a/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py similarity index 100% rename from test_cuda/test_exllamav2_backend.py rename to test/test_cuda/test_exllamav2_backend.py diff --git a/test_cuda/test_export.py b/test/test_cuda/test_export.py similarity index 98% rename from test_cuda/test_export.py rename to test/test_cuda/test_export.py index 3ac32e6c..62e75225 100644 --- a/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -208,6 +208,7 @@ def test_autoawq_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_optimum + @require_awq def test_autoawq_format_fp_qsave_layers(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, @@ -225,8 +226,8 @@ def test_autoawq_format_fp_qsave_layers(self): dataset=self.llm_dataloader, layer_config=layer_config ) - quantized_model_path = "/data5/wenhuach/test_export" - autoround.qsave(output_dir=quantized_model_path, + quantized_model_path = "./saved/test_export" + autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") from auto_round import AutoRoundConfig model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", diff --git a/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py similarity index 100% rename from test_cuda/test_get_block_name.py rename to test/test_cuda/test_get_block_name.py diff --git a/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py similarity index 100% rename from test_cuda/test_gguf.py rename to test/test_cuda/test_gguf.py diff --git a/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py similarity index 97% rename from test_cuda/test_main_func.py rename to test/test_cuda/test_main_func.py index f879fbe8..be07692b 100644 --- a/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -11,7 +11,7 @@ from auto_round import AutoRound, AutoRoundAdam from auto_round.eval.evaluation import simple_evaluate -from auto_round.testing_utils import require_gptqmodel +from auto_round.testing_utils import require_gptqmodel, require_optimum, require_awq from lm_eval.utils import make_table # pylint: disable=E0401 @@ -37,6 +37,8 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel + @require_optimum + @require_awq def test_backend(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") @@ -79,6 +81,7 @@ def test_backend(self): @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") @require_gptqmodel + @require_awq def 
test_fp_layers(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py similarity index 100% rename from test_cuda/test_marlin_backend.py rename to test/test_cuda/test_marlin_backend.py diff --git a/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py similarity index 98% rename from test_cuda/test_multiple_card.py rename to test/test_cuda/test_multiple_card.py index d908606a..8c0e5e7f 100644 --- a/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -10,7 +10,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate -from auto_round.testing_utils import multi_card, require_new_version, require_gptqmodel +from auto_round.testing_utils import multi_card, require_greater_than_050, require_gptqmodel def get_accuracy(data): @@ -201,7 +201,7 @@ def test_device_map(self): torch.cuda.empty_cache() @multi_card - @require_new_version + @require_greater_than_050 def test_device_map_for_triton(self): from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" diff --git a/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py similarity index 100% rename from test_cuda/test_multiple_card_calib.py rename to test/test_cuda/test_multiple_card_calib.py diff --git a/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py similarity index 96% rename from test_cuda/test_qbits.py rename to test/test_cuda/test_qbits.py index d39f0562..f391f323 100644 --- a/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -7,7 +7,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRoundConfig, AutoRound -from auto_round.testing_utils import require_ipex, require_itrex, require_gptqmodel, require_old_version +from auto_round.testing_utils import require_itrex, require_gptqmodel class TestAutoRound(unittest.TestCase): @@ -50,7 +50,6 @@ def tearDownClass(self): ## require torch 2.6 @require_itrex - @require_old_version def test_load_gptq_model_8bits(self): model_name = "acloudfan/opt-125m-gptq-8bit" quantization_config = AutoRoundConfig() @@ -61,7 +60,6 @@ def test_load_gptq_model_8bits(self): self.model_infer(model, tokenizer) @require_itrex - @require_old_version def test_load_gptq_model_2bits(self): model_name = "LucasSantiago257/gemma-2b-2bits-gptq" quantization_config = AutoRoundConfig() @@ -71,7 +69,7 @@ def test_load_gptq_model_2bits(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model_infer(model, tokenizer) - @require_ipex + @require_itrex def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) diff --git a/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py similarity index 73% rename from test_cuda/test_support_vlms.py rename to test/test_cuda/test_support_vlms.py index 008ca0d4..54fe86a0 100644 --- a/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -24,60 +24,60 @@ def tearDownClass(self): shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel - def test_qwen2(self): - model_path = "/models/Qwen2-VL-2B-Instruct/" 
- # test tune - res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " - f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") - - # test infer - quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128-auto_round") + # def test_qwen2(self): + # model_path = "/models/Qwen2-VL-2B-Instruct/" + # # test tune + # res = os.system( + # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") + # self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + + # # test infer + # quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128-auto_round") - from transformers import Qwen2VLForConditionalGeneration, AutoProcessor - model = Qwen2VLForConditionalGeneration.from_pretrained( - quantized_model_path, - torch_dtype="float16", - device_map=f"cuda:{self.device}", - ) - processor = AutoProcessor.from_pretrained(quantized_model_path) - image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - messages = [ - { - "role": "user", - "content": [ - { - "type": "image", - "image": image_url, - }, - {"type": "text", "text": "Describe this image."}, - ], - } - ] - - # Preparation for inference - text = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - image_inputs = Image.open(requests.get(image_url, stream=True).raw) - inputs = processor( - text=[text], - images=image_inputs, - padding=True, - return_tensors="pt", - ) - inputs = inputs.to(model.device) - - generated_ids = model.generate(**inputs, max_new_tokens=128) - generated_ids_trimmed = [ - out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - print(output_text[0]) - shutil.rmtree(quantized_model_path, ignore_errors=True) + # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor + # model = Qwen2VLForConditionalGeneration.from_pretrained( + # quantized_model_path, + # torch_dtype="float16", + # device_map=f"cuda:{self.device}", + # ) + # processor = AutoProcessor.from_pretrained(quantized_model_path) + # image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" + # messages = [ + # { + # "role": "user", + # "content": [ + # { + # "type": "image", + # "image": image_url, + # }, + # {"type": "text", "text": "Describe this image."}, + # ], + # } + # ] + + # # Preparation for inference + # text = processor.apply_chat_template( + # messages, tokenize=False, add_generation_prompt=True + # ) + # image_inputs = Image.open(requests.get(image_url, stream=True).raw) + # inputs = processor( + # text=[text], + # images=image_inputs, + # padding=True, + # return_tensors="pt", + # ) + # inputs = inputs.to(model.device) + + # generated_ids = model.generate(**inputs, max_new_tokens=128) + # generated_ids_trimmed = [ + # out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + # ] + # output_text = processor.batch_decode( + # generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + # ) + # print(output_text[0]) + # shutil.rmtree(quantized_model_path, ignore_errors=True) @require_vlm_env def test_phi3(self): @@ -238,45 +238,46 @@ class DataArgs: shutil.rmtree(quantized_model_path, 
ignore_errors=True) @require_gptqmodel - def test_llama(self): - model_path = "/models/Llama-3.2-11B-Vision-Instruct/" - ## test tune - res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " - f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") - self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail") - - ## test infer - from transformers import MllamaForConditionalGeneration, AutoProcessor - quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128-auto_round") - model = MllamaForConditionalGeneration.from_pretrained( - quantized_model_path, - torch_dtype="float16", - device_map=f"cuda:{self.device}", - ) - processor = AutoProcessor.from_pretrained(quantized_model_path) - image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - messages = [ - {"role": "user", "content": [ - {"type": "image"}, - {"type": "text", "text": "Please write a haiku for this one, it would be: "} - ]} - ] - - # Preparation for inference - image = Image.open(requests.get(image_url, stream=True).raw) - input_text = processor.apply_chat_template(messages, add_generation_prompt=True) - inputs = processor( - image, - input_text, - add_special_tokens=False, - return_tensors="pt" - ).to(model.device) - - output = model.generate(**inputs, max_new_tokens=50) - print(processor.decode(output[0])) - shutil.rmtree(quantized_model_path, ignore_errors=True) + # def test_llama(self): + # model_path = "/models/Llama-3.2-11B-Vision-Instruct/" + # ## test tune + # res = os.system( + # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") + # self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail") + + # ## test infer + # from transformers import MllamaForConditionalGeneration, AutoProcessor + # quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128-auto_round") + # model = MllamaForConditionalGeneration.from_pretrained( + # quantized_model_path, + # torch_dtype="float16", + # device_map=f"cuda:{self.device}", + # ) + # processor = AutoProcessor.from_pretrained(quantized_model_path) + # image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + # messages = [ + # {"role": "user", "content": [ + # {"type": "image"}, + # {"type": "text", "text": "Please write a haiku for this one, it would be: "} + # ]} + # ] + + # # Preparation for inference + # image = Image.open(requests.get(image_url, stream=True).raw) + # input_text = processor.apply_chat_template(messages, add_generation_prompt=True) + # inputs = processor( + # image, + # input_text, + # add_special_tokens=False, + # return_tensors="pt" + # ).to(model.device) + + # output = model.generate(**inputs, max_new_tokens=50) + # print(processor.decode(output[0])) + # shutil.rmtree(quantized_model_path, ignore_errors=True) + @require_vlm_env def test_cogvlm(self): model_path = "/models/cogvlm2-llama3-chat-19B/" ## test tune @@ -331,15 +332,16 @@ def test_cogvlm(self): print(response) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_72b(self): - model_path = "/models/Qwen2-VL-72B-Instruct/" - res = os.system( - f"cd .. 
&& {self.python_path} -m auto_round --mllm " - f"--model {model_path} --iter 1 --nsamples 1 --bs 1 --output_dir {self.save_dir} --device {self.device}" - ) - self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail") - shutil.rmtree(self.save_dir, ignore_errors=True) + # def test_72b(self): + # model_path = "/models/Qwen2-VL-72B-Instruct/" + # res = os.system( + # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"--model {model_path} --iter 1 --nsamples 1 --bs 1 --output_dir {self.save_dir} --device {self.device}" + # ) + # self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail") + # shutil.rmtree(self.save_dir, ignore_errors=True) + @require_vlm_env def test_deepseek_vl2(self): model_path = "/models/deepseek-vl2-tiny" res = os.system( diff --git a/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py similarity index 98% rename from test_cuda/test_triton_backend.py rename to test/test_cuda/test_triton_backend.py index 2db292b6..bac83243 100644 --- a/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -10,7 +10,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig -from auto_round.testing_utils import require_new_version +from auto_round.testing_utils import require_greater_than_050 class LLMDataLoader: @@ -62,7 +62,7 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_4bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -110,7 +110,7 @@ def test_tritonv2_4bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_2bits_asym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -155,7 +155,7 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -204,7 +204,7 @@ def test_tritonv2_4bits_sym(self): shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_8bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -251,7 +251,7 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - @require_new_version + @require_greater_than_050 def test_tritonv2_2bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) diff --git a/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py similarity index 99% rename from test_cuda/test_vlms.py rename to test/test_cuda/test_vlms.py index 9faddb05..7917ffc5 100644 --- 
a/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -10,7 +10,7 @@ from PIL import Image from auto_round import AutoRoundConfig -from auto_round.testing_utils import require_gptqmodel, require_vlm_env +from auto_round.testing_utils import require_gptqmodel, require_vlm_env, require_optimum class TestAutoRound(unittest.TestCase): @@ -89,6 +89,7 @@ def qwen_inference(self, quantized_model_dir): print(output_text[0]) @require_gptqmodel + @require_optimum def test_vlm_tune(self): from auto_round import AutoRoundMLLM from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer diff --git a/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py similarity index 100% rename from test_xpu/test_autoround.py rename to test/test_xpu/test_autoround.py From 94a88196cc167b43a89dd924c9651c3535375728 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 13 May 2025 01:21:13 -0400 Subject: [PATCH 4/7] fix Signed-off-by: n1ck-guo --- test/test_cpu/test_act_quantization.py | 2 +- test/test_cpu/test_autoopt.py | 2 +- test/test_cpu/test_autoround.py | 2 +- test/test_cpu/test_autoround_acc.py | 2 +- .../test_autoround_export_to_itrex.py | 2 +- test/test_cpu/test_basic_usage.py | 20 +++++++++---------- test/test_cpu/test_block_names.py | 4 ++-- test/test_cpu/test_calib_dataset.py | 2 +- test/test_cpu/test_conv1d.py | 2 +- test/test_cpu/test_export.py | 2 +- test/test_cpu/test_generation.py | 2 +- test/test_cpu/test_gguf_format.py | 6 +++--- test/test_cpu/test_hpu.py | 2 +- test/test_cpu/test_load_awq_gptq.py | 2 +- test/test_cpu/test_low_cpu_mem.py | 2 +- test/test_cpu/test_mllm.py | 2 +- test/test_cpu/test_script.py | 4 ++-- test/test_cpu/test_utils.py | 2 ++ test/test_cpu/test_woq_linear.py | 3 ++- test/test_cuda/requirements_vlm.txt | 4 +--- test/test_cuda/test_2_3bits.py | 2 +- test/test_cuda/test_auto_round_format.py | 2 +- test/test_cuda/test_calib_dataset.py | 2 +- test/test_cuda/test_conv1d.py | 2 +- test/test_cuda/test_exllamav2_backend.py | 2 +- test/test_cuda/test_export.py | 2 +- test/test_cuda/test_get_block_name.py | 2 +- test/test_cuda/test_gguf.py | 6 +++--- test/test_cuda/test_main_func.py | 2 +- test/test_cuda/test_marlin_backend.py | 2 +- test/test_cuda/test_multiple_card.py | 2 +- test/test_cuda/test_multiple_card_calib.py | 4 ++-- test/test_cuda/test_qbits.py | 2 +- test/test_cuda/test_support_vlms.py | 18 ++++++++--------- test/test_cuda/test_triton_backend.py | 2 +- test/test_cuda/test_vlms.py | 2 +- test/test_xpu/test_autoround.py | 2 +- 37 files changed, 63 insertions(+), 62 deletions(-) diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index a4ada07d..e72c9931 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index 3ea4565d..6a986706 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 77ad0dac..02352c00 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -5,7 +5,7 @@ 
from auto_round.eval.evaluation import simple_evaluate_user_model -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 37229b1c..1158e9be 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -3,7 +3,7 @@ import shutil import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from math import isclose diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py index ba5424fd..7e894b63 100644 --- a/test/test_cpu/test_autoround_export_to_itrex.py +++ b/test/test_cpu/test_autoround_export_to_itrex.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_basic_usage.py b/test/test_cpu/test_basic_usage.py index 695271be..eeedbcd6 100644 --- a/test/test_cpu/test_basic_usage.py +++ b/test/test_cpu/test_basic_usage.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, '..') +sys.path.insert(0, '../..') class TestAutoRoundCmd(unittest.TestCase): @@ -21,52 +21,52 @@ def test_auto_round_cmd(self): ##test llm script # res = os.system( - # f"cd .. && {python_path} -m auto_round -h") + # f"cd ../.. && {python_path} -m auto_round -h") # if res > 0 or res == -1: # assert False, "cmd line test fail, please have a check" # res = os.system( - f"cd .. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa") + f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai") + f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test mllm script # test auto_round_mllm help res = os.system( - f"cd .. && {python_path} -m auto_round --mllm -h") + f"cd ../.. && {python_path} -m auto_round --mllm -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --eval help res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --eval -h") + f"cd ../.. && {python_path} -m auto_round --mllm --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --lmms -h") + f"cd ../.. 
&& {python_path} -m auto_round --mllm --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 32 --format auto_round --output_dir ./saved") + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 32 --format auto_round --output_dir ./saved") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd .. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 256 --format auto_round" + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 256 --format auto_round" " --quant_nontext_module --output_dir ./saved ") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 7c618b68..2e26565f 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -3,7 +3,7 @@ import sys import unittest sys.path.insert(0, ".") -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import torch.nn as nn from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig @@ -203,7 +203,7 @@ def test_moe(self): ##tokenizer = AutoTokenizer.from_pretrained(model_name) # python_path = sys.executable # res = os.system( - # f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iter 1 --nsamples 1 --format auto_round --output_dir test/saved --disable_eval") + # f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iter 1 --nsamples 1 --format auto_round --output_dir test/saved --disable_eval") # if res > 0 or res == -1: # assert False, "cmd line test fail, please have a check" # diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index 57990d9b..5dea5bf0 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import json import torch diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index 4f45fcbd..cfcac0bd 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 55cecd37..fd751c3c 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index e2a14097..0ca6390e 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c85eded5..b353232a 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -2,7 +2,7 @@ import sys import unittest import shutil -sys.path.insert(0, "..") +sys.path.insert(0, "../..") 
import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -35,7 +35,7 @@ def tearDownClass(self): def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" + f"cd ../.. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" f" --tasks piqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -43,7 +43,7 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name}" + f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_hpu.py b/test/test_cpu/test_hpu.py index f7c87f0d..629a9321 100644 --- a/test/test_cpu/test_hpu.py +++ b/test/test_cpu/test_hpu.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 6db06c03..08eb8d75 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_low_cpu_mem.py b/test/test_cpu/test_low_cpu_mem.py index 737fc164..2c4378f0 100644 --- a/test/test_cpu/test_low_cpu_mem.py +++ b/test/test_cpu/test_low_cpu_mem.py @@ -2,7 +2,7 @@ import sys import os import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index fd6d996b..16005241 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -1,7 +1,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round import AutoRoundMLLM diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py index 32662134..069a59ec 100644 --- a/test/test_cpu/test_script.py +++ b/test/test_cpu/test_script.py @@ -2,13 +2,13 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") class TestScript(unittest.TestCase): def test_default(self): os.system(''' - cd .. && + cd ../.. 
&& python -m auto_round --iters 2 --deployment_device fake diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index e9faedbe..eff324e4 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,4 +1,6 @@ from unittest.mock import patch +import sys +sys.path.insert(0, "../..") import auto_round.utils as auto_round_utils class TestPackingWithNumba: diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index f049890e..1f48e230 100644 --- a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,6 +1,7 @@ import pytest import torch - +import sys +sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt index 46d8220d..707b14ea 100644 --- a/test/test_cuda/requirements_vlm.txt +++ b/test/test_cuda/requirements_vlm.txt @@ -1,16 +1,14 @@ # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 # git clone https://github.com/deepseek-ai/DeepSeek-VL2.git && cd DeepSeek-VL2 && pip install -e . accelerate -autoawq -auto-gptq bitsandbytes==0.44.0 datasets einops flash-attn==2.6.1 -gptqmodel>=2.0 intel-extension-for-pytorch==2.6.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 +nvidia-cudnn-cu12==8.9.7.29 numpy < 2.0 optimum pandas diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 5f49c254..51eec8b7 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -4,7 +4,7 @@ import unittest import re -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 5fabe1bd..817b5087 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050, require_autogptq, require_awq, require_ipex diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index a4d8a73e..69479da1 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import json import torch diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index 79dad6d6..bf0daba3 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index de243219..151cf690 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -2,7 +2,7 @@ import sys import unittest import pytest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 62e75225..bbd0b5f5 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch 
import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index f077a65d..fd89e2aa 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoModelForVision2Seq, \ diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index f31de413..43044672 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -60,7 +60,7 @@ def test_gguf_format(self): save_dir = os.path.join(os.path.dirname(__file__), "saved") model_path = "Qwen/Qwen2.5-0.5B-Instruct" res = os.system( - f"cd .. && {sys.executable} -m auto_round --model {model_path} --iter 2 " + f"cd ../.. && {sys.executable} -m auto_round --model {model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) @@ -110,7 +110,7 @@ def test_q2_k_export(self): def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd .. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" + f"cd ../.. && {python_path} -m auto_round --model {self.model_name} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index be07692b..a3453338 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -4,7 +4,7 @@ import unittest import re -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index a800eb00..cb3516ca 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -3,7 +3,7 @@ import unittest import pytest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 8c0e5e7f..7f2dcd11 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -2,7 +2,7 @@ import sys import unittest import shutil -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 2d7ff712..63a494f7 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -4,7 +4,7 @@ import shutil import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round.testing_utils import multi_card @@ -35,7 +35,7 @@ def test_multiple_card_calib(self): ##test llm script res = os.system( - f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") + f"cd ../.. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index f391f323..8b79d97b 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -2,7 +2,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 54fe86a0..fe424d20 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -3,7 +3,7 @@ import shutil import unittest -sys.path.insert(0, '..') +sys.path.insert(0, '../..') from auto_round import AutoRoundConfig ## must import for auto-round format from auto_round.testing_utils import require_gptqmodel, require_vlm_env @@ -28,7 +28,7 @@ def tearDownClass(self): # model_path = "/models/Qwen2-VL-2B-Instruct/" # # test tune # res = os.system( - # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"cd ../.. && {self.python_path} -m auto_round --mllm " # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") # self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") @@ -84,7 +84,7 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") @@ -142,7 +142,7 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}") @@ -205,7 +205,7 @@ def test_llava(self): model_path = "/models/llava-v1.5-7b/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") self.assertFalse(res > 0 or res == -1, msg="llava-v1.5-7b tuning fail") @@ -242,7 +242,7 @@ class DataArgs: # model_path = "/models/Llama-3.2-11B-Vision-Instruct/" # ## test tune # res = os.system( - # f"cd .. && {self.python_path} -m auto_round --mllm " + # f"cd ../.. && {self.python_path} -m auto_round --mllm " # f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") # self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail") @@ -282,7 +282,7 @@ def test_cogvlm(self): model_path = "/models/cogvlm2-llama3-chat-19B/" ## test tune res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}") self.assertFalse(res > 0 or res == -1, msg="cogvlm2 tuning fail") @@ -335,7 +335,7 @@ def test_cogvlm(self): # def test_72b(self): # model_path = "/models/Qwen2-VL-72B-Instruct/" # res = os.system( - # f"cd .. 
&& {self.python_path} -m auto_round --mllm " + # f"cd ../.. && {self.python_path} -m auto_round --mllm " # f"--model {model_path} --iter 1 --nsamples 1 --bs 1 --output_dir {self.save_dir} --device {self.device}" # ) # self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail") @@ -345,7 +345,7 @@ def test_cogvlm(self): def test_deepseek_vl2(self): model_path = "/models/deepseek-vl2-tiny" res = os.system( - f"cd .. && {self.python_path} -m auto_round --mllm " + f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 3 --nsamples 10 --bs 4 --output_dir {self.save_dir} --device auto --group_size 32 " f"--fp_layers language.model.layers.4,language.model.layers.6" ) diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index bac83243..ddb24eca 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from auto_round.eval.evaluation import simple_evaluate_user_model import torch diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index 7917ffc5..ccc24a4f 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -6,7 +6,7 @@ import unittest import requests -sys.path.insert(0, "..") +sys.path.insert(0, "../..") from PIL import Image from auto_round import AutoRoundConfig diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index d0ab1dd9..caad1663 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -3,7 +3,7 @@ import sys import unittest -sys.path.insert(0, "..") +sys.path.insert(0, "../..") import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer From ee2c0b5510f3807c67e26021cdfb96ac2d9ff56f Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 13 May 2025 14:52:10 +0800 Subject: [PATCH 5/7] fix ci path Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_ut.sh | 13 +++++++------ .azure-pipelines/scripts/ut/run_ut_hpu.sh | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh index 91509b25..b482f29b 100644 --- a/.azure-pipelines/scripts/ut/run_ut.sh +++ b/.azure-pipelines/scripts/ut/run_ut.sh @@ -2,16 +2,17 @@ set -xe # install requirements -echo "set up UT env..." +echo "##[group]set up UT env..." export TQDM_MININTERVAL=60 -export TQDM_POSITION=-1 pip install pytest-cov pytest-html -pip install -r /auto-round/test/requirements.txt -pip list +pip install -r /auto-round/test/test_cpu/requirements.txt + # install latest gguf for ut test git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install . +echo "##[endgroup]" +pip list -cd /auto-round/test || exit 1 +cd /auto-round/test/test_cpu || exit 1 find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + export LD_LIBRARY_PATH=${HOME}/.local/lib/:$LD_LIBRARY_PATH @@ -31,7 +32,7 @@ cp report.html ${LOG_DIR}/ cp coverage.xml ${LOG_DIR}/ if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then - echo "Find errors in pytest case, please check the output..." + echo "##[error]Find errors in pytest case, please check the output..." 
exit 1 fi diff --git a/.azure-pipelines/scripts/ut/run_ut_hpu.sh b/.azure-pipelines/scripts/ut/run_ut_hpu.sh index 750562c2..ec2ad0d2 100644 --- a/.azure-pipelines/scripts/ut/run_ut_hpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_hpu.sh @@ -6,7 +6,7 @@ echo "set up UT env..." pip install pytest-cov pytest-html pip list -cd /auto-round/test || exit 1 +cd /auto-round/test/test_cpu || exit 1 find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH @@ -31,7 +31,7 @@ cp report.html ${LOG_DIR}/ cp coverage.xml ${LOG_DIR}/ if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then - echo "Find errors in pytest case, please check the output..." + echo "##[error]Find errors in pytest case, please check the output..." exit 1 fi From bf1d495fd5cadc78f85bad70ccb38ddb11057c9d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 13 May 2025 21:26:48 -0400 Subject: [PATCH 6/7] update requirements for vlm Signed-off-by: n1ck-guo --- test/test_cuda/requirements_vlm.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt index 707b14ea..ac0034e4 100644 --- a/test/test_cuda/requirements_vlm.txt +++ b/test/test_cuda/requirements_vlm.txt @@ -8,14 +8,13 @@ flash-attn==2.6.1 intel-extension-for-pytorch==2.6.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 -nvidia-cudnn-cu12==8.9.7.29 numpy < 2.0 optimum pandas pillow py-cpuinfo -torch==2.0.1 -torchvision==0.15.2 -triton==2.0.0 +torch==2.3.0 +torchvision +triton==2.3.0 tqdm transformers==4.45.0 From 2b85ac258329ee09bb5b598180dc9b226e6ef9ed Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 May 2025 01:17:58 -0400 Subject: [PATCH 7/7] update; Signed-off-by: n1ck-guo --- test/test_cuda/requirements_vlm.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_cuda/requirements_vlm.txt b/test/test_cuda/requirements_vlm.txt index ac0034e4..bd3cafeb 100644 --- a/test/test_cuda/requirements_vlm.txt +++ b/test/test_cuda/requirements_vlm.txt @@ -1,11 +1,11 @@ # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 -# git clone https://github.com/deepseek-ai/DeepSeek-VL2.git && cd DeepSeek-VL2 && pip install -e . +# pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git accelerate bitsandbytes==0.44.0 datasets einops flash-attn==2.6.1 -intel-extension-for-pytorch==2.6.0 +intel-extension-for-pytorch==2.3.0 intel-extension-for-transformers lm-eval>=0.4.2,<0.5 numpy < 2.0 @@ -18,3 +18,4 @@ torchvision triton==2.3.0 tqdm transformers==4.45.0 +xformers
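
Several hunks above change the test imports from require_new_version to require_greater_than_050, but the matching definition in auto_round/testing_utils.py is not shown in this excerpt (it lands in an earlier patch of the series). A minimal sketch of one way such a version-gated skip decorator could be written, using unittest.skipUnless together with transformers' require_version, is given below; the helper name is_greater_than_050 and the exact message are assumptions, so the real definition may differ.

import unittest

from transformers.utils.versions import require_version


def is_greater_than_050():
    # Assumed helper: True only when auto-round >= 0.5.0 is installed.
    try:
        require_version("auto-round>=0.5.0")
        return True
    except ImportError:
        return False


def require_greater_than_050(test_case):
    """Skip the wrapped test unless auto-round >= 0.5.0 is available."""
    return unittest.skipUnless(
        is_greater_than_050(), "test requires auto-round>=0.5.0"
    )(test_case)

With a definition along these lines, decorated tests such as test_tritonv2_4bits_asym are reported as skipped rather than failed when an older auto-round is installed.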
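
The last two patches repin the VLM test stack in test/test_cuda/requirements_vlm.txt (torch 2.3.0, triton 2.3.0, transformers 4.45.0, plus xformers). A small optional check like the following can confirm the installed versions match those pins before running the CUDA suite; the snippet itself is not part of this patch, and the package subset was chosen only for illustration.

from importlib.metadata import PackageNotFoundError, version

# Pins taken from test/test_cuda/requirements_vlm.txt as updated in this series.
expected = {
    "torch": "2.3.0",
    "triton": "2.3.0",
    "transformers": "4.45.0",
}

for pkg, want in expected.items():
    try:
        have = version(pkg)
    except PackageNotFoundError:
        have = "not installed"
    status = "ok" if have == want else "mismatch"
    print(f"{pkg}: installed={have}, pinned={want} ({status})")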