diff --git a/examples/huggingface/pytorch/code-generation/quantization/README.md b/examples/huggingface/pytorch/code-generation/quantization/README.md index 92c5ccd1619..b3f9f336df1 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/README.md +++ b/examples/huggingface/pytorch/code-generation/quantization/README.md @@ -19,10 +19,10 @@ pip install -r requirements.txt # Run We provide compression technologies such as `MixedPrecision`, `SmoothQuant` and `WeightOnlyQuant` with `Rtn/Awq/Teq/GPTQ/AutoRound` algorithms and `BitsandBytes`, `load_in_4bit` and `load_in_8bit` work on CPU device, the followings are command to show how to use it. ->**Note**: -> Model type "llama" will default use [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/339bd251841e153ad9c34e1033ab8b2d936a1781/docs/tutorials/llm/llm_optimize_transformers.md) to accelerate the inference, but "llama" requests transformers version lower than 4.36.0, "falcon" requests transformers version lower than 4.33.3. -## 1. Performance +## MixedPrecison and SmoothQuant + +### 1. Performance ```bash export KMP_BLOCKTIME=1 export KMP_SETTINGS=1 @@ -30,52 +30,35 @@ export KMP_AFFINITY=granularity=fine,compact,1,0 export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so # fp32 -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model bigcode/starcoder \ --benchmark \ - --batch_size 1 + --benchmark_batch_size 1 + # mixedprecision -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model bigcode/starcoder \ --mixed_precision \ --benchmark \ --batch_size 1 + # smoothquant # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision. -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --output_dir "./saved_results" \ --sq \ --alpha 0.7 \ - --calib_iters 500 \ + --calib_n_samples 500 \ --dataset "mbpp" - --int8 \ - --benchmark \ - --batch_size 1 -# weightonlyquant -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ - --model bigcode/starcoder \ - --woq \ - --benchmark \ - --batch_size 1 -# load_in_4bit -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ - --model bigcode/starcoder \ - --load_in_4bit \ - --benchmark \ - --batch_size 1 -# load_in_8bit -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ - --model bigcode/starcoder \ - --load_in_8bit \ --benchmark \ --batch_size 1 ``` -## 2. Accuracy +### 2. Accuracy ```bash # fp32 -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --accuracy \ --batch_size 20 \ @@ -85,7 +68,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # mixedprecision -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --mixed_precision \ --accuracy \ @@ -97,11 +80,10 @@ python run_generation.py \ --tasks "humaneval" # smoothquant # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision. -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --sq \ --alpha 1.0 \ - --int8 \ --accuracy \ --batch_size 20 \ --n_samples 20 \ @@ -109,11 +91,42 @@ python run_generation.py \ --temperature 0.2 \ --do_sample \ --tasks "humaneval" +``` + +## WeightOnlyQuant + +1. 
### Performance + +```bash +# weightonlyquant +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ + --model bigcode/starcoder \ + --woq \ + --benchmark \ + --benchmark_batch_size 1 +# load_in_4bit +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ + --model bigcode/starcoder \ + --load_in_4bit \ + --benchmark \ + --benchmark_batch_size 1 +# load_in_8bit +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ + --model bigcode/starcoder \ + --load_in_8bit \ + --benchmark \ + --benchmark_batch_size 1 +``` + +2. ### Accuracy + +```bash + # weightonlyquant -python run_generation.py \ +python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --woq \ - --woq_weight_dtype "nf4" \ + --weight_dtype "nf4" \ --accuracy \ --batch_size 20 \ --n_samples 20 \ @@ -122,7 +135,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # load_in_4bit -python run_generation.py \ +python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --load_in_4bit \ --accuracy \ @@ -133,7 +146,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # load_in_8bit -python run_generation.py \ +python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --load_in_8bit \ --accuracy \ @@ -166,17 +179,14 @@ This creates an image called `evaluation-harness-multiple`, and runs a test on i Suppose the fp32 model is `starcoder-3b`, saved quantized model in `saved_results` and do evaluation on `multiple-lua` tasks with: ``` docker run -v $(CURDIR):$(CURDIR) -it /bin/bash -python3 run_generation.py \ +python3 run_generation_sq.py \ --model $(CURDIR)/starcoder-3b \ - --quantize \ --sq \ --alpha 0.7 \ - --ipex \ - --calib_iters 500 \ + --calib_n_samples 500 \ --calib_batch_size 1 \ --dataset "mbpp" \ --output_dir "$(CURDIR)/saved_results" \ - --int8 \ --accuracy \ --tasks multiple-py \ --batch_size 20 \ @@ -191,9 +201,9 @@ python3 run_generation.py \ To run the container (here from image `evaluation-harness-multiple`) to quantize and evaluate on `CURDIR`, or another file mount it with -v, specify n_samples and allow code execution with --allow_code_execution (and add the number of problems --limit if it was used during generation): ```bash docker run -v $(CURDIR):$(CURDIR) \ - -it $(IMAGE_NAME) python3 run_generation.py --model $(CURDIR)/starcoder-3b --quantize --sq --alpha 0.7 --ipex \ - --calib_iters 5 --calib_batch_size 1 --dataset "mbpp" --calib_split "test" --output_dir "$(CURDIR)/saved_results" \ - --int8 --accuracy --tasks multiple-py --batch_size 20 --n_samples 20 --allow_code_execution \ + -it $(IMAGE_NAME) python3 run_generation_sq.py --model $(CURDIR)/starcoder-3b --sq --alpha 0.7 + --calib_n_samples 5 --calib_batch_size 1 --dataset "mbpp" --output_dir "$(CURDIR)/saved_results" \ + --accuracy --tasks multiple-py --batch_size 20 --n_samples 20 --allow_code_execution \ --do_sample --temperature 0.2 --limit 2 ``` diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh index 7100da888ca..060a6bcdd9c 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh @@ -14,7 +14,7 @@ function init_params { batch_size=1 tuned_checkpoint=saved_results lm_eval_tasks="humaneval" - script="run_generation.py" + script="run_generation_sq.py" for var in "$@" do case $var in @@ -85,7 +85,7 @@ function run_benchmark { if [[ ${int8} == "true" ]]; then - 
extra_cmd=$extra_cmd" --int8" + model_name_or_path=$tuned_checkpoint fi echo $extra_cmd diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py similarity index 64% rename from examples/huggingface/pytorch/code-generation/quantization/run_generation.py rename to examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py index 9f37784eea2..6934d8b55b3 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py @@ -1,19 +1,10 @@ import argparse -import re import time -import json import os -import pathlib import torch -import types -import numpy as np -from itertools import chain -from pathlib import Path from transformers import AutoTokenizer, AutoConfig from optimum.utils import NormalizedConfigManager from intel_extension_for_transformers.transformers import ( - MixedPrecisionConfig, - SmoothQuantConfig, BitsAndBytesConfig, RtnConfig, AwqConfig, @@ -24,71 +15,29 @@ from intel_extension_for_transformers.transformers import ( AutoModelForCausalLM, ) -from intel_extension_for_transformers.transformers.utils import str2bool - +from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import( + generate_dummy_past_key_values, +) parser = argparse.ArgumentParser() # ============Main configs============ parser.add_argument( "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase" ) -parser.add_argument("--trust_remote_code", action="store_true") -parser.add_argument("--_commit_hash", default=None, type=str) -parser.add_argument("--use_neural_speed", action="store_true") parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp") parser.add_argument("--dtype", type=str, default="int8") parser.add_argument( "--max_new_tokens", default=32, type=int, help="output max new tokens" ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--calib_iters", default=500, type=int, help="calibration iters.") -parser.add_argument("--int8", action="store_true") -parser.add_argument( - "--int8_bf16_mixed", - action="store_true", - help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", -) # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") -parser.add_argument("--iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="num benchmark batchsize") +parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") parser.add_argument( "--prompt_size", default=32, type=int, help="generate dummy input_ids size" ) -# ============Accuracy configs============== -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--batch_size", default=56, type=int, help="batch size num.") -parser.add_argument( - "--save_accuracy_path", default=None, help="Save accuracy results path." -) -# ============MixedPrecision configs============== -parser.add_argument("--mixed_precision", action="store_true") -# ============SmoothQuant configs============== -parser.add_argument("--sq", action="store_true") -parser.add_argument( - "--calib_padding", action="store_true", help="Calibration dataset do padding." 
-) -parser.add_argument( - "--calib_shuffle", - default=True, - type=str2bool, - help="Calibration dataset do shuffle.", -) -parser.add_argument( - "--calib_pad_val", default=1, type=int, help="Calibration dataset padding value." -) -parser.add_argument( - "--calib_len", - default=512, - type=int, - help="Calibration dataset max or padding max length.", -) -parser.add_argument( - "--recipes", type=str, help="A dictionary as a string, recipes for smoothquant." -) -parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.") -# ============BitsAndBytes configs============== -parser.add_argument("--bitsandbytes", action="store_true") # ============WeightOnlyQuant configs=============== parser.add_argument("--woq", action="store_true") parser.add_argument( @@ -124,7 +73,7 @@ "--scale_dtype", type=str, default="fp32", - choices=["fp32", "fp8"], + choices=["fp32", "bf16", "fp8"], ) parser.add_argument( "--compute_dtype", @@ -132,15 +81,28 @@ default="fp32", choices=["fp32", "bf16", "int8"], ) -parser.add_argument("--group_size", type=int, default=32) -parser.add_argument("--scheme", default="sym") -parser.add_argument("--load_in_4bit", action="store_true") -parser.add_argument("--load_in_8bit", action="store_true") +parser.add_argument("--group_size", type=int, default=128) +parser.add_argument("--scheme", default=None) parser.add_argument( "--layer_wise", action="store_true", help="Use layer wise to do quantization", ) +parser.add_argument( + "--calib_n_samples", type=int, default=512, help="Number of calibration data samples." +) +parser.add_argument( + "--seq_len", + type=int, + default=2048, + help="Calibration dataset sequence max length, this should align with your model config", +) +parser.add_argument( + "--calib_batch_size", + type=int, + default=8, + help="Calibration batchsize.", +) # ============GPTQ configs============== parser.add_argument( "--desc_act", @@ -153,21 +115,17 @@ default=0.01, help="Percent of the average Hessian diagonal to use for dampening.", ) +parser.add_argument( + "--true_sequential", + action="store_true", + help="Whether to quantize layers within a transformer block in their original order.", +) parser.add_argument( "--blocksize", type=int, default=128, help="Block size. sub weight matrix size to run GPTQ.", ) -parser.add_argument( - "--nsamples", type=int, default=128, help="Number of calibration data samples." 
-) -parser.add_argument( - "--max_input_length", - type=int, - default=2048, - help="Calibration dataset sequence max length, this should align with your model config", -) parser.add_argument( "--static_groups", action="store_true", @@ -177,20 +135,34 @@ parser.add_argument( "--lr", type=float, - default=0.0025, + default=None, help="learning rate, if None, it will be set to 1.0/iters automatically", ) parser.add_argument( "--minmax_lr", type=float, - default=0.0025, + default=None, help="minmax learning rate, if None,it will beset to be the same with lr", ) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") parser.add_argument( - "--use_quant_input", + "--disable_quanted_input", action="store_true", help="whether to use the output of quantized block to tune the next block", ) +parser.add_argument( + "--quant_lm_head", + action="store_true", + help="whether to quant the lm head layer", +) +# ============BitsAndBytes configs============== +parser.add_argument("--bitsandbytes", action="store_true") +# ============AutoModel parameters============== +parser.add_argument("--load_in_4bit", action="store_true") +parser.add_argument("--load_in_8bit", action="store_true") +parser.add_argument("--_commit_hash", default=None, type=str) +parser.add_argument("--trust_remote_code", action="store_true") +parser.add_argument("--use_neural_speed", action="store_true") # ============Harness configs============ parser.add_argument("--tasks", default=None, help="Evaluation tasks") parser.add_argument( @@ -250,6 +222,7 @@ help="Path of additional data to load for the tasks", ) # ============Evaluation configs============== +parser.add_argument("--accuracy", action="store_true") parser.add_argument("--prefix", default="") parser.add_argument("--do_sample", action="store_true") parser.add_argument("--temperature", default=0.2, type=float) @@ -258,6 +231,7 @@ parser.add_argument("--n_samples", default=1, type=int) parser.add_argument("--eos", default="<|endoftext|>", type=str) parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") args = parser.parse_args() @@ -273,11 +247,7 @@ args.model, torchscript=( True - if ( - args.sq - or args.woq_algo in ["Awq", "Teq"] - or (args.int8 or args.int8_bf16_mixed or args.benchmark) - ) + if args.woq_algo in ["Awq", "Teq"] else False ), # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
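
The hunks below rewire `run_generation_cpu_woq.py` so that each `--woq_algo` maps onto its config class with the renamed calibration flags (`--calib_n_samples`, `--seq_len`, `--calib_batch_size`). As a point of reference, here is a minimal sketch of the simplest path after this change (RTN, which needs no calibration data). The keyword names are taken from the hunks below; the concrete values are illustrative only, and `"nf4"` is simply the dtype the README's accuracy example uses.

```python
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    RtnConfig,
)

# CLI flags --bits/--group_size/--weight_dtype/--compute_dtype/--scale_dtype
# feed straight into RtnConfig; RTN is data-free, so no tokenizer or dataset
# argument is needed (the patch removes tokenizer= from this call).
quantization_config = RtnConfig(
    bits=4,
    sym=True,
    group_size=128,        # new default in this patch (was 32)
    compute_dtype="fp32",
    scale_dtype="fp32",    # "bf16" and "fp8" are also accepted after this patch
    weight_dtype="nf4",    # illustrative; same value as the README accuracy example
)

model = AutoModelForCausalLM.from_pretrained(
    "bigcode/starcoder",                  # model used throughout the README
    quantization_config=quantization_config,
    trust_remote_code=True,
    use_neural_speed=False,               # plain PyTorch path
)
model.save_pretrained("./saved_results")  # reloaded later for --benchmark/--accuracy
```

The saved directory can then be passed back through `--model` for the `--benchmark` and `--accuracy` runs, which is what the updated script now does automatically by pointing `args.model` at `args.output_dir`.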
@@ -293,34 +263,17 @@ tokenizer.pad_token = tokenizer.eos_token +# Generation +if args.use_neural_speed: + generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1) +else: + generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) -op_type_dict = { - "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, -} -recipes = { - "smooth_quant": True, - "smooth_quant_args": { - "alpha": args.alpha if args.alpha == "auto" else float(args.alpha) - }, -} -excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] -# mp/sq/woq/bitsandbytes config setting +# woq/bitsandbytes config setting quantization_config = None -if args.mixed_precision: - quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 -elif args.sq: - quantization_config = SmoothQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - recipes=recipes, - op_type_dict=op_type_dict, # default is {} - excluded_precisions=excluded_precisions, # default is [] - calib_dataset=args.dataset, - calib_iters=args.calib_iters, - ) -elif args.woq: +if args.woq: if args.woq_algo == "Rtn": quantization_config = RtnConfig( - tokenizer=tokenizer, bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, @@ -328,6 +281,7 @@ scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, layer_wise=args.layer_wise, + use_ipex=args.use_ipex, ) elif args.woq_algo == "Awq": quantization_config = AwqConfig( @@ -336,11 +290,13 @@ bits=args.bits, zero_point=False if args.scheme == "sym" else True, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, + n_samples=args.calib_n_samples, + batch_size=args.calib_batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, + use_ipex=args.use_ipex, ) elif args.woq_algo == "Teq": quantization_config = TeqConfig( @@ -349,11 +305,13 @@ bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, + batch_size=args.calib_batch_size, + n_samples=args.calib_n_samples, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, + use_ipex=args.use_ipex, ) elif args.woq_algo == "GPTQ": quantization_config = GPTQConfig( @@ -364,15 +322,17 @@ damp_percent=args.damp_percent, sym=True if args.scheme == "sym" else False, blocksize=args.blocksize, - nsamples=args.nsamples, static_groups=args.static_groups, group_size=args.group_size, - max_input_length=args.max_input_length, + batch_size=args.calib_batch_size, + n_samples=args.calib_n_samples, + seq_len=args.seq_len, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, layer_wise=args.layer_wise, + true_sequential=args.true_sequential, + use_ipex=args.use_ipex, ) elif args.woq_algo == "AutoRound": quantization_config = AutoRoundConfig( @@ -380,26 +340,30 @@ dataset=args.dataset, bits=args.bits, sym=True if args.scheme == "sym" else False, - nsamples=args.nsamples, + batch_size=args.calib_batch_size, + n_samples=args.calib_n_samples, group_size=args.group_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, - calib_len=args.calib_len, + iters=args.autoround_iters, + seq_len=args.seq_len, lr=args.lr, minmax_lr=args.minmax_lr, - 
use_quant_input=args.use_quant_input, + disable_quanted_input=args.disable_quanted_input, + quant_lm_head = args.quant_lm_head, + use_ipex=args.use_ipex, ) else: assert False, "Please set the correct '--woq_algo'" # bitsandbytes elif args.bitsandbytes: - # GPU device is need for `load_in_4bit` and `load_in_8bit`. quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", ) +else: + print("The quantization_config is None.") # get optimized model if quantization_config is not None: @@ -408,7 +372,7 @@ quantization_config=quantization_config, trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, - use_neural_speed=False, + use_neural_speed=args.use_neural_speed, ) elif args.load_in_4bit or args.load_in_8bit: # CPU device usage is provided by intel-extension-for-transformers. @@ -417,62 +381,29 @@ load_in_4bit=args.load_in_4bit, load_in_8bit=args.load_in_8bit, _commit_hash=args._commit_hash, - use_neural_speed=False, - ) -elif not args.int8 and not args.int8_bf16_mixed: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - config=config, - trust_remote_code=args.trust_remote_code, - _commit_hash=args._commit_hash, - use_neural_speed=False, + use_neural_speed=args.use_neural_speed, ) +else: + print("Didn't do Weight Only Quantization.") # save model -if args.output_dir is not None: +if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): + user_model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) - if args.sq: - config.save_pretrained(args.output_dir) - user_model.save(args.output_dir) - elif args.mixed_precision or args.woq: - # user_model will be changed. - user_model.save_pretrained(args.output_dir) - # loading saved woq model - user_model = AutoModelForCausalLM.from_pretrained( - args.output_dir, - trust_remote_code=args.trust_remote_code, - use_neural_speed=args.use_neural_speed - ) - -if args.int8 or args.int8_bf16_mixed: - # TorchScript model don't attribute generate method, the wrapper is provided. 
- import intel_extension_for_pytorch as ipex - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, - ) + # to validate woq model accuracy + args.model = args.output_dir - user_model = TSModelCausalLMForITREX.from_pretrained( - args.output_dir, - file_name="best_model.pt", +if args.benchmark: + print("Loading model from: ", args.model) + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + quantization_config=quantization_config, trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, + use_neural_speed=args.use_neural_speed, ) - -if args.benchmark: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - user_model.config.model_type - )(user_model.config) - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_beams = 1 - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - num_iter = args.iters + model_config = user_model.config + num_iter = args.benchmark_iters num_warmup = args.num_warmup total_latency = 0 @@ -485,46 +416,16 @@ input_ids = torch.randint( 1, tokenizer.vocab_size, - size=(args.batch_size, args.prompt_size), + size=(args.benchmark_batch_size, args.prompt_size), ) input_bs, input_len = input_ids.shape attention_mask = torch.ones(input_bs, input_len) position_ids = ( torch.arange(input_len).unsqueeze(0).expand(input_bs, -1) ) - if user_model.config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - else: - if not (args.int8 or args.int8_bf16_mixed): - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - past_key_values = tuple(past_key_values) - - else: - new_shape = [input_bs, num_key_value_heads, 1, d_k] - beam_idx_tmp = torch.zeros( - (2048, int(input_bs * num_beams)), dtype=torch.long - ).contiguous() - past_key_values = [ - ( - torch.zeros( - 1, 0, 0, 1, dtype=torch.long - ).contiguous(), - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - beam_idx_tmp, - ) - for _ in range(num_layers) - ] - past_key_values = tuple(past_key_values) + past_key_values = generate_dummy_past_key_values( + config=model_config, input_bs=input_bs + ) inp = { "input_ids": input_ids, @@ -567,7 +468,7 @@ model=user_model, tokenizer=tokenizer, tasks=args.tasks, - batch_size=args.batch_size, + batch_size=args.eval_batch_size, args=args, ) print(results) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py new file mode 100644 index 00000000000..156d3dcfd8a --- /dev/null +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py @@ -0,0 +1,326 @@ +import argparse +import time +import os +import torch +from transformers import AutoTokenizer, AutoConfig +from optimum.utils import NormalizedConfigManager +from intel_extension_for_transformers.transformers import ( + MixedPrecisionConfig, + SmoothQuantConfig, +) +from 
intel_extension_for_transformers.transformers import ( + AutoModelForCausalLM, +) + +from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import( + generate_dummy_past_key_values, + generate_dummy_past_key_values_for_opt_llm, + IPEX_OPT_LLM_SUPPORTED, +) +parser = argparse.ArgumentParser() + +# ============Main configs============ +parser.add_argument( + "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase" +) +parser.add_argument("--trust_remote_code", action="store_true") +parser.add_argument("--_commit_hash", default=None, type=str) +parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp") +parser.add_argument("--dtype", type=str, default="int8") +parser.add_argument( + "--max_new_tokens", default=32, type=int, help="output max new tokens" +) +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +# ============Benchmark configs============== +parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="batch size num.") +parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") +parser.add_argument( + "--prompt_size", default=32, type=int, help="generate dummy input_ids size" +) +# ============MixedPrecision configs============== +parser.add_argument("--mixed_precision", action="store_true") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") +parser.add_argument( + "--calib_n_samples", default=100, type=int, help="Smooth quant calibration samples." +) +parser.add_argument( + "--seq_len", default=512, type=int, help="Smooth quant calibration input length." +) +parser.add_argument("--calib_batch_size", default=1, type=int, help="batch size num.") +parser.add_argument("--padding", action="store_true") +parser.add_argument("--shuffle", action="store_true") +# sq alpha "auto" parameters +parser.add_argument("--scale_sharing", action="store_true") +parser.add_argument( + "--init_alpha", default=0.5, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_min", default=0.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_max", default=1.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_step", default=0.1, type=float, help="Smooth quant parameter." 
+) +parser.add_argument("--shared_criterion", default="max", type=str) +parser.add_argument("--do_blockwise", action="store_true") +parser.add_argument( + "--restore_sq_model_from_json", + action="store_true", + help="restore ipex quantized model from output_dir/best_configure.json", +) +# ============Harness configs============ +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--tasks", default=None, help="Evaluation tasks") +parser.add_argument( + "--limit", default=None, type=int, help="Limit number of samples to eval" +) +parser.add_argument("--allow_code_execution", action="store_true") +parser.add_argument("--generation_only", action="store_true") +parser.add_argument("--postprocess", action="store_false") +parser.add_argument("--save_references", action="store_true") +parser.add_argument("--save_generations", action="store_true") +parser.add_argument("--instruction_tokens", default=None) +parser.add_argument("--save_generations_path", default="generations.json") +parser.add_argument("--load_generations_path", default=None) +parser.add_argument("--metric_output_path", default="evaluation_results.json") +parser.add_argument( + "--load_generations_intermediate_paths", + type=str, + nargs="*", + help="List of paths for saving the intermediate code generations", +) +# ============Generation config============ +parser.add_argument("--max_length_generation", default=512, type=int) +parser.add_argument("--check_references", action="store_true") +parser.add_argument("--max_memory_per_gpu", type=str, default=None) +parser.add_argument( + "--prompt", + type=str, + default="prompt", + help="Prompt type to use for generation in HumanEvalPack tasks", +) +parser.add_argument( + "--modeltype", + default="causal", + help="AutoModel to use, it can be causal or seq2seq", +) +parser.add_argument( + "--limit_start", + type=int, + default=0, + help="Optional offset to start from when limiting the number of samples", +) +parser.add_argument( + "--save_every_k_tasks", + type=int, + default=-1, + help="Optional saving after every k tasks", +) +parser.add_argument( + "--left_padding", + action="store_true", + help="Force left padding, needed for models like chatglm3-6b", +) +parser.add_argument( + "--load_data_path", + type=str, + default=None, + help="Path of additional data to load for the tasks", +) +# ============Evaluation configs============== +parser.add_argument("--prefix", default="") +parser.add_argument("--do_sample", action="store_true") +parser.add_argument("--temperature", default=0.2, type=float) +parser.add_argument("--top_p", default=0.95, type=float) +parser.add_argument("--top_k", default=0, type=int) +parser.add_argument("--n_samples", default=1, type=int) +parser.add_argument("--eos", default="<|endoftext|>", type=str) +parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") +args = parser.parse_args() + + +tokenizer = AutoTokenizer.from_pretrained( + args.model, + truncation_side="left", + padding_side="right", + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, +) + +config = AutoConfig.from_pretrained( + args.model, + torchscript=( + True + if ( + args.sq + or args.benchmark + ) + else False + ), # torchscript will force `return_dict=False` to avoid jit errors + use_cache=True, # to use kv cache. 
+ trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, +) +if not tokenizer.eos_token: + if tokenizer.bos_token: + tokenizer.eos_token = tokenizer.bos_token + print("bos_token used as eos_token") + else: + raise ValueError("No eos_token or bos_token found") + +tokenizer.pad_token = tokenizer.eos_token + +# Generation +generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) + +# mp/sq/woq/bitsandbytes config setting +quantization_config = None +if args.mixed_precision: + quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 +elif args.sq: + excluded_precisions = ["bf16"] + quantization_config = SmoothQuantConfig( + tokenizer=tokenizer, + seq_len=args.seq_len, + n_samples=args.calib_n_samples, + batch_size=args.calib_batch_size, + excluded_precisions=excluded_precisions, + alpha=args.alpha if args.alpha == "auto" else float(args.alpha), + scale_sharing=args.scale_sharing, + init_alpha=args.init_alpha, + alpha_min=args.alpha_min, + alpha_max=args.alpha_max, + alpha_step=args.alpha_step, + shared_criterion=args.shared_criterion, + do_blockwise=args.do_blockwise, + shuffle=args.shuffle, + padding=args.padding, + num_beams=generate_kwargs["num_beams"], + ) + +# get optimized model +if quantization_config is not None: + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + quantization_config=quantization_config, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) + + +# save model +if args.output_dir is not None: + tokenizer.save_pretrained(args.output_dir) + if args.sq: + quantization_config.remove_redundant_parameters() + config.quantization_config = quantization_config + config.save_pretrained(args.output_dir) + user_model.save(args.output_dir) + user_model = AutoModelForCausalLM.from_pretrained( + args.output_dir, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) + elif args.mixed_precision: + user_model.save_pretrained(args.output_dir) + +if args.restore_sq_model_from_json: + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( + recover_model_from_json, + ) + user_model = recover_model_from_json( + args.model, + os.path.join(args.output_dir, "qconfig.json"), + args.trust_remote_code, + ) + +elif not (args.sq or args.mixed_precision): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) + +if args.benchmark: + model_config = user_model.config + num_iter = args.benchmark_iters + num_warmup = args.num_warmup + + total_latency = 0 + for j in range(args.max_new_tokens): + total_time = 0.0 + with torch.inference_mode(), torch.no_grad(): + for i in range(num_iter): + tic = time.time() + if j == 0: + input_ids = torch.randint( + 1, + tokenizer.vocab_size, + size=(args.benchmark_batch_size, args.prompt_size), + ) + input_bs, input_len = input_ids.shape + attention_mask = torch.ones(input_bs, input_len) + position_ids = ( + torch.arange(input_len).unsqueeze(0).expand(input_bs, -1) + ) + if model_config.model_type in IPEX_OPT_LLM_SUPPORTED: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model_config, input_bs=input_bs, num_beams=1 + ) + else: + past_key_values = generate_dummy_past_key_values( + config=model_config, input_bs=input_bs + ) + + inp = { + "input_ids": input_ids, + "past_key_values": past_key_values, + "attention_mask": attention_mask, + "position_ids": position_ids, + } + out = user_model(**inp) + 
gen_id = torch.argmax(out[0][:, -1:, :], axis=-1) + gen_text = tokenizer.batch_decode(gen_id, skip_special_tokens=True) + toc = time.time() + if i >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print("Generated token index:", j + 1) + latency = total_time / (num_iter - num_warmup) + print("Inference latency: %.5f sec." % latency) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/sec".format(throughput)) + + input_ids = gen_id + past_key_values = out[1] + attention_mask = torch.ones( + (attention_mask.shape[0], attention_mask.shape[1] + 1) + ) + position_ids = torch.tensor([[len(inp["position_ids"])]]) + total_latency += latency + + average_latency = total_latency / args.max_new_tokens + print("Average inference latency: %.5f sec." % latency) + average_throughput = args.max_new_tokens / total_latency + print("Average throughput: {} samples/sec".format(throughput)) + + +if args.accuracy: + from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate + + results = evaluate( + model=user_model, + tokenizer=tokenizer, + tasks=args.tasks, + batch_size=args.batch_size, + args=args, + ) + print(results) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh index 0a793301dbb..2f4ad9d0c44 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh @@ -63,7 +63,7 @@ function run_tuning { alpha=0.5 fi - if [ ${script} = "run_generation.py" ];then + if [ ${script} = "run_generation_sq.py" ];then python ./${script} \ --model ${model_name_or_path} \ --output_dir ${tuned_checkpoint} \ diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index cd59d9c4086..087980de926 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -1,8 +1,5 @@ import argparse -import os -import re import time -import json import torch from transformers import AutoConfig, AutoTokenizer from intel_extension_for_transformers.transformers import ( diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py index d54dd5f127f..caee47c9461 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py @@ -28,22 +28,17 @@ "--num_beams", default=1, type=int, help="number of beams" ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--int8", action="store_true") -parser.add_argument( - "--int8_bf16_mixed", - action="store_true", - help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", -) -parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--benchmark_batch_size", default=1, type=int, + help="batch size num.") parser.add_argument("--do_profiling", 
action="store_true") parser.add_argument("--profile_token_latency", action="store_true") -parser.add_argument("--iters", default=10, type=int, help="num iter") +parser.add_argument("--benchmark_iters", default=10, type=int, help="num iter") parser.add_argument("--num_warmup", default=3, type=int, help="num warmup") # ============Accuracy configs============== parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--batch_size", default=1, type=int, +parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num.") parser.add_argument("--save_accuracy_path", default=None, help="Save accuracy results path.") @@ -60,12 +55,12 @@ "int4_fullrange", ] ) +parser.add_argument("--batch_size", default=8, type=int, + help="calibration batch size num.") parser.add_argument("--group_size", type=int, default=128) parser.add_argument("--scheme", default="sym") -parser.add_argument("--woq_enable_mse_search", action="store_true") parser.add_argument("--device", default="xpu") parser.add_argument("--compute_dtype", default="fp16") -parser.add_argument("--calib_iters", default=200, type=int, help="Calibration iters.") parser.add_argument("--load_in_4bit", type=bool, default=False) parser.add_argument("--load_in_8bit", type=bool, default=False) # ============GPTQ configs============== @@ -87,10 +82,10 @@ help="Block size. sub weight matrix size to run GPTQ.", ) parser.add_argument( - "--nsamples", type=int, default=512, help="Number of calibration data samples." + "--n_samples", type=int, default=512, help="Number of calibration data samples." ) parser.add_argument( - "--max_input_length", + "--seq_len", type=int, default=2048, help="Calibration dataset sequence max length, this should align with your model config", @@ -102,7 +97,7 @@ ) # ============AutoRound================== parser.add_argument( - "--calib_len", + "--autoround_iters", default=2048, type=int, help="Calibration dataset max or padding max length for AutoRound.", @@ -119,11 +114,17 @@ default=None, help="minmax learning rate, if None,it will beset to be the same with lr", ) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") parser.add_argument( - "--use_quant_input", + "--disable_quanted_input", action="store_true", help="whether to use the output of quantized block to tune the next block", ) +parser.add_argument( + "--quant_lm_head", + action="store_true", + help="whether to quant the lm head layer", +) # ======================================= args = parser.parse_args() torch_dtype = convert_dtype_str2torch(args.compute_dtype) @@ -155,14 +156,14 @@ damp_percent=args.damp_percent, sym=True if args.scheme == "sym" else False, blocksize=args.blocksize, - nsamples=args.nsamples, + n_samples=args.n_samples, static_groups=args.static_groups, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, compute_dtype=args.compute_dtype, scale_dtype=args.compute_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, + batch_size=args.batch_size, ) elif args.woq_algo.lower() == "autoround": quantization_config = AutoRoundConfig( @@ -171,16 +172,17 @@ bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, compute_dtype=args.compute_dtype, scale_dtype=args.compute_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, - calib_len=args.calib_len, - nsamples=args.nsamples, + 
iters=args.autoround_iters, + seq_len=args.seq_len, + n_samples=args.n_samples, lr=args.lr, minmax_lr=args.minmax_lr, - use_quant_input=args.use_quant_input, + disable_quanted_input=args.disable_quanted_input, + quant_lm_head = args.quant_lm_head, ) elif args.woq_algo.lower() == "rtn": quantization_config = RtnConfig( @@ -237,9 +239,9 @@ else: print("Disabled optimization with IPEX...") # start - num_iter = args.iters + num_iter = args.benchmark_iters num_warmup = args.num_warmup - prompt = [prompt] * args.batch_size + prompt = [prompt] * args.benchmark_batch_size amp_enabled = True amp_dtype = torch_dtype @@ -336,7 +338,7 @@ user_model = user_model, tasks = args.tasks, device = args.device, - batch_size = args.batch_size) + batch_size = args.eval_batch_size) results = evaluate(args) for task_name in args.tasks.split(","): if task_name == "wikitext": diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index fd727af4d53..27057bd3e87 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -1,11 +1,8 @@ import argparse -import json import os -import re import time import torch -from optimum.intel.generation.modeling import TSModelForCausalLM from transformers import AutoConfig, AutoTokenizer from transformers.utils import check_min_version @@ -15,7 +12,6 @@ MixedPrecisionConfig, SmoothQuantConfig, ) -from intel_extension_for_transformers.transformers.utils import str2bool parser = argparse.ArgumentParser() parser.add_argument("--model", default=None) diff --git a/intel_extension_for_transformers/tools/utils.py b/intel_extension_for_transformers/tools/utils.py index 5bccdcd0e70..e0f6af12769 100644 --- a/intel_extension_for_transformers/tools/utils.py +++ b/intel_extension_for_transformers/tools/utils.py @@ -60,6 +60,17 @@ def is_ipex_available(): def is_autoround_available(): return _autoround_available +_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None +_neural_compressor_version = "N/A" + +if _neural_compressor_available: + try: + _neural_compressor_version = importlib_metadata.version("neural_compressor") + except importlib_metadata.PackageNotFoundError: + _neural_compressor_available = False +def is_neural_compressor_avaliable(): + return _neural_compressor_available + def get_device_type(): if torch.cuda.is_available(): device = "cuda" diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 6251b308be2..626fccb0fd4 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -67,11 +67,12 @@ convert_to_smoothquant_model, replace_linear, ) -from ...tools.utils import is_intel_gpu_available, is_ipex_available +from ...tools.utils import is_intel_gpu_available, is_ipex_available, _neural_compressor_version from accelerate import init_empty_weights from huggingface_hub import hf_hub_download from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.model.torch_model import PyTorchFXModel +from packaging import version from threading import Thread from transformers.configuration_utils import PretrainedConfig from transformers import AutoConfig @@ -799,6 
+800,9 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") + assert ( + version.parse(_neural_compressor_version) > version.parse("2.6") + ), "Please use neural_compressor version > 2.6." if use_neural_speed: if not isinstance(quantization_config, RtnConfig): logger.error("Only Supports RTN Quantization in Neural Speed.") @@ -907,6 +911,9 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: assert ( ipex.__version__ >= "2.2.0+cpu" ), "Please use Intel Extension for PyTorch >=2.2.0+cpu." + assert ( + version.parse(_neural_compressor_version) > version.parse("2.6") + ), "Please use neural_compressor version > 2.6." config.torchscript = True config.use_cache = True model = cls.ORIG_MODEL.from_pretrained( diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index 4eaba5a00fe..417c994107f 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -30,4 +30,4 @@ GPTQConfig, AutoRoundConfig ) -from .utility import LazyImport, logger, str2bool, CpuInfo +from .utility import LazyImport, logger, CpuInfo diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 527d8b097ff..8738e3df791 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -40,16 +40,6 @@ import intel_extension_for_pytorch as ipex torch = LazyImport("torch") -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - def distributed_init( backend="gloo", world_size=1, diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index 264d924efad..9e27d3a0d93 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -342,21 +342,20 @@ def test_quantization_for_llm(self): output = woq_model(dummy_input) self.assertTrue(isclose(float(output[0][0][0][0]), 0.20071472227573395 , rel_tol=1e-04)) - # # TEQ - # need INC fix. - # woq_config = TeqConfig(bits=4, - # n_samples=5, - # batch_size=1, - # seq_len=512, - # tokenizer=tokenizer - # ) - # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=woq_config, - # use_neural_speed=False - # ) - # woq_model.eval() - # output = woq_model(dummy_input) - + # TEQ + woq_config = TeqConfig(bits=4, + n_samples=5, + batch_size=1, + seq_len=512, + tokenizer=tokenizer + ) + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=woq_config, + use_neural_speed=False + ) + woq_model.eval() + output = woq_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047 , rel_tol=1e-04)) # fp8 woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0")
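
Taken together, the SmoothQuant path introduced by `run_generation_sq.py` reduces to quantize, save, and (optionally) restore from the recorded JSON recipe. Below is a compact sketch of that flow using only the calls that appear in the patch (`SmoothQuantConfig`, `user_model.save`, `recover_model_from_json`); it shows a subset of the script's keyword arguments, and the model name, alpha, calibration sizes, and the `qconfig.json` filename are taken from the patch as illustrative values rather than a verified recipe.

```python
import os
from transformers import AutoConfig, AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    SmoothQuantConfig,
)
from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import (
    recover_model_from_json,
)

model_name = "bigcode/starcoder"   # README's example model
output_dir = "./saved_results"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# torchscript=True mirrors what the script sets when --sq or --benchmark is given.
config = AutoConfig.from_pretrained(
    model_name, torchscript=True, use_cache=True, trust_remote_code=True
)

# Calibration settings mirror the new CLI flags:
# --alpha, --calib_n_samples, --seq_len, --calib_batch_size.
sq_config = SmoothQuantConfig(
    tokenizer=tokenizer,
    alpha=0.7,
    n_samples=100,
    seq_len=512,
    batch_size=1,
    excluded_precisions=["bf16"],
)

# Quantize during loading.
user_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=sq_config,
    trust_remote_code=True,
)

# Save the quantized model plus its (pruned) quantization config, as the script does.
sq_config.remove_redundant_parameters()
config.quantization_config = sq_config
config.save_pretrained(output_dir)
user_model.save(output_dir)
tokenizer.save_pretrained(output_dir)

# Restore later without re-calibrating, as --restore_sq_model_from_json does.
# Arguments are positional, in the same order the script uses:
# (model id, path to the saved JSON recipe, trust_remote_code).
restored_model = recover_model_from_json(
    model_name,
    os.path.join(output_dir, "qconfig.json"),
    True,
)
```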