From 745c3f24d83454b67d95212eb3b48a8d0bf61547 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 11 Jul 2024 00:27:20 -0700 Subject: [PATCH 1/7] improve text-generation gpu example Signed-off-by: changwangss --- .../quantization/run_generation_gpu_woq.py | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py index d54dd5f127f..f41f05df7c2 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py @@ -28,22 +28,17 @@ "--num_beams", default=1, type=int, help="number of beams" ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--int8", action="store_true") -parser.add_argument( - "--int8_bf16_mixed", - action="store_true", - help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", -) -parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--eval_batch_size", default=1, type=int, + help="batch size num.") parser.add_argument("--do_profiling", action="store_true") parser.add_argument("--profile_token_latency", action="store_true") -parser.add_argument("--iters", default=10, type=int, help="num iter") +parser.add_argument("--benchmark_iters", default=10, type=int, help="num iter") parser.add_argument("--num_warmup", default=3, type=int, help="num warmup") # ============Accuracy configs============== parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--batch_size", default=1, type=int, +parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num.") parser.add_argument("--save_accuracy_path", default=None, help="Save accuracy results path.") @@ -60,12 +55,12 @@ "int4_fullrange", ] ) +parser.add_argument("--batch_size", default=8, type=int, + help="calibration batch size num.") parser.add_argument("--group_size", type=int, default=128) parser.add_argument("--scheme", default="sym") -parser.add_argument("--woq_enable_mse_search", action="store_true") parser.add_argument("--device", default="xpu") parser.add_argument("--compute_dtype", default="fp16") -parser.add_argument("--calib_iters", default=200, type=int, help="Calibration iters.") parser.add_argument("--load_in_4bit", type=bool, default=False) parser.add_argument("--load_in_8bit", type=bool, default=False) # ============GPTQ configs============== @@ -87,10 +82,10 @@ help="Block size. sub weight matrix size to run GPTQ.", ) parser.add_argument( - "--nsamples", type=int, default=512, help="Number of calibration data samples." + "--n_samples", type=int, default=512, help="Number of calibration data samples." 
) parser.add_argument( - "--max_input_length", + "--seq_len", type=int, default=2048, help="Calibration dataset sequence max length, this should align with your model config", @@ -102,7 +97,7 @@ ) # ============AutoRound================== parser.add_argument( - "--calib_len", + "--autoround_iters", default=2048, type=int, help="Calibration dataset max or padding max length for AutoRound.", @@ -119,11 +114,17 @@ default=None, help="minmax learning rate, if None,it will beset to be the same with lr", ) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") parser.add_argument( - "--use_quant_input", + "--disable_quanted_input", action="store_true", help="whether to use the output of quantized block to tune the next block", ) +parser.add_argument( + "--quant_lm_head", + action="store_true", + help="whether to quant the lm head layer", +) # ======================================= args = parser.parse_args() torch_dtype = convert_dtype_str2torch(args.compute_dtype) @@ -155,14 +156,14 @@ damp_percent=args.damp_percent, sym=True if args.scheme == "sym" else False, blocksize=args.blocksize, - nsamples=args.nsamples, + n_samples=args.n_samples, static_groups=args.static_groups, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, compute_dtype=args.compute_dtype, scale_dtype=args.compute_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, + batch_size=args.batch_size, ) elif args.woq_algo.lower() == "autoround": quantization_config = AutoRoundConfig( @@ -171,16 +172,17 @@ bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, compute_dtype=args.compute_dtype, scale_dtype=args.compute_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, - calib_len=args.calib_len, - nsamples=args.nsamples, + iters=args.autoround_iters, + seq_len=args.seq_len, + n_samples=args.n_samples, lr=args.lr, minmax_lr=args.minmax_lr, - use_quant_input=args.use_quant_input, + disable_quanted_input=args.disable_quanted_input, + quant_lm_head = args.quant_lm_head, ) elif args.woq_algo.lower() == "rtn": quantization_config = RtnConfig( @@ -237,9 +239,9 @@ else: print("Disabled optimization with IPEX...") # start - num_iter = args.iters + num_iter = args.benchmark_iters num_warmup = args.num_warmup - prompt = [prompt] * args.batch_size + prompt = [prompt] * args.benchmark_batch_size amp_enabled = True amp_dtype = torch_dtype @@ -336,7 +338,7 @@ user_model = user_model, tasks = args.tasks, device = args.device, - batch_size = args.batch_size) + batch_size = args.eval_batch_size) results = evaluate(args) for task_name in args.tasks.split(","): if task_name == "wikitext": From 83f086b4225d3c7ee929970cae23e125f911a462 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 11 Jul 2024 02:19:45 -0700 Subject: [PATCH 2/7] improve code-generation Signed-off-by: changwangss --- .../quantization/run_generation.py | 201 +++++++++++------- 1 file changed, 121 insertions(+), 80 deletions(-) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py index 9f37784eea2..95d061414ed 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py @@ -41,23 +41,22 @@ "--max_new_tokens", default=32, type=int, 
help="output max new tokens" ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--calib_iters", default=500, type=int, help="calibration iters.") -parser.add_argument("--int8", action="store_true") +parser.add_argument("--n_samples", default=500, type=int, help="calibration iters.") parser.add_argument( - "--int8_bf16_mixed", + "--restore_sq_model_from_json", action="store_true", - help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", + help="restore ipex quantized model from output_dir/best_configure.json", ) # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") -parser.add_argument("--iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") parser.add_argument( "--prompt_size", default=32, type=int, help="generate dummy input_ids size" ) # ============Accuracy configs============== parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--batch_size", default=56, type=int, help="batch size num.") +parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num.") parser.add_argument( "--save_accuracy_path", default=None, help="Save accuracy results path." ) @@ -65,31 +64,44 @@ parser.add_argument("--mixed_precision", action="store_true") # ============SmoothQuant configs============== parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") parser.add_argument( - "--calib_padding", action="store_true", help="Calibration dataset do padding." + "--n_samples", default=100, type=int, help="Smooth quant calibration samples." ) parser.add_argument( - "--calib_shuffle", - default=True, - type=str2bool, - help="Calibration dataset do shuffle.", + "--seq_len", default=512, type=int, help="Smooth quant calibration input length." ) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") +parser.add_argument("--padding", action="store_true") +parser.add_argument("--shuffle", action="store_true") +# sq alpha "auto" parameters +parser.add_argument("--scale_sharing", action="store_true") parser.add_argument( - "--calib_pad_val", default=1, type=int, help="Calibration dataset padding value." + "--init_alpha", default=0.5, type=float, help="Smooth quant parameter." ) parser.add_argument( - "--calib_len", - default=512, - type=int, - help="Calibration dataset max or padding max length.", + "--alpha_min", default=0.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_max", default=1.0, type=float, help="Smooth quant parameter." ) parser.add_argument( - "--recipes", type=str, help="A dictionary as a string, recipes for smoothquant." + "--alpha_step", default=0.1, type=float, help="Smooth quant parameter." 
) -parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.") +parser.add_argument("--shared_criterion", default="max", type=str) +parser.add_argument("--do_blockwise", action="store_true") # ============BitsAndBytes configs============== parser.add_argument("--bitsandbytes", action="store_true") # ============WeightOnlyQuant configs=============== +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num for evaluation.") +parser.add_argument( + "--tasks", + default="lambada_openai", + type=str, + help="tasks list for accuracy validation", +) +# ============WeightOnlyQuant configs=============== parser.add_argument("--woq", action="store_true") parser.add_argument( "--woq_algo", @@ -124,7 +136,7 @@ "--scale_dtype", type=str, default="fp32", - choices=["fp32", "fp8"], + choices=["fp32", "bf16", "fp8"], ) parser.add_argument( "--compute_dtype", @@ -132,15 +144,28 @@ default="fp32", choices=["fp32", "bf16", "int8"], ) -parser.add_argument("--group_size", type=int, default=32) -parser.add_argument("--scheme", default="sym") -parser.add_argument("--load_in_4bit", action="store_true") -parser.add_argument("--load_in_8bit", action="store_true") +parser.add_argument("--group_size", type=int, default=128) +parser.add_argument("--scheme", default=None) parser.add_argument( "--layer_wise", action="store_true", help="Use layer wise to do quantization", ) +parser.add_argument( + "--n_samples", type=int, default=512, help="Number of calibration data samples." +) +parser.add_argument( + "--seq_len", + type=int, + default=2048, + help="Calibration dataset sequence max length, this should align with your model config", +) +parser.add_argument( + "--batch_size", + type=int, + default=8, + help="Calibration batchsize.", +) # ============GPTQ configs============== parser.add_argument( "--desc_act", @@ -153,21 +178,17 @@ default=0.01, help="Percent of the average Hessian diagonal to use for dampening.", ) +parser.add_argument( + "--true_sequential", + action="store_true", + help="Whether to quantize layers within a transformer block in their original order.", +) parser.add_argument( "--blocksize", type=int, default=128, help="Block size. sub weight matrix size to run GPTQ.", ) -parser.add_argument( - "--nsamples", type=int, default=128, help="Number of calibration data samples." 
-) -parser.add_argument( - "--max_input_length", - type=int, - default=2048, - help="Calibration dataset sequence max length, this should align with your model config", -) parser.add_argument( "--static_groups", action="store_true", @@ -177,20 +198,26 @@ parser.add_argument( "--lr", type=float, - default=0.0025, + default=None, help="learning rate, if None, it will be set to 1.0/iters automatically", ) parser.add_argument( "--minmax_lr", type=float, - default=0.0025, + default=None, help="minmax learning rate, if None,it will beset to be the same with lr", ) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") parser.add_argument( - "--use_quant_input", + "--disable_quanted_input", action="store_true", help="whether to use the output of quantized block to tune the next block", ) +parser.add_argument( + "--quant_lm_head", + action="store_true", + help="whether to quant the lm head layer", +) # ============Harness configs============ parser.add_argument("--tasks", default=None, help="Evaluation tasks") parser.add_argument( @@ -293,34 +320,36 @@ tokenizer.pad_token = tokenizer.eos_token +# Generation +generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) -op_type_dict = { - "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, -} -recipes = { - "smooth_quant": True, - "smooth_quant_args": { - "alpha": args.alpha if args.alpha == "auto" else float(args.alpha) - }, -} -excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] # mp/sq/woq/bitsandbytes config setting quantization_config = None if args.mixed_precision: quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 elif args.sq: + excluded_precisions = ["bf16"] quantization_config = SmoothQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - recipes=recipes, - op_type_dict=op_type_dict, # default is {} - excluded_precisions=excluded_precisions, # default is [] - calib_dataset=args.dataset, - calib_iters=args.calib_iters, + tokenizer=tokenizer, + seq_len=args.seq_len, + n_samples=args.n_samples, + batch_size=args.batch_size, + excluded_precisions=excluded_precisions, + alpha=args.alpha if args.alpha == "auto" else float(args.alpha), + scale_sharing=args.scale_sharing, + init_alpha=args.init_alpha, + alpha_min=args.alpha_min, + alpha_max=args.alpha_max, + alpha_step=args.alpha_step, + shared_criterion=args.shared_criterion, + do_blockwise=args.do_blockwise, + shuffle=args.shuffle, + padding=args.padding, + num_beams=generate_kwargs["num_beams"], ) elif args.woq: if args.woq_algo == "Rtn": quantization_config = RtnConfig( - tokenizer=tokenizer, bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, @@ -328,6 +357,7 @@ scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, layer_wise=args.layer_wise, + use_ipex=args.use_ipex, ) elif args.woq_algo == "Awq": quantization_config = AwqConfig( @@ -336,11 +366,13 @@ bits=args.bits, zero_point=False if args.scheme == "sym" else True, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, + n_samples=args.n_samples, + batch_size=args.batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, + use_ipex=args.use_ipex, ) elif args.woq_algo == "Teq": quantization_config = TeqConfig( @@ -349,11 +381,13 @@ bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, - 
max_input_length=args.max_input_length, + seq_len=args.seq_len, + batch_size=args.batch_size, + n_samples=args.n_samples, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, + use_ipex=args.use_ipex, ) elif args.woq_algo == "GPTQ": quantization_config = GPTQConfig( @@ -364,15 +398,17 @@ damp_percent=args.damp_percent, sym=True if args.scheme == "sym" else False, blocksize=args.blocksize, - nsamples=args.nsamples, static_groups=args.static_groups, group_size=args.group_size, - max_input_length=args.max_input_length, + n_samples=args.n_samples, + seq_len=args.seq_len, + batch_size=args.batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, layer_wise=args.layer_wise, + true_sequential=args.true_sequential, + use_ipex=args.use_ipex, ) elif args.woq_algo == "AutoRound": quantization_config = AutoRoundConfig( @@ -380,16 +416,18 @@ dataset=args.dataset, bits=args.bits, sym=True if args.scheme == "sym" else False, - nsamples=args.nsamples, + n_samples=args.n_samples, group_size=args.group_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, - calib_len=args.calib_len, + iters=args.autoround_iters, + seq_len=args.seq_len, lr=args.lr, minmax_lr=args.minmax_lr, - use_quant_input=args.use_quant_input, + disable_quanted_input=args.disable_quanted_input, + quant_lm_head = args.quant_lm_head, + use_ipex=args.use_ipex, ) else: assert False, "Please set the correct '--woq_algo'" @@ -432,28 +470,31 @@ if args.output_dir is not None: tokenizer.save_pretrained(args.output_dir) if args.sq: + quantization_config.remove_redundant_parameters() + config.quantization_config = quantization_config config.save_pretrained(args.output_dir) user_model.save(args.output_dir) - elif args.mixed_precision or args.woq: - # user_model will be changed. - user_model.save_pretrained(args.output_dir) - # loading saved woq model user_model = AutoModelForCausalLM.from_pretrained( - args.output_dir, + args.output_dir, trust_remote_code=args.trust_remote_code, - use_neural_speed=args.use_neural_speed - ) + _commit_hash=args._commit_hash, + ) + elif args.mixed_precision: + user_model.save_pretrained(args.output_dir) -if args.int8 or args.int8_bf16_mixed: - # TorchScript model don't attribute generate method, the wrapper is provided. 
- import intel_extension_for_pytorch as ipex - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, +if args.restore_sq_model_from_json: + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( + recover_model_from_json, + ) + user_model = recover_model_from_json( + args.model, + os.path.join(args.output_dir, "qconfig.json"), + args.trust_remote_code, ) - user_model = TSModelCausalLMForITREX.from_pretrained( - args.output_dir, - file_name="best_model.pt", +elif not (args.sq or args.mixed_precision or args.woq): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, ) @@ -472,7 +513,7 @@ if hasattr(normalized_config, "multi_query_group_num"): num_key_value_heads = normalized_config.multi_query_group_num - num_iter = args.iters + num_iter = args.benchmark_iters num_warmup = args.num_warmup total_latency = 0 @@ -567,7 +608,7 @@ model=user_model, tokenizer=tokenizer, tasks=args.tasks, - batch_size=args.batch_size, + batch_size=args.eval_batch_size, args=args, ) print(results) From 307d34e922dd429d620974d488a57d59419fd91f Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 11 Jul 2024 21:31:27 -0700 Subject: [PATCH 3/7] add inc version limit Signed-off-by: changwangss --- .../code-generation/quantization/README.md | 26 +- .../quantization/run_benchmark.sh | 4 +- ...eneration.py => run_generation_cpu_woq.py} | 144 +------ .../quantization/run_generation_sq.py | 357 ++++++++++++++++++ .../quantization/run_tuning.sh | 2 +- .../quantization/run_generation_cpu_woq.py | 3 - .../quantization/run_generation_sq.py | 4 - .../tools/utils.py | 11 + .../transformers/modeling/modeling_auto.py | 9 +- .../transformers/utils/utility.py | 10 - 10 files changed, 409 insertions(+), 161 deletions(-) rename examples/huggingface/pytorch/code-generation/quantization/{run_generation.py => run_generation_cpu_woq.py} (77%) create mode 100644 examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py diff --git a/examples/huggingface/pytorch/code-generation/quantization/README.md b/examples/huggingface/pytorch/code-generation/quantization/README.md index 92c5ccd1619..55dbb3dbb4a 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/README.md +++ b/examples/huggingface/pytorch/code-generation/quantization/README.md @@ -30,42 +30,42 @@ export KMP_AFFINITY=granularity=fine,compact,1,0 export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so # fp32 -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model bigcode/starcoder \ --benchmark \ --batch_size 1 # mixedprecision -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model bigcode/starcoder \ --mixed_precision \ --benchmark \ --batch_size 1 # smoothquant # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision. 
-python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --output_dir "./saved_results" \ --sq \ --alpha 0.7 \ - --calib_iters 500 \ + --calib_n_samples 500 \ --dataset "mbpp" --int8 \ --benchmark \ --batch_size 1 # weightonlyquant -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --woq \ --benchmark \ --batch_size 1 # load_in_4bit -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --load_in_4bit \ --benchmark \ --batch_size 1 # load_in_8bit -OMP_NUM_THREADS= numactl -m -C python run_generation.py \ +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --load_in_8bit \ --benchmark \ @@ -75,7 +75,7 @@ OMP_NUM_THREADS= numactl -m -C python ru ```bash # fp32 -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --accuracy \ --batch_size 20 \ @@ -85,7 +85,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # mixedprecision -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --mixed_precision \ --accuracy \ @@ -97,7 +97,7 @@ python run_generation.py \ --tasks "humaneval" # smoothquant # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision. -python run_generation.py \ +python run_generation_sq.py \ --model bigcode/starcoder \ --sq \ --alpha 1.0 \ @@ -110,7 +110,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # weightonlyquant -python run_generation.py \ +python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --woq \ --woq_weight_dtype "nf4" \ @@ -122,7 +122,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # load_in_4bit -python run_generation.py \ +python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --load_in_4bit \ --accuracy \ @@ -133,7 +133,7 @@ python run_generation.py \ --do_sample \ --tasks "humaneval" # load_in_8bit -python run_generation.py \ +python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --load_in_8bit \ --accuracy \ diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh index 7100da888ca..060a6bcdd9c 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/code-generation/quantization/run_benchmark.sh @@ -14,7 +14,7 @@ function init_params { batch_size=1 tuned_checkpoint=saved_results lm_eval_tasks="humaneval" - script="run_generation.py" + script="run_generation_sq.py" for var in "$@" do case $var in @@ -85,7 +85,7 @@ function run_benchmark { if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + model_name_or_path=$tuned_checkpoint fi echo $extra_cmd diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py similarity index 77% rename from examples/huggingface/pytorch/code-generation/quantization/run_generation.py rename to examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py index 95d061414ed..7d2c8d23139 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py 
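With the `run_benchmark.sh` change above, an INT8 benchmark run no longer passes `--int8`; it simply points the script at the saved SmoothQuant checkpoint. A minimal sketch of the resulting two-step flow, in the README's own style (thread count, NUMA node and checkpoint path are placeholders, not values taken from the patch):

```bash
# Quantize once, then benchmark the saved checkpoint directly.
python run_generation_sq.py \
    --model bigcode/starcoder \
    --output_dir "./saved_results" \
    --sq \
    --alpha 0.7 \
    --calib_n_samples 500 \
    --dataset "mbpp"
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_generation_sq.py \
    --model ./saved_results \
    --benchmark \
    --batch_size 1
```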
@@ -1,19 +1,10 @@ import argparse -import re import time -import json import os -import pathlib import torch -import types -import numpy as np -from itertools import chain -from pathlib import Path from transformers import AutoTokenizer, AutoConfig from optimum.utils import NormalizedConfigManager from intel_extension_for_transformers.transformers import ( - MixedPrecisionConfig, - SmoothQuantConfig, BitsAndBytesConfig, RtnConfig, AwqConfig, @@ -24,7 +15,6 @@ from intel_extension_for_transformers.transformers import ( AutoModelForCausalLM, ) -from intel_extension_for_transformers.transformers.utils import str2bool parser = argparse.ArgumentParser() @@ -41,12 +31,6 @@ "--max_new_tokens", default=32, type=int, help="output max new tokens" ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--n_samples", default=500, type=int, help="calibration iters.") -parser.add_argument( - "--restore_sq_model_from_json", - action="store_true", - help="restore ipex quantized model from output_dir/best_configure.json", -) # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") @@ -54,54 +38,9 @@ parser.add_argument( "--prompt_size", default=32, type=int, help="generate dummy input_ids size" ) -# ============Accuracy configs============== -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num.") -parser.add_argument( - "--save_accuracy_path", default=None, help="Save accuracy results path." -) -# ============MixedPrecision configs============== -parser.add_argument("--mixed_precision", action="store_true") -# ============SmoothQuant configs============== -parser.add_argument("--sq", action="store_true") -parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") -parser.add_argument( - "--n_samples", default=100, type=int, help="Smooth quant calibration samples." -) -parser.add_argument( - "--seq_len", default=512, type=int, help="Smooth quant calibration input length." -) -parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") -parser.add_argument("--padding", action="store_true") -parser.add_argument("--shuffle", action="store_true") -# sq alpha "auto" parameters -parser.add_argument("--scale_sharing", action="store_true") -parser.add_argument( - "--init_alpha", default=0.5, type=float, help="Smooth quant parameter." -) -parser.add_argument( - "--alpha_min", default=0.0, type=float, help="Smooth quant parameter." -) -parser.add_argument( - "--alpha_max", default=1.0, type=float, help="Smooth quant parameter." -) -parser.add_argument( - "--alpha_step", default=0.1, type=float, help="Smooth quant parameter." 
-) -parser.add_argument("--shared_criterion", default="max", type=str) -parser.add_argument("--do_blockwise", action="store_true") # ============BitsAndBytes configs============== parser.add_argument("--bitsandbytes", action="store_true") # ============WeightOnlyQuant configs=============== -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num for evaluation.") -parser.add_argument( - "--tasks", - default="lambada_openai", - type=str, - help="tasks list for accuracy validation", -) -# ============WeightOnlyQuant configs=============== parser.add_argument("--woq", action="store_true") parser.add_argument( "--woq_algo", @@ -277,6 +216,7 @@ help="Path of additional data to load for the tasks", ) # ============Evaluation configs============== +parser.add_argument("--accuracy", action="store_true") parser.add_argument("--prefix", default="") parser.add_argument("--do_sample", action="store_true") parser.add_argument("--temperature", default=0.2, type=float) @@ -321,33 +261,14 @@ tokenizer.pad_token = tokenizer.eos_token # Generation -generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) +if args.use_neural_speed: + generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1) +else: + generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) -# mp/sq/woq/bitsandbytes config setting +# woq/bitsandbytes config setting quantization_config = None -if args.mixed_precision: - quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 -elif args.sq: - excluded_precisions = ["bf16"] - quantization_config = SmoothQuantConfig( - tokenizer=tokenizer, - seq_len=args.seq_len, - n_samples=args.n_samples, - batch_size=args.batch_size, - excluded_precisions=excluded_precisions, - alpha=args.alpha if args.alpha == "auto" else float(args.alpha), - scale_sharing=args.scale_sharing, - init_alpha=args.init_alpha, - alpha_min=args.alpha_min, - alpha_max=args.alpha_max, - alpha_step=args.alpha_step, - shared_criterion=args.shared_criterion, - do_blockwise=args.do_blockwise, - shuffle=args.shuffle, - padding=args.padding, - num_beams=generate_kwargs["num_beams"], - ) -elif args.woq: +if args.woq: if args.woq_algo == "Rtn": quantization_config = RtnConfig( bits=args.bits, @@ -433,11 +354,12 @@ assert False, "Please set the correct '--woq_algo'" # bitsandbytes elif args.bitsandbytes: - # GPU device is need for `load_in_4bit` and `load_in_8bit`. quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", ) +else: + print("The quantization_config is None.") # get optimized model if quantization_config is not None: @@ -446,7 +368,7 @@ quantization_config=quantization_config, trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, - use_neural_speed=False, + use_neural_speed=args.use_neural_speed, ) elif args.load_in_4bit or args.load_in_8bit: # CPU device usage is provided by intel-extension-for-transformers. 
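After this hunk, the CPU script builds a weight-only config from the renamed arguments and hands it straight to `from_pretrained`. A minimal sketch of that flow under the API shown in this diff (checkpoint name and quantization values are illustrative only, not defaults from the script):

```python
# Sketch of the weight-only path in run_generation_cpu_woq.py after this patch.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    RtnConfig,
)

model_name = "bigcode/starcoder"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# RTN weight-only quantization; fields mirror the arguments wired up above.
quantization_config = RtnConfig(
    bits=4,
    group_size=128,
    compute_dtype="fp32",
    scale_dtype="fp32",
    weight_dtype="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    trust_remote_code=True,
)
model.save_pretrained("./saved_results")  # re-loadable for later accuracy runs
tokenizer.save_pretrained("./saved_results")
```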
@@ -455,49 +377,17 @@ load_in_4bit=args.load_in_4bit, load_in_8bit=args.load_in_8bit, _commit_hash=args._commit_hash, - use_neural_speed=False, - ) -elif not args.int8 and not args.int8_bf16_mixed: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - config=config, - trust_remote_code=args.trust_remote_code, - _commit_hash=args._commit_hash, - use_neural_speed=False, + use_neural_speed=args.use_neural_speed, ) +else: + print("Didn't do Weight Only Quantization.") # save model -if args.output_dir is not None: +if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): + user_model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) - if args.sq: - quantization_config.remove_redundant_parameters() - config.quantization_config = quantization_config - config.save_pretrained(args.output_dir) - user_model.save(args.output_dir) - user_model = AutoModelForCausalLM.from_pretrained( - args.output_dir, - trust_remote_code=args.trust_remote_code, - _commit_hash=args._commit_hash, - ) - elif args.mixed_precision: - user_model.save_pretrained(args.output_dir) - -if args.restore_sq_model_from_json: - from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( - recover_model_from_json, - ) - user_model = recover_model_from_json( - args.model, - os.path.join(args.output_dir, "qconfig.json"), - args.trust_remote_code, - ) - -elif not (args.sq or args.mixed_precision or args.woq): - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code, - _commit_hash=args._commit_hash, - ) + # to validate woq model accuracy + args.model = args.output_dir if args.benchmark: normalized_config = NormalizedConfigManager.get_normalized_config_class( diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py new file mode 100644 index 00000000000..b7802c01996 --- /dev/null +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py @@ -0,0 +1,357 @@ +import argparse +import time +import os +import torch +from transformers import AutoTokenizer, AutoConfig +from optimum.utils import NormalizedConfigManager +from intel_extension_for_transformers.transformers import ( + MixedPrecisionConfig, + SmoothQuantConfig, +) +from intel_extension_for_transformers.transformers import ( + AutoModelForCausalLM, +) + +parser = argparse.ArgumentParser() + +# ============Main configs============ +parser.add_argument( + "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase" +) +parser.add_argument("--trust_remote_code", action="store_true") +parser.add_argument("--_commit_hash", default=None, type=str) +parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp") +parser.add_argument("--dtype", type=str, default="int8") +parser.add_argument( + "--max_new_tokens", default=32, type=int, help="output max new tokens" +) +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +# ============Benchmark configs============== +parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") +parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") +parser.add_argument( + "--prompt_size", default=32, type=int, help="generate dummy input_ids size" +) +# ============MixedPrecision 
configs============== +parser.add_argument("--mixed_precision", action="store_true") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") +parser.add_argument( + "--calib_n_samples", default=100, type=int, help="Smooth quant calibration samples." +) +parser.add_argument( + "--seq_len", default=512, type=int, help="Smooth quant calibration input length." +) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") +parser.add_argument("--padding", action="store_true") +parser.add_argument("--shuffle", action="store_true") +# sq alpha "auto" parameters +parser.add_argument("--scale_sharing", action="store_true") +parser.add_argument( + "--init_alpha", default=0.5, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_min", default=0.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_max", default=1.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_step", default=0.1, type=float, help="Smooth quant parameter." +) +parser.add_argument("--shared_criterion", default="max", type=str) +parser.add_argument("--do_blockwise", action="store_true") +parser.add_argument( + "--restore_sq_model_from_json", + action="store_true", + help="restore ipex quantized model from output_dir/best_configure.json", +) +# ============Harness configs============ +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--tasks", default=None, help="Evaluation tasks") +parser.add_argument( + "--limit", default=None, type=int, help="Limit number of samples to eval" +) +parser.add_argument("--allow_code_execution", action="store_true") +parser.add_argument("--generation_only", action="store_true") +parser.add_argument("--postprocess", action="store_false") +parser.add_argument("--save_references", action="store_true") +parser.add_argument("--save_generations", action="store_true") +parser.add_argument("--instruction_tokens", default=None) +parser.add_argument("--save_generations_path", default="generations.json") +parser.add_argument("--load_generations_path", default=None) +parser.add_argument("--metric_output_path", default="evaluation_results.json") +parser.add_argument( + "--load_generations_intermediate_paths", + type=str, + nargs="*", + help="List of paths for saving the intermediate code generations", +) +# ============Generation config============ +parser.add_argument("--max_length_generation", default=512, type=int) +parser.add_argument("--check_references", action="store_true") +parser.add_argument("--max_memory_per_gpu", type=str, default=None) +parser.add_argument( + "--prompt", + type=str, + default="prompt", + help="Prompt type to use for generation in HumanEvalPack tasks", +) +parser.add_argument( + "--modeltype", + default="causal", + help="AutoModel to use, it can be causal or seq2seq", +) +parser.add_argument( + "--limit_start", + type=int, + default=0, + help="Optional offset to start from when limiting the number of samples", +) +parser.add_argument( + "--save_every_k_tasks", + type=int, + default=-1, + help="Optional saving after every k tasks", +) +parser.add_argument( + "--left_padding", + action="store_true", + help="Force left padding, needed for models like chatglm3-6b", +) +parser.add_argument( + "--load_data_path", + type=str, + default=None, + help="Path of additional data to load for the tasks", +) +# ============Evaluation 
configs============== +parser.add_argument("--prefix", default="") +parser.add_argument("--do_sample", action="store_true") +parser.add_argument("--temperature", default=0.2, type=float) +parser.add_argument("--top_p", default=0.95, type=float) +parser.add_argument("--top_k", default=0, type=int) +parser.add_argument("--n_samples", default=1, type=int) +parser.add_argument("--eos", default="<|endoftext|>", type=str) +parser.add_argument("--seed", default=0, type=int) +args = parser.parse_args() + + +tokenizer = AutoTokenizer.from_pretrained( + args.model, + truncation_side="left", + padding_side="right", + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, +) + +config = AutoConfig.from_pretrained( + args.model, + torchscript=( + True + if ( + args.sq + or args.woq_algo in ["Awq", "Teq"] + or (args.int8 or args.int8_bf16_mixed or args.benchmark) + ) + else False + ), # torchscript will force `return_dict=False` to avoid jit errors + use_cache=True, # to use kv cache. + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, +) +if not tokenizer.eos_token: + if tokenizer.bos_token: + tokenizer.eos_token = tokenizer.bos_token + print("bos_token used as eos_token") + else: + raise ValueError("No eos_token or bos_token found") + +tokenizer.pad_token = tokenizer.eos_token + +# Generation +generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) + +# mp/sq/woq/bitsandbytes config setting +quantization_config = None +if args.mixed_precision: + quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 +elif args.sq: + excluded_precisions = ["bf16"] + quantization_config = SmoothQuantConfig( + tokenizer=tokenizer, + seq_len=args.seq_len, + n_samples=args.calib_n_samples, + batch_size=args.batch_size, + excluded_precisions=excluded_precisions, + alpha=args.alpha if args.alpha == "auto" else float(args.alpha), + scale_sharing=args.scale_sharing, + init_alpha=args.init_alpha, + alpha_min=args.alpha_min, + alpha_max=args.alpha_max, + alpha_step=args.alpha_step, + shared_criterion=args.shared_criterion, + do_blockwise=args.do_blockwise, + shuffle=args.shuffle, + padding=args.padding, + num_beams=generate_kwargs["num_beams"], + ) + +# get optimized model +if quantization_config is not None: + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + quantization_config=quantization_config, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) + + +# save model +if args.output_dir is not None: + tokenizer.save_pretrained(args.output_dir) + if args.sq: + quantization_config.remove_redundant_parameters() + config.quantization_config = quantization_config + config.save_pretrained(args.output_dir) + user_model.save(args.output_dir) + user_model = AutoModelForCausalLM.from_pretrained( + args.output_dir, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) + elif args.mixed_precision: + user_model.save_pretrained(args.output_dir) + +if args.restore_sq_model_from_json: + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( + recover_model_from_json, + ) + user_model = recover_model_from_json( + args.model, + os.path.join(args.output_dir, "qconfig.json"), + args.trust_remote_code, + ) + +elif not (args.sq or args.mixed_precision): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) + +if args.benchmark: + normalized_config = 
NormalizedConfigManager.get_normalized_config_class( + user_model.config.model_type + )(user_model.config) + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_beams = 1 + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + num_iter = args.benchmark_iters + num_warmup = args.num_warmup + + total_latency = 0 + for j in range(args.max_new_tokens): + total_time = 0.0 + with torch.inference_mode(), torch.no_grad(): + for i in range(num_iter): + tic = time.time() + if j == 0: + input_ids = torch.randint( + 1, + tokenizer.vocab_size, + size=(args.batch_size, args.prompt_size), + ) + input_bs, input_len = input_ids.shape + attention_mask = torch.ones(input_bs, input_len) + position_ids = ( + torch.arange(input_len).unsqueeze(0).expand(input_bs, -1) + ) + if user_model.config.model_type == "gpt_bigcode": + new_shape = [input_bs, 0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = tuple([dummy_tensor] * num_layers) + else: + if not (args.int8 or args.int8_bf16_mixed): + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + past_key_values = tuple(past_key_values) + + else: + new_shape = [input_bs, num_key_value_heads, 1, d_k] + beam_idx_tmp = torch.zeros( + (2048, int(input_bs * num_beams)), dtype=torch.long + ).contiguous() + past_key_values = [ + ( + torch.zeros( + 1, 0, 0, 1, dtype=torch.long + ).contiguous(), + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + beam_idx_tmp, + ) + for _ in range(num_layers) + ] + past_key_values = tuple(past_key_values) + + inp = { + "input_ids": input_ids, + "past_key_values": past_key_values, + "attention_mask": attention_mask, + "position_ids": position_ids, + } + out = user_model(**inp) + gen_id = torch.argmax(out[0][:, -1:, :], axis=-1) + gen_text = tokenizer.batch_decode(gen_id, skip_special_tokens=True) + toc = time.time() + if i >= num_warmup: + total_time += toc - tic + + print("\n", "-" * 10, "Summary:", "-" * 10) + print("Generated token index:", j + 1) + latency = total_time / (num_iter - num_warmup) + print("Inference latency: %.5f sec." % latency) + throughput = (num_iter - num_warmup) / total_time + print("Throughput: {} samples/sec".format(throughput)) + + input_ids = gen_id + past_key_values = out[1] + attention_mask = torch.ones( + (attention_mask.shape[0], attention_mask.shape[1] + 1) + ) + position_ids = torch.tensor([[len(inp["position_ids"])]]) + total_latency += latency + + average_latency = total_latency / args.max_new_tokens + print("Average inference latency: %.5f sec." 
% latency) + average_throughput = args.max_new_tokens / total_latency + print("Average throughput: {} samples/sec".format(throughput)) + + +if args.accuracy: + from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate + + results = evaluate( + model=user_model, + tokenizer=tokenizer, + tasks=args.tasks, + batch_size=args.eval_batch_size, + args=args, + ) + print(results) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh index 0a793301dbb..2f4ad9d0c44 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh @@ -63,7 +63,7 @@ function run_tuning { alpha=0.5 fi - if [ ${script} = "run_generation.py" ];then + if [ ${script} = "run_generation_sq.py" ];then python ./${script} \ --model ${model_name_or_path} \ --output_dir ${tuned_checkpoint} \ diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index cd59d9c4086..087980de926 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -1,8 +1,5 @@ import argparse -import os -import re import time -import json import torch from transformers import AutoConfig, AutoTokenizer from intel_extension_for_transformers.transformers import ( diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index fd727af4d53..27057bd3e87 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -1,11 +1,8 @@ import argparse -import json import os -import re import time import torch -from optimum.intel.generation.modeling import TSModelForCausalLM from transformers import AutoConfig, AutoTokenizer from transformers.utils import check_min_version @@ -15,7 +12,6 @@ MixedPrecisionConfig, SmoothQuantConfig, ) -from intel_extension_for_transformers.transformers.utils import str2bool parser = argparse.ArgumentParser() parser.add_argument("--model", default=None) diff --git a/intel_extension_for_transformers/tools/utils.py b/intel_extension_for_transformers/tools/utils.py index 5bccdcd0e70..e0f6af12769 100644 --- a/intel_extension_for_transformers/tools/utils.py +++ b/intel_extension_for_transformers/tools/utils.py @@ -60,6 +60,17 @@ def is_ipex_available(): def is_autoround_available(): return _autoround_available +_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None +_neural_compressor_version = "N/A" + +if _neural_compressor_available: + try: + _neural_compressor_version = importlib_metadata.version("neural_compressor") + except importlib_metadata.PackageNotFoundError: + _neural_compressor_available = False +def is_neural_compressor_avaliable(): + return _neural_compressor_available + def get_device_type(): if torch.cuda.is_available(): device = "cuda" diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 6251b308be2..626fccb0fd4 100644 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -67,11 +67,12 @@ convert_to_smoothquant_model, replace_linear, ) -from ...tools.utils import is_intel_gpu_available, is_ipex_available +from ...tools.utils import is_intel_gpu_available, is_ipex_available, _neural_compressor_version from accelerate import init_empty_weights from huggingface_hub import hf_hub_download from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.model.torch_model import PyTorchFXModel +from packaging import version from threading import Thread from transformers.configuration_utils import PretrainedConfig from transformers import AutoConfig @@ -799,6 +800,9 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") + assert ( + version.parse(_neural_compressor_version) > version.parse("2.6") + ), "Please use neural_compressor version > 2.6." if use_neural_speed: if not isinstance(quantization_config, RtnConfig): logger.error("Only Supports RTN Quantization in Neural Speed.") @@ -907,6 +911,9 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: assert ( ipex.__version__ >= "2.2.0+cpu" ), "Please use Intel Extension for PyTorch >=2.2.0+cpu." + assert ( + version.parse(_neural_compressor_version) > version.parse("2.6") + ), "Please use neural_compressor version > 2.6." config.torchscript = True config.use_cache = True model = cls.ORIG_MODEL.from_pretrained( diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 527d8b097ff..8738e3df791 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -40,16 +40,6 @@ import intel_extension_for_pytorch as ipex torch = LazyImport("torch") -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - def distributed_init( backend="gloo", world_size=1, From db79354977544b9aa7b8ad0f0a6295aaadcb06ae Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 11 Jul 2024 21:35:12 -0700 Subject: [PATCH 4/7] fix eval args Signed-off-by: changwangss --- .../text-generation/quantization/run_generation_gpu_woq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py index f41f05df7c2..caee47c9461 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py @@ -30,7 +30,7 @@ parser.add_argument("--output_dir", nargs="?", default="./saved_results") # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") -parser.add_argument("--eval_batch_size", default=1, type=int, +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="batch size num.") parser.add_argument("--do_profiling", action="store_true") parser.add_argument("--profile_token_latency", action="store_true") 
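The neural_compressor gate introduced in tools/utils.py and modeling_auto.py above is a standard availability-plus-version check. A self-contained sketch of the same pattern (imports are spelled out here; in the repo they live in the surrounding modules, and the 2.6 threshold is the one asserted in the diff):

```python
# Detect neural_compressor and its version, then enforce the minimum the
# weight-only and SmoothQuant paths now require (> 2.6).
import importlib.util
import importlib.metadata as importlib_metadata
from packaging import version

_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None
_neural_compressor_version = "N/A"
if _neural_compressor_available:
    try:
        _neural_compressor_version = importlib_metadata.version("neural_compressor")
    except importlib_metadata.PackageNotFoundError:
        _neural_compressor_available = False

if _neural_compressor_available:
    assert version.parse(_neural_compressor_version) > version.parse("2.6"), \
        "Please use neural_compressor version > 2.6."
```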
From 7c5fa2138efb9d8b29b205a9688344919262de0c Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 11 Jul 2024 22:40:15 -0700 Subject: [PATCH 5/7] fix import Signed-off-by: changwangss --- intel_extension_for_transformers/transformers/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index 4eaba5a00fe..417c994107f 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -30,4 +30,4 @@ GPTQConfig, AutoRoundConfig ) -from .utility import LazyImport, logger, str2bool, CpuInfo +from .utility import LazyImport, logger, CpuInfo From a990914f5c88df957d7eead8f43d0d9d8808b0dd Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 12 Jul 2024 00:59:36 -0700 Subject: [PATCH 6/7] fit inc teq Signed-off-by: changwangss --- .../code-generation/quantization/README.md | 78 +++++++------ .../quantization/run_generation_cpu_woq.py | 105 +++++++----------- .../quantization/run_generation_sq.py | 71 ++++-------- tests/CI/test_quantization.py | 29 +++-- 4 files changed, 116 insertions(+), 167 deletions(-) diff --git a/examples/huggingface/pytorch/code-generation/quantization/README.md b/examples/huggingface/pytorch/code-generation/quantization/README.md index 55dbb3dbb4a..b3f9f336df1 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/README.md +++ b/examples/huggingface/pytorch/code-generation/quantization/README.md @@ -19,10 +19,10 @@ pip install -r requirements.txt # Run We provide compression technologies such as `MixedPrecision`, `SmoothQuant` and `WeightOnlyQuant` with `Rtn/Awq/Teq/GPTQ/AutoRound` algorithms and `BitsandBytes`, `load_in_4bit` and `load_in_8bit` work on CPU device, the followings are command to show how to use it. ->**Note**: -> Model type "llama" will default use [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/339bd251841e153ad9c34e1033ab8b2d936a1781/docs/tutorials/llm/llm_optimize_transformers.md) to accelerate the inference, but "llama" requests transformers version lower than 4.36.0, "falcon" requests transformers version lower than 4.33.3. -## 1. Performance +## MixedPrecison and SmoothQuant + +### 1. Performance ```bash export KMP_BLOCKTIME=1 export KMP_SETTINGS=1 @@ -33,13 +33,15 @@ export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model bigcode/starcoder \ --benchmark \ - --batch_size 1 + --benchmark_batch_size 1 + # mixedprecision OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model bigcode/starcoder \ --mixed_precision \ --benchmark \ --batch_size 1 + # smoothquant # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision. 
python run_generation_sq.py \ @@ -49,29 +51,10 @@ python run_generation_sq.py \ --alpha 0.7 \ --calib_n_samples 500 \ --dataset "mbpp" - --int8 \ - --benchmark \ - --batch_size 1 -# weightonlyquant -OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ - --model bigcode/starcoder \ - --woq \ - --benchmark \ - --batch_size 1 -# load_in_4bit -OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ - --model bigcode/starcoder \ - --load_in_4bit \ - --benchmark \ - --batch_size 1 -# load_in_8bit -OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ - --model bigcode/starcoder \ - --load_in_8bit \ --benchmark \ --batch_size 1 ``` -## 2. Accuracy +### 2. Accuracy ```bash # fp32 @@ -101,7 +84,6 @@ python run_generation_sq.py \ --model bigcode/starcoder \ --sq \ --alpha 1.0 \ - --int8 \ --accuracy \ --batch_size 20 \ --n_samples 20 \ @@ -109,11 +91,42 @@ python run_generation_sq.py \ --temperature 0.2 \ --do_sample \ --tasks "humaneval" +``` + +## WeightOnlyQuant + +1. ### Performance + +```bash +# weightonlyquant +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ + --model bigcode/starcoder \ + --woq \ + --benchmark \ + --benchmark_batch_size 1 +# load_in_4bit +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ + --model bigcode/starcoder \ + --load_in_4bit \ + --benchmark \ + --benchmark_batch_size 1 +# load_in_8bit +OMP_NUM_THREADS= numactl -m -C python run_generation_cpu_woq.py \ + --model bigcode/starcoder \ + --load_in_8bit \ + --benchmark \ + --benchmark_batch_size 1 +``` + +2. ### Accuracy + +```bash + # weightonlyquant python run_generation_cpu_woq.py \ --model bigcode/starcoder \ --woq \ - --woq_weight_dtype "nf4" \ + --weight_dtype "nf4" \ --accuracy \ --batch_size 20 \ --n_samples 20 \ @@ -166,17 +179,14 @@ This creates an image called `evaluation-harness-multiple`, and runs a test on i Suppose the fp32 model is `starcoder-3b`, saved quantized model in `saved_results` and do evaluation on `multiple-lua` tasks with: ``` docker run -v $(CURDIR):$(CURDIR) -it /bin/bash -python3 run_generation.py \ +python3 run_generation_sq.py \ --model $(CURDIR)/starcoder-3b \ - --quantize \ --sq \ --alpha 0.7 \ - --ipex \ - --calib_iters 500 \ + --calib_n_samples 500 \ --calib_batch_size 1 \ --dataset "mbpp" \ --output_dir "$(CURDIR)/saved_results" \ - --int8 \ --accuracy \ --tasks multiple-py \ --batch_size 20 \ @@ -191,9 +201,9 @@ python3 run_generation.py \ To run the container (here from image `evaluation-harness-multiple`) to quantize and evaluate on `CURDIR`, or another file mount it with -v, specify n_samples and allow code execution with --allow_code_execution (and add the number of problems --limit if it was used during generation): ```bash docker run -v $(CURDIR):$(CURDIR) \ - -it $(IMAGE_NAME) python3 run_generation.py --model $(CURDIR)/starcoder-3b --quantize --sq --alpha 0.7 --ipex \ - --calib_iters 5 --calib_batch_size 1 --dataset "mbpp" --calib_split "test" --output_dir "$(CURDIR)/saved_results" \ - --int8 --accuracy --tasks multiple-py --batch_size 20 --n_samples 20 --allow_code_execution \ + -it $(IMAGE_NAME) python3 run_generation_sq.py --model $(CURDIR)/starcoder-3b --sq --alpha 0.7 + --calib_n_samples 5 --calib_batch_size 1 --dataset "mbpp" --output_dir "$(CURDIR)/saved_results" \ + --accuracy --tasks multiple-py --batch_size 20 --n_samples 20 --allow_code_execution \ --do_sample --temperature 0.2 --limit 2 ``` diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py 
b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py index 7d2c8d23139..a5c282acd33 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py @@ -15,16 +15,15 @@ from intel_extension_for_transformers.transformers import ( AutoModelForCausalLM, ) - +from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import( + generate_dummy_past_key_values, +) parser = argparse.ArgumentParser() # ============Main configs============ parser.add_argument( "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase" ) -parser.add_argument("--trust_remote_code", action="store_true") -parser.add_argument("--_commit_hash", default=None, type=str) -parser.add_argument("--use_neural_speed", action="store_true") parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp") parser.add_argument("--dtype", type=str, default="int8") parser.add_argument( @@ -33,13 +32,12 @@ parser.add_argument("--output_dir", nargs="?", default="./saved_results") # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="num benchmark batchsize") parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") parser.add_argument( "--prompt_size", default=32, type=int, help="generate dummy input_ids size" ) -# ============BitsAndBytes configs============== -parser.add_argument("--bitsandbytes", action="store_true") # ============WeightOnlyQuant configs=============== parser.add_argument("--woq", action="store_true") parser.add_argument( @@ -91,7 +89,7 @@ help="Use layer wise to do quantization", ) parser.add_argument( - "--n_samples", type=int, default=512, help="Number of calibration data samples." + "--calib_n_samples", type=int, default=512, help="Number of calibration data samples." 
) parser.add_argument( "--seq_len", @@ -100,7 +98,7 @@ help="Calibration dataset sequence max length, this should align with your model config", ) parser.add_argument( - "--batch_size", + "--calib_batch_size", type=int, default=8, help="Calibration batchsize.", @@ -157,6 +155,14 @@ action="store_true", help="whether to quant the lm head layer", ) +# ============BitsAndBytes configs============== +parser.add_argument("--bitsandbytes", action="store_true") +# ============AutoModel parameters============== +parser.add_argument("--load_in_4bit", action="store_true") +parser.add_argument("--load_in_8bit", action="store_true") +parser.add_argument("--_commit_hash", default=None, type=str) +parser.add_argument("--trust_remote_code", action="store_true") +parser.add_argument("--use_neural_speed", action="store_true") # ============Harness configs============ parser.add_argument("--tasks", default=None, help="Evaluation tasks") parser.add_argument( @@ -225,6 +231,7 @@ parser.add_argument("--n_samples", default=1, type=int) parser.add_argument("--eos", default="<|endoftext|>", type=str) parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") args = parser.parse_args() @@ -240,11 +247,7 @@ args.model, torchscript=( True - if ( - args.sq - or args.woq_algo in ["Awq", "Teq"] - or (args.int8 or args.int8_bf16_mixed or args.benchmark) - ) + if args.woq_algo in ["Awq", "Teq"] else False ), # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. @@ -288,8 +291,8 @@ zero_point=False if args.scheme == "sym" else True, group_size=args.group_size, seq_len=args.seq_len, - n_samples=args.n_samples, - batch_size=args.batch_size, + n_samples=args.calib_n_samples, + batch_size=args.calib_batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, @@ -303,8 +306,8 @@ sym=True if args.scheme == "sym" else False, group_size=args.group_size, seq_len=args.seq_len, - batch_size=args.batch_size, - n_samples=args.n_samples, + batch_size=args.calib_batch_size, + n_samples=args.calib_n_samples, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, @@ -321,9 +324,9 @@ blocksize=args.blocksize, static_groups=args.static_groups, group_size=args.group_size, - n_samples=args.n_samples, + batch_size=args.calib_batch_size, + n_samples=args.calib_n_samples, seq_len=args.seq_len, - batch_size=args.batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, @@ -337,7 +340,8 @@ dataset=args.dataset, bits=args.bits, sym=True if args.scheme == "sym" else False, - n_samples=args.n_samples, + batch_size=args.calib_batch_size, + n_samples=args.calib_n_samples, group_size=args.group_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, @@ -384,25 +388,22 @@ # save model if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): + import pdb;pdb.set_trace(); user_model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # to validate woq model accuracy args.model = args.output_dir if args.benchmark: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - user_model.config.model_type - )(user_model.config) - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = 
hidden_size // num_attention_heads - num_beams = 1 - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - + print("Loading model from: ", args.model) + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + quantization_config=quantization_config, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + use_neural_speed=args.use_neural_speed, + ) + model_config = user_model.config num_iter = args.benchmark_iters num_warmup = args.num_warmup @@ -416,46 +417,16 @@ input_ids = torch.randint( 1, tokenizer.vocab_size, - size=(args.batch_size, args.prompt_size), + size=(args.benchmark_batch_size, args.prompt_size), ) input_bs, input_len = input_ids.shape attention_mask = torch.ones(input_bs, input_len) position_ids = ( torch.arange(input_len).unsqueeze(0).expand(input_bs, -1) ) - if user_model.config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - else: - if not (args.int8 or args.int8_bf16_mixed): - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - past_key_values = tuple(past_key_values) - - else: - new_shape = [input_bs, num_key_value_heads, 1, d_k] - beam_idx_tmp = torch.zeros( - (2048, int(input_bs * num_beams)), dtype=torch.long - ).contiguous() - past_key_values = [ - ( - torch.zeros( - 1, 0, 0, 1, dtype=torch.long - ).contiguous(), - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - beam_idx_tmp, - ) - for _ in range(num_layers) - ] - past_key_values = tuple(past_key_values) + past_key_values = generate_dummy_past_key_values( + config=model_config, input_bs=input_bs + ) inp = { "input_ids": input_ids, diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py index b7802c01996..156d3dcfd8a 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_sq.py @@ -12,6 +12,11 @@ AutoModelForCausalLM, ) +from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import( + generate_dummy_past_key_values, + generate_dummy_past_key_values_for_opt_llm, + IPEX_OPT_LLM_SUPPORTED, +) parser = argparse.ArgumentParser() # ============Main configs============ @@ -29,6 +34,7 @@ # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="batch size num.") parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") parser.add_argument( "--prompt_size", default=32, type=int, help="generate dummy input_ids size" @@ -44,7 +50,7 @@ parser.add_argument( "--seq_len", default=512, type=int, help="Smooth quant calibration input length." 
) -parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") +parser.add_argument("--calib_batch_size", default=1, type=int, help="batch size num.") parser.add_argument("--padding", action="store_true") parser.add_argument("--shuffle", action="store_true") # sq alpha "auto" parameters @@ -136,6 +142,7 @@ parser.add_argument("--n_samples", default=1, type=int) parser.add_argument("--eos", default="<|endoftext|>", type=str) parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") args = parser.parse_args() @@ -153,8 +160,7 @@ True if ( args.sq - or args.woq_algo in ["Awq", "Teq"] - or (args.int8 or args.int8_bf16_mixed or args.benchmark) + or args.benchmark ) else False ), # torchscript will force `return_dict=False` to avoid jit errors @@ -184,7 +190,7 @@ tokenizer=tokenizer, seq_len=args.seq_len, n_samples=args.calib_n_samples, - batch_size=args.batch_size, + batch_size=args.calib_batch_size, excluded_precisions=excluded_precisions, alpha=args.alpha if args.alpha == "auto" else float(args.alpha), scale_sharing=args.scale_sharing, @@ -243,19 +249,7 @@ ) if args.benchmark: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - user_model.config.model_type - )(user_model.config) - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_beams = 1 - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - + model_config = user_model.config num_iter = args.benchmark_iters num_warmup = args.num_warmup @@ -269,46 +263,21 @@ input_ids = torch.randint( 1, tokenizer.vocab_size, - size=(args.batch_size, args.prompt_size), + size=(args.benchmark_batch_size, args.prompt_size), ) input_bs, input_len = input_ids.shape attention_mask = torch.ones(input_bs, input_len) position_ids = ( torch.arange(input_len).unsqueeze(0).expand(input_bs, -1) ) - if user_model.config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) + if model_config.model_type in IPEX_OPT_LLM_SUPPORTED: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model_config, input_bs=input_bs, num_beams=1 + ) else: - if not (args.int8 or args.int8_bf16_mixed): - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - past_key_values = tuple(past_key_values) - - else: - new_shape = [input_bs, num_key_value_heads, 1, d_k] - beam_idx_tmp = torch.zeros( - (2048, int(input_bs * num_beams)), dtype=torch.long - ).contiguous() - past_key_values = [ - ( - torch.zeros( - 1, 0, 0, 1, dtype=torch.long - ).contiguous(), - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - beam_idx_tmp, - ) - for _ in range(num_layers) - ] - past_key_values = tuple(past_key_values) + past_key_values = generate_dummy_past_key_values( + config=model_config, input_bs=input_bs + ) inp = { "input_ids": input_ids, @@ -351,7 +320,7 @@ model=user_model, tokenizer=tokenizer, tasks=args.tasks, - batch_size=args.eval_batch_size, + 
batch_size=args.batch_size, args=args, ) print(results) diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index 264d924efad..9e27d3a0d93 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -342,21 +342,20 @@ def test_quantization_for_llm(self): output = woq_model(dummy_input) self.assertTrue(isclose(float(output[0][0][0][0]), 0.20071472227573395 , rel_tol=1e-04)) - # # TEQ - # need INC fix. - # woq_config = TeqConfig(bits=4, - # n_samples=5, - # batch_size=1, - # seq_len=512, - # tokenizer=tokenizer - # ) - # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=woq_config, - # use_neural_speed=False - # ) - # woq_model.eval() - # output = woq_model(dummy_input) - + # TEQ + woq_config = TeqConfig(bits=4, + n_samples=5, + batch_size=1, + seq_len=512, + tokenizer=tokenizer + ) + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=woq_config, + use_neural_speed=False + ) + woq_model.eval() + output = woq_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047 , rel_tol=1e-04)) # fp8 woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") From 7a956978ec701a5ff26885e9460b099178e071f8 Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 12 Jul 2024 01:03:42 -0700 Subject: [PATCH 7/7] remove pdb Signed-off-by: changwangss --- .../code-generation/quantization/run_generation_cpu_woq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py index a5c282acd33..6934d8b55b3 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation_cpu_woq.py @@ -388,7 +388,6 @@ # save model if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): - import pdb;pdb.set_trace(); user_model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # to validate woq model accuracy
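
For readers without the module at hand: patches 6/7 replace the hand-rolled dummy KV-cache construction in both benchmark paths with `generate_dummy_past_key_values` imported from `intel_extension_for_transformers.transformers.llm.quantization.sq_utils`. Below is a minimal sketch of what such a helper can look like, reconstructed from the inline logic these patches delete; the `optimum` import path and the head-count fallback are assumptions, and the actual `sq_utils` implementation may differ.

```python
# Illustrative sketch only -- reconstructed from the inline benchmark code removed by
# this patch series; the real helper lives in
# intel_extension_for_transformers.transformers.llm.quantization.sq_utils.
import torch
from optimum.utils import NormalizedConfigManager  # assumed import path


def generate_dummy_past_key_values(config, input_bs):
    """Return an empty (zero-length) KV cache matching the model's cache layout."""
    normalized = NormalizedConfigManager.get_normalized_config_class(
        config.model_type
    )(config)
    num_layers = normalized.num_layers
    num_attention_heads = normalized.num_attention_heads
    d_k = normalized.hidden_size // num_attention_heads

    # Grouped-query / multi-query models expose a smaller KV head count.
    num_key_value_heads = num_attention_heads
    if hasattr(normalized, "num_key_value_heads"):
        num_key_value_heads = normalized.num_key_value_heads
    if hasattr(normalized, "multi_query_group_num"):
        num_key_value_heads = normalized.multi_query_group_num

    if config.model_type == "gpt_bigcode":
        # gpt_bigcode keeps K and V fused in a single tensor per layer.
        dummy = torch.zeros(size=[input_bs, 0, d_k * 2])
        return tuple(dummy for _ in range(num_layers))

    new_shape = [input_bs, num_key_value_heads, 0, d_k]
    return tuple(
        (
            torch.zeros(size=new_shape).contiguous(),
            torch.zeros(size=new_shape).contiguous(),
        )
        for _ in range(num_layers)
    )
```

Keeping this in one shared helper is what lets both `run_generation_cpu_woq.py` and `run_generation_sq.py` drop the duplicated `NormalizedConfigManager` bookkeeping shown in the removed hunks.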
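
`run_generation_sq.py` additionally distinguishes models covered by `ipex.optimize_transformers` (`IPEX_OPT_LLM_SUPPORTED`), whose fused attention expects a four-element cache entry plus a beam-index tensor. A hedged sketch of that variant follows, again reconstructed from the deleted inline code; the 2048 beam-index length and the tensor shapes come straight from the removed hunk, while the function body as a whole is illustrative rather than the actual `sq_utils` source.

```python
# Illustrative sketch of the IPEX-optimized variant, based on the inline code the
# patch removes from run_generation_sq.py; the real implementation in sq_utils may differ.
import torch
from optimum.utils import NormalizedConfigManager  # assumed import path


def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
    """Dummy cache for models run through ipex.optimize_transformers: each layer gets
    a (placeholder, key, value, beam_idx) tuple instead of a plain (key, value) pair."""
    normalized = NormalizedConfigManager.get_normalized_config_class(
        config.model_type
    )(config)
    num_layers = normalized.num_layers
    num_attention_heads = normalized.num_attention_heads
    d_k = normalized.hidden_size // num_attention_heads
    num_key_value_heads = getattr(
        normalized, "num_key_value_heads", num_attention_heads
    )

    new_shape = [input_bs, num_key_value_heads, 1, d_k]
    beam_idx_tmp = torch.zeros(
        (2048, int(input_bs * num_beams)), dtype=torch.long
    ).contiguous()
    return tuple(
        (
            torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
            torch.zeros(size=new_shape).contiguous(),
            torch.zeros(size=new_shape).contiguous(),
            beam_idx_tmp,
        )
        for _ in range(num_layers)
    )


# Dispatch mirrors the patched benchmark path in run_generation_sq.py:
# if model_config.model_type in IPEX_OPT_LLM_SUPPORTED:
#     past_key_values = generate_dummy_past_key_values_for_opt_llm(
#         config=model_config, input_bs=input_bs, num_beams=1
#     )
# else:
#     past_key_values = generate_dummy_past_key_values(
#         config=model_config, input_bs=input_bs
#     )
```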