
Commit 307d34e: add inc version limit

Signed-off-by: changwangss <[email protected]>
changwangss committed Jul 12, 2024
1 parent 83f086b commit 307d34e
Showing 10 changed files with 409 additions and 161 deletions.
26 changes: 13 additions & 13 deletions examples/huggingface/pytorch/code-generation/quantization/README.md
@@ -30,42 +30,42 @@ export KMP_AFFINITY=granularity=fine,compact,1,0
export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
# fp32
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_sq.py \
--model bigcode/starcoder \
--benchmark \
--batch_size 1
# mixedprecision
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_sq.py \
--model bigcode/starcoder \
--mixed_precision \
--benchmark \
--batch_size 1
# smoothquant
# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
python run_generation.py \
python run_generation_sq.py \
--model bigcode/starcoder \
--output_dir "./saved_results" \
--sq \
--alpha 0.7 \
--calib_iters 500 \
--calib_n_samples 500 \
--dataset "mbpp"
--int8 \
--benchmark \
--batch_size 1
# weightonlyquant
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--woq \
--benchmark \
--batch_size 1
# load_in_4bit
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_4bit \
--benchmark \
--batch_size 1
# load_in_8bit
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_8bit \
--benchmark \
@@ -75,7 +75,7 @@ OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python ru

```bash
# fp32
python run_generation.py \
python run_generation_sq.py \
--model bigcode/starcoder \
--accuracy \
--batch_size 20 \
@@ -85,7 +85,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# mixedprecision
python run_generation.py \
python run_generation_sq.py \
--model bigcode/starcoder \
--mixed_precision \
--accuracy \
@@ -97,7 +97,7 @@ python run_generation.py \
--tasks "humaneval"
# smoothquant
# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
python run_generation.py \
python run_generation_sq.py \
--model bigcode/starcoder \
--sq \
--alpha 1.0 \
@@ -110,7 +110,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# weightonlyquant
python run_generation.py \
python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--woq \
--woq_weight_dtype "nf4" \
@@ -122,7 +122,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# load_in_4bit
python run_generation.py \
python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_4bit \
--accuracy \
@@ -133,7 +133,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# load_in_8bit
python run_generation.py \
python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_8bit \
--accuracy \
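Net effect of the README edits: the single run_generation.py entry point is split into run_generation_sq.py (fp32, mixed-precision and SmoothQuant runs) and run_generation_cpu_woq.py (weight-only, 4-bit and 8-bit runs), and --calib_iters is renamed to --calib_n_samples. For orientation, below is a minimal sketch, not part of this commit, of the weight-only-quantization API those flags drive, pieced together from the imports and calls visible in the Python diff further down; the exact RtnConfig arguments depend on the installed intel-extension-for-transformers version.

```python
# Hedged sketch of the RTN weight-only-quantization path wrapped by run_generation_cpu_woq.py.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig

model_name = "bigcode/starcoder"   # model used throughout the README examples
woq_config = RtnConfig(bits=4)     # RTN 4-bit; the other WOQ configs in the imports are passed the same way

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=woq_config,
    trust_remote_code=True,
)

# Generation then proceeds as with any Hugging Face causal LM.
inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```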
@@ -14,7 +14,7 @@ function init_params {
batch_size=1
tuned_checkpoint=saved_results
lm_eval_tasks="humaneval"
script="run_generation.py"
script="run_generation_sq.py"
for var in "$@"
do
case $var in
@@ -85,7 +85,7 @@ function run_benchmark {


if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
model_name_or_path=$tuned_checkpoint
fi

echo $extra_cmd
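The benchmark script's default target becomes run_generation_sq.py, and its int8 branch evaluates the tuned checkpoint under saved_results instead of the original fp32 model. As a hedged illustration of that path, the sketch below restores an IPEX/SmoothQuant-quantized model from its saved JSON configuration; it is patterned on the recover_model_from_json call that this commit removes from the Python script shown next (presumably because the SmoothQuant flow now lives in run_generation_sq.py), and the checkpoint layout is an assumption, not something taken from this diff.

```python
# Hedged sketch: reload an int8 SmoothQuant model from a tuned checkpoint for benchmarking.
import os

from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import (
    recover_model_from_json,
)

model_name = "bigcode/starcoder"     # original model id
tuned_checkpoint = "saved_results"   # mirrors tuned_checkpoint= in the shell script

user_model = recover_model_from_json(
    model_name,
    os.path.join(tuned_checkpoint, "qconfig.json"),
    True,  # trust_remote_code, passed positionally as in the original call
)
```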
@@ -1,19 +1,10 @@
import argparse
import re
import time
import json
import os
import pathlib
import torch
import types
import numpy as np
from itertools import chain
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig
from optimum.utils import NormalizedConfigManager
from intel_extension_for_transformers.transformers import (
MixedPrecisionConfig,
SmoothQuantConfig,
BitsAndBytesConfig,
RtnConfig,
AwqConfig,
@@ -24,7 +15,6 @@
from intel_extension_for_transformers.transformers import (
AutoModelForCausalLM,
)
from intel_extension_for_transformers.transformers.utils import str2bool

parser = argparse.ArgumentParser()

@@ -41,67 +31,16 @@
"--max_new_tokens", default=32, type=int, help="output max new tokens"
)
parser.add_argument("--output_dir", nargs="?", default="./saved_results")
parser.add_argument("--n_samples", default=500, type=int, help="calibration iters.")
parser.add_argument(
"--restore_sq_model_from_json",
action="store_true",
help="restore ipex quantized model from output_dir/best_configure.json",
)
# ============Benchmark configs==============
parser.add_argument("--benchmark", action="store_true")
parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter")
parser.add_argument("--num_warmup", default=10, type=int, help="num warmup")
parser.add_argument(
"--prompt_size", default=32, type=int, help="generate dummy input_ids size"
)
# ============Accuracy configs==============
parser.add_argument("--accuracy", action="store_true")
parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num.")
parser.add_argument(
"--save_accuracy_path", default=None, help="Save accuracy results path."
)
# ============MixedPrecision configs==============
parser.add_argument("--mixed_precision", action="store_true")
# ============SmoothQuant configs==============
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.")
parser.add_argument(
"--n_samples", default=100, type=int, help="Smooth quant calibration samples."
)
parser.add_argument(
"--seq_len", default=512, type=int, help="Smooth quant calibration input length."
)
parser.add_argument("--batch_size", default=1, type=int, help="batch size num.")
parser.add_argument("--padding", action="store_true")
parser.add_argument("--shuffle", action="store_true")
# sq alpha "auto" parameters
parser.add_argument("--scale_sharing", action="store_true")
parser.add_argument(
"--init_alpha", default=0.5, type=float, help="Smooth quant parameter."
)
parser.add_argument(
"--alpha_min", default=0.0, type=float, help="Smooth quant parameter."
)
parser.add_argument(
"--alpha_max", default=1.0, type=float, help="Smooth quant parameter."
)
parser.add_argument(
"--alpha_step", default=0.1, type=float, help="Smooth quant parameter."
)
parser.add_argument("--shared_criterion", default="max", type=str)
parser.add_argument("--do_blockwise", action="store_true")
# ============BitsAndBytes configs==============
parser.add_argument("--bitsandbytes", action="store_true")
# ============WeightOnlyQuant configs===============
parser.add_argument("--accuracy", action="store_true")
parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num for evaluation.")
parser.add_argument(
"--tasks",
default="lambada_openai",
type=str,
help="tasks list for accuracy validation",
)
# ============WeightOnlyQuant configs===============
parser.add_argument("--woq", action="store_true")
parser.add_argument(
"--woq_algo",
@@ -277,6 +216,7 @@
help="Path of additional data to load for the tasks",
)
# ============Evaluation configs==============
parser.add_argument("--accuracy", action="store_true")
parser.add_argument("--prefix", default="")
parser.add_argument("--do_sample", action="store_true")
parser.add_argument("--temperature", default=0.2, type=float)
@@ -321,33 +261,14 @@
tokenizer.pad_token = tokenizer.eos_token

# Generation
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
if args.use_neural_speed:
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
else:
generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)

# mp/sq/woq/bitsandbytes config setting
# woq/bitsandbytes config setting
quantization_config = None
if args.mixed_precision:
quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16
elif args.sq:
excluded_precisions = ["bf16"]
quantization_config = SmoothQuantConfig(
tokenizer=tokenizer,
seq_len=args.seq_len,
n_samples=args.n_samples,
batch_size=args.batch_size,
excluded_precisions=excluded_precisions,
alpha=args.alpha if args.alpha == "auto" else float(args.alpha),
scale_sharing=args.scale_sharing,
init_alpha=args.init_alpha,
alpha_min=args.alpha_min,
alpha_max=args.alpha_max,
alpha_step=args.alpha_step,
shared_criterion=args.shared_criterion,
do_blockwise=args.do_blockwise,
shuffle=args.shuffle,
padding=args.padding,
num_beams=generate_kwargs["num_beams"],
)
elif args.woq:
if args.woq:
if args.woq_algo == "Rtn":
quantization_config = RtnConfig(
bits=args.bits,
@@ -433,11 +354,12 @@
assert False, "Please set the correct '--woq_algo'"
# bitsandbytes
elif args.bitsandbytes:
# GPU device is needed for `load_in_4bit` and `load_in_8bit`.
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
)
else:
print("The quantization_config is None.")

# get optimized model
if quantization_config is not None:
@@ -446,7 +368,7 @@
quantization_config=quantization_config,
trust_remote_code=args.trust_remote_code,
_commit_hash=args._commit_hash,
use_neural_speed=False,
use_neural_speed=args.use_neural_speed,
)
elif args.load_in_4bit or args.load_in_8bit:
# CPU device usage is provided by intel-extension-for-transformers.
@@ -455,49 +377,17 @@
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
_commit_hash=args._commit_hash,
use_neural_speed=False,
)
elif not args.int8 and not args.int8_bf16_mixed:
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
config=config,
trust_remote_code=args.trust_remote_code,
_commit_hash=args._commit_hash,
use_neural_speed=False,
use_neural_speed=args.use_neural_speed,
)
else:
print("Didn't do Weight Only Quantization.")

# save model
if args.output_dir is not None:
if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed):
user_model.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
if args.sq:
quantization_config.remove_redundant_parameters()
config.quantization_config = quantization_config
config.save_pretrained(args.output_dir)
user_model.save(args.output_dir)
user_model = AutoModelForCausalLM.from_pretrained(
args.output_dir,
trust_remote_code=args.trust_remote_code,
_commit_hash=args._commit_hash,
)
elif args.mixed_precision:
user_model.save_pretrained(args.output_dir)

if args.restore_sq_model_from_json:
from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import (
recover_model_from_json,
)
user_model = recover_model_from_json(
args.model,
os.path.join(args.output_dir, "qconfig.json"),
args.trust_remote_code,
)

elif not (args.sq or args.mixed_precision or args.woq):
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
trust_remote_code=args.trust_remote_code,
_commit_hash=args._commit_hash,
)
# to validate woq model accuracy
args.model = args.output_dir

if args.benchmark:
normalized_config = NormalizedConfigManager.get_normalized_config_class(
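In this Python script, the visible changes are: the SmoothQuant and mixed-precision branches (SmoothQuantConfig, the --sq/--alpha tuning options, restore_sq_model_from_json) are removed together with their imports, use_neural_speed is passed through from the command line instead of being hard-coded to False, and the model is saved, then reloaded for accuracy validation, only for weight-only, 4-bit, or 8-bit runs that do not use Neural Speed. A minimal sketch of that surviving 4-bit flow, again assembled from calls in the diff rather than copied from the repository:

```python
# Hedged sketch of the CPU 4-bit load-and-save flow kept by this script after the refactor.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "bigcode/starcoder"
output_dir = "./saved_results"
use_neural_speed = False  # mirrors the new --use_neural_speed pass-through

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# CPU usage of load_in_4bit/load_in_8bit is provided by intel-extension-for-transformers.
user_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    use_neural_speed=use_neural_speed,
    trust_remote_code=True,
)

# As in the updated script, the quantized model is saved only for WOQ/4-bit/8-bit runs
# without Neural Speed; accuracy evaluation then reloads it from output_dir.
if not use_neural_speed:
    user_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    model_path_for_eval = output_dir  # the script repoints args.model at the saved copy
```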
(The diffs for the remaining seven changed files are not shown.)
