diff --git a/src/brevitas_examples/llm/README.md b/src/brevitas_examples/llm/README.md
index 225923e3f..fc192ae73 100644
--- a/src/brevitas_examples/llm/README.md
+++ b/src/brevitas_examples/llm/README.md
@@ -43,8 +43,9 @@ usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES]
                [--act-calibration] [--bias-corr] [--ln-affine-merge]
                [--convert-layernorm-to-rmsnorm] [--replace-rmsnorm]
                [--no-quantize] [--no-float16]
-               [--scaling-min-val SCALING_MIN_VAL] [--replace-mha]
-               [--weight-equalization] [--rotation {fx,layerwise,fused_no_fx}]
+               [--scaling-min-val SCALING_MIN_VAL] [--quant-sdpa]
+               [--replace-mha] [--weight-equalization]
+               [--rotation {fx,layerwise,fused_no_fx}]
                [--rotation-mode {had,ort}] [--rotation-orphan-sink]
                [--act-equalization {None,layerwise,fx}] [--load-awq LOAD_AWQ]
                [--export-target {None,onnx_qcdq,torch_qcdq,sharded_torchmlir_group_weight,sharded_packed_torchmlir_group_weight}]
@@ -142,6 +143,8 @@ options:
   --scaling-min-val SCALING_MIN_VAL
                        Minimum value to clamp scale to when using bf16 or
                        fp16 quantization.
+  --quant-sdpa         Quantize `F.scaled_dot_product_attention` (default:
+                       False)
   --replace-mha        Replace HuggingFace Attention with a quantizable
                        version
   --weight-equalization
@@ -176,5 +179,4 @@ options:
                        sequence. This is useful in case you would like to
                        quantize or evaluate on long sequences (default:
                        False).
-
 ```
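
For context, here is a minimal sketch of how the new flag would be exercised from the command line. The model id is an illustrative placeholder, not a value mandated by this change; `--model`, `--quant-sdpa`, and `--act-calibration` all appear in the usage text above.

```bash
# Minimal sketch: enable quantization of F.scaled_dot_product_attention
# (off by default) alongside activation calibration.
# The model id below is a placeholder chosen for illustration.
python main.py \
    --model meta-llama/Llama-2-7b-hf \
    --quant-sdpa \
    --act-calibration
```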