docs (examples/llm): Update README
nickfraser committed Dec 4, 2024
1 parent 198a648 commit 091d982
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions src/brevitas_examples/llm/README.md
@@ -43,8 +43,9 @@ usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES]
                [--act-calibration] [--bias-corr] [--ln-affine-merge]
                [--convert-layernorm-to-rmsnorm] [--replace-rmsnorm]
                [--no-quantize] [--no-float16]
-               [--scaling-min-val SCALING_MIN_VAL] [--replace-mha]
-               [--weight-equalization] [--rotation {fx,layerwise,fused_no_fx}]
+               [--scaling-min-val SCALING_MIN_VAL] [--quant-sdpa]
+               [--replace-mha] [--weight-equalization]
+               [--rotation {fx,layerwise,fused_no_fx}]
                [--rotation-mode {had,ort}] [--rotation-orphan-sink]
                [--act-equalization {None,layerwise,fx}] [--load-awq LOAD_AWQ]
                [--export-target {None,onnx_qcdq,torch_qcdq,sharded_torchmlir_group_weight,sharded_packed_torchmlir_group_weight}]
@@ -142,6 +143,8 @@ options:
   --scaling-min-val SCALING_MIN_VAL
                        Minimum value to clamp scale to when using bf16 or
                        fp16 quantization.
+  --quant-sdpa          Quantize `F.scaled_dot_product_attention` (default:
+                        False)
   --replace-mha         Replace HuggingFace Attention with a quantizable
                        version
   --weight-equalization
@@ -176,5 +179,4 @@ options:
                        sequence. This is useful in case you would like to
                        quantize or evaluate on long sequences (default:
                        False).
-
 ```
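
For context on the flag this commit documents, a minimal sketch of an invocation enabling it is shown below. The model name and the particular flag combination are illustrative assumptions, not taken from this commit; `--quant-sdpa` is the newly documented flag, and the other flags appear in the usage text above.

```bash
# Minimal sketch (assumed model name and flag combination):
# quantize an LLM and also quantize F.scaled_dot_product_attention
# via the newly documented --quant-sdpa flag.
python main.py \
    --model meta-llama/Llama-2-7b-hf \
    --act-calibration \
    --bias-corr \
    --quant-sdpa
```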
