From 87bd3c7325ba87cc71d7eeaf6765e4f34de44bac Mon Sep 17 00:00:00 2001
From: Avinash Sharma
Date: Mon, 16 Dec 2024 17:03:40 -0800
Subject: [PATCH] Update llama_benchmarking.md

---
 llama_benchmarking.md | 49 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/llama_benchmarking.md b/llama_benchmarking.md
index 9531c16..1c44da0 100644
--- a/llama_benchmarking.md
+++ b/llama_benchmarking.md
@@ -26,7 +26,9 @@ Create a SAS token in Azure:

 - Replace [Add your SAS token here] (including the [ and ]) with the SAS token string in the instructions below
 ```
-azcopy copy 'https://sharkblobs.blob.core.windows.net/halo-models/llm-dev/llama3_8b/8b_f16.irpa?[Add SAS token here]' '8b_f16.irpa'
+azcopy copy \
+'https://sharkblobs.blob.core.windows.net/halo-models/llm-dev/llama3_8b/8b_f16.irpa?[Add SAS token here]' \
+'8b_f16.irpa'
 ```

 If you have trouble accessing `sharkblobs`, you can copy the 8b f16 unsharded irpa file from the `SharkMi300x` machine:
@@ -37,12 +39,25 @@ scp nod@10.23.233.219:/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16.irpa 8b_f1
 ## 2. Generate the IR
 a. To generate the IR for prefill only:
 ```
-python3 -m sharktank.examples.export_paged_llm_v1 --bs=4 --irpa-file=8b_f16.irpa --output-mlir=8b_f16_prefill_nondecomposed.mlir --output-config=8b_f16_prefill_nondecomposed.json --attention-kernel=torch --skip-decode --block-seq-stride=32
+python3 -m sharktank.examples.export_paged_llm_v1 \
+  --bs=4 \
+  --irpa-file=8b_f16.irpa \
+  --output-mlir=8b_f16_prefill_nondecomposed.mlir \
+  --output-config=8b_f16_prefill_nondecomposed.json \
+  --attention-kernel=torch \
+  --skip-decode \
+  --block-seq-stride=32
 ```

 b. To generate the IR for both prefill + decode (remove the `--skip-decode` flag):
 ```
-python3 -m sharktank.examples.export_paged_llm_v1 --bs=4 --irpa-file=8b_f16.irpa --output-mlir=8b_f16_prefill_nondecomposed.mlir --output-config=8b_f16_prefill_nondecomposed.json --attention-kernel=torch --block-seq-stride=32
+python3 -m sharktank.examples.export_paged_llm_v1 \
+  --bs=4 \
+  --irpa-file=8b_f16.irpa \
+  --output-mlir=8b_f16_prefill_nondecomposed.mlir \
+  --output-config=8b_f16_prefill_nondecomposed.json \
+  --attention-kernel=torch \
+  --block-seq-stride=32
 ```

 ## 3. Get the numpy inputs
@@ -55,7 +70,20 @@ Get the 8b f16 tp1 unsharded decode numpy inputs: [get_8b_f16_tp1_decode_inputs

 This command compiles the full IR (both prefill + decode) into a vmfb.
 ```
-../iree-build-no-trace/tools/iree-compile 8b_f16_prefill_nondecomposed.mlir --iree-hip-target=gfx942 -o=prefill_8b.vmfb --iree-hal-target-device=hip --iree-dispatch-creation-enable-aggressive-fusion=true --iree-global-opt-propagate-transposes=true --iree-opt-aggressively-propagate-transposes=true --iree-opt-data-tiling=false --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hip-legacy-sync=false --iree-hal-memoization=true --iree-opt-strip-assertions
+../iree-build-no-trace/tools/iree-compile 8b_f16_prefill_nondecomposed.mlir \
+  --iree-hip-target=gfx942 \
+  -o=prefill_8b.vmfb \
+  --iree-hal-target-device=hip \
+  --iree-dispatch-creation-enable-aggressive-fusion=true \
+  --iree-global-opt-propagate-transposes=true \
+  --iree-opt-aggressively-propagate-transposes=true \
+  --iree-opt-data-tiling=false \
+  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
+  --iree-hal-indirect-command-buffers=true \
+  --iree-stream-resource-memory-model=discrete \
+  --iree-hip-legacy-sync=false \
+  --iree-hal-memoization=true \
+  --iree-opt-strip-assertions
 ```

 ## 5. Benchmark command
@@ -105,7 +133,10 @@ ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 ## Sharded
 Sharded - If you want to create your own tp8 sharded irpa files, use this command:
 ```
-python3 -m sharktank.examples.sharding.shard_llm_dataset --irpa-file 8b_fp16.irpa --output-irpa 8b_fp16_tp8.irpa --tensor-parallelism-size 8
+python3 -m sharktank.examples.sharding.shard_llm_dataset \
+  --irpa-file 8b_fp16.irpa \
+  --output-irpa 8b_fp16_tp8.irpa \
+  --tensor-parallelism-size 8
 ```

 Larger sharded irpa files (e.g. 70b, 405b) will be stored in `sharkblobs` soon. Otherwise, you can copy the 70b/405b f16 sharded irpa files from the `SharkMi300x` machine (long copy time):
@@ -116,7 +147,13 @@ scp nod@10.23.233.219:/data/llama3.1/weights/405b/fp16/tp8/* .

 Sharded - You need to use the unranked sharded irpa file to generate the sharded IR:
 ```
-python3 -m sharktank.examples.export_paged_llm_v1 --bs=4 --irpa-file=/shark-dev/405b/llama3.1_405b_fp16_tp8_parameters.irpa --output-mlir=405b_f16_prefill_tp8_nondecomposed.mlir --output-config=405b_f16_prefill_tp8_nondecomposed.json --attention-kernel=torch --skip-decode
+python3 -m sharktank.examples.export_paged_llm_v1 \
+  --bs=4 \
+  --irpa-file=/shark-dev/405b/llama3.1_405b_fp16_tp8_parameters.irpa \
+  --output-mlir=405b_f16_prefill_tp8_nondecomposed.mlir \
+  --output-config=405b_f16_prefill_tp8_nondecomposed.json \
+  --attention-kernel=torch \
+  --skip-decode
 ```

 Get the 8b f16 tp8 sharded numpy inputs: [get_8b_f16_tp8_numpy_inputs.sh](https://gist.github.com/aviator19941/9b3cd6511347e57671b7ff1da7c80bfa)
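
For context, the vmfb produced in step 4 is what the `iree-benchmark-module` command in step 5 consumes. A minimal single-GPU prefill sketch is below; the entry-point name `prefill_bs4` (implied by the `--bs=4` export) and the `prefill_args/*.npy` input paths are assumptions, so substitute the function name from the exported json config and the numpy files produced by the step 3 gist scripts:

```
# Sketch of a single-GPU prefill benchmark; function and input names are
# assumed here -- take the authoritative command from section 5 of the doc.
ROCR_VISIBLE_DEVICES=0 \
iree-benchmark-module \
  --device=hip://0 \
  --module=prefill_8b.vmfb \
  --parameters=model=8b_f16.irpa \
  --function=prefill_bs4 \
  --input=@prefill_args/tokens.npy \
  --input=@prefill_args/seq_lens.npy \
  --input=@prefill_args/seq_block_ids.npy \
  --input=@prefill_args/cs_f16.npy \
  --benchmark_repetitions=3
```

Under the same assumptions, decode is benchmarked the same way against its own entry point, using the decode numpy inputs from step 3.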