Update TPU profiling fix for distributed (tensor-parallel) runs

vllm-project · robertgshaw2-neuralmagic · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 10, 2024
commit 02ea274b5159f60486c3d150ad33e0197c2785c4
diff --git a/profile/README.md b/profile/README.md
@@ -8,7 +8,7 @@ This guide explains how to profile the TPU performance on VLLM for specific shap
 
 ```bash
 export XLA_HLO_DEBUG=1
-export MODEL=meta-llama/Llama-3.1-8B-Instruct
+export MODEL=meta-llama/Llama-3.1-70B-Instruct
 export VLLM_TPU_PROFILE_DURATION_MS=2000
 export VLLM_TPU_PROFILE_DELAY_MS=1000
 
@@ -20,7 +20,7 @@ python3 profile_tpu.py \
     --batch-size 32 \
     --enforce-eager \
     --profile-result-dir profiles \
-    --max-model-len 2048
+    --max-model-len 2048 --tensor-parallel-size 8
 ```
 
 ### Generate Prefill Trace
@@ -39,5 +39,5 @@ python3 profile_tpu.py \
     --batch-size 1 \
     --enforce-eager \
     --profile-result-dir profiles \
-    --max-model-len 2048
+    --max-model-len 2048 --tensor-parallel-size 8
 ```
diff --git a/profile/profile_tpu.py b/profile/profile_tpu.py
@@ -17,11 +17,11 @@
 DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))
 
 def main(args: argparse.Namespace):
-    server = xp.start_server(9012)
     print(args)
 
     engine_args = EngineArgs.from_cli_args(args)
     llm = LLM(**dataclasses.asdict(engine_args))
+    server = xp.start_server(9012)
 
     sampling_params = SamplingParams(
         temperature=0.0,
@@ -44,13 +44,10 @@
                               duration_ms=DURATION_MS)
             if DELAY_MS == 0:
                 time.sleep(1.0)
-            # NOTE: for prefill, you could run this in a loop
-            # so that you get a trace of multiple prefill steps
-            # NOTE: for decode, you will get traces of multiple
-            # steps because we generae for 128 tokens.
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            for _ in range(5):
+                llm.generate(dummy_prompts,
+                            sampling_params=sampling_params,
+                            use_tqdm=False)
         else:
             start_time = time.perf_counter()
             llm.generate(dummy_prompts,