Commit

updated
robertgshaw2-neuralmagic committed Dec 9, 2024
1 parent 1c5a3a1 commit ad4076d
Showing 1 changed file with 20 additions and 58 deletions.
78 changes: 20 additions & 58 deletions profile/profile_tpu.py
@@ -1,9 +1,7 @@
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
 import dataclasses
-import json
 import time
-from pathlib import Path
+import os
 from typing import List, Optional
 
-import numpy as np
@@ -15,15 +13,14 @@
 from vllm.inputs import PromptType
 from vllm.utils import FlexibleArgumentParser
 
+DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
+DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))
 
 def main(args: argparse.Namespace):
-    _ = xp.start_server(9012)
+    server = xp.start_server(9012)
Check failure on line 20 in profile/profile_tpu.py
GitHub Actions / ruff (3.12): Ruff (F841)
profile/profile_tpu.py:20:5: F841 Local variable `server` is assigned to but never used
     print(args)
 
     engine_args = EngineArgs.from_cli_args(args)
-
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
 
     sampling_params = SamplingParams(
@@ -41,11 +38,16 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            # For decode profiles ->
-            xp.trace_detached('localhost:9012', profile_dir, delay_ms=1000, duration_ms=2000)
-            # For prefill profiles ->
-            # xp.trace_detached('localhost:9012', profile_dir, duration_ms=3000)
-            # time.sleep(1.)
+            xp.trace_detached("localhost:9012",
+                              profile_dir,
+                              delay_ms=DELAY_MS,
+                              duration_ms=DURATION_MS)
+            if DELAY_MS == 0:
+                time.sleep(1.0)
+            # NOTE: for prefill, you could run this in a loop
+            # so that you get a trace of multiple prefill steps.
+            # NOTE: for decode, you will get traces of multiple
+            # steps because we generate 128 tokens.
         llm.generate(dummy_prompts,
                      sampling_params=sampling_params,
                      use_tqdm=False)
@@ -62,37 +64,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
     for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
         run_to_completion(profile_dir=None)
 
-    if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = Path(
-                "."
-            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
-    percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-
+    profile_dir = args.profile_result_dir
+    print(f"Profiling (results will be saved to '{profile_dir}')...")
+    run_to_completion(profile_dir=profile_dir)
+    return
 
 if __name__ == '__main__':
     parser = FlexibleArgumentParser(
@@ -103,27 +78,14 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument('--batch-size', type=int, default=8)
     parser.add_argument('--num-iters-warmup',
                         type=int,
-                        default=10,
+                        default=3,
                         help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=30,
-                        help='Number of iterations to run.')
-    parser.add_argument(
-        '--profile',
-        action='store_true',
-        help='profile the generation process of a single batch')
     parser.add_argument(
         '--profile-result-dir',
         type=str,
-        default=None,
+        default="profiles",
         help=('path to save the pytorch profiler output. Can be visualized '
               'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument(
-        '--output-json',
-        type=str,
-        default=None,
-        help='Path to save the latency results in JSON format.')
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
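For reference, this commit replaces the old hard-coded decode/prefill trace settings with the two VLLM_TPU_PROFILE_* environment variables. Below is a minimal sketch of driving both modes, assuming the CLI shown in the diff above; the numeric values simply mirror the old hard-coded comments (delay_ms=1000 / duration_ms=2000 for decode, duration_ms=3000 for prefill), and the output directories are illustrative, not recommended settings:

```python
# Sketch: invoking profile_tpu.py for a decode trace vs. a prefill trace.
# Assumes the env vars and flags introduced in this commit.
import os
import subprocess

def run_profile(delay_ms: int, duration_ms: int, out_dir: str) -> None:
    # Set the env vars before the script imports them at module load.
    env = dict(os.environ,
               VLLM_TPU_PROFILE_DELAY_MS=str(delay_ms),
               VLLM_TPU_PROFILE_DURATION_MS=str(duration_ms))
    subprocess.run(
        ["python", "profile/profile_tpu.py",
         "--batch-size", "8",
         "--profile-result-dir", out_dir],  # out_dir values are assumptions
        env=env, check=True)

# Decode: delay past the prefill step, then capture ~2s of decode steps.
run_profile(delay_ms=1000, duration_ms=2000, out_dir="profiles/decode")
# Prefill: no delay; the script itself sleeps 1s in this case.
run_profile(delay_ms=0, duration_ms=3000, out_dir="profiles/prefill")
```

Note that when DELAY_MS is 0 the script sleeps one second before generating, presumably so the detached trace is attached before generate() starts.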

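The new NOTE about looping for prefill traces is left abstract in the diff; one hypothetical shape for it, replacing the single generate() call inside run_to_completion(), might be:

```python
# Hypothetical prefill-profiling variant of the generate() call in
# run_to_completion(): repeat generation so several prefill steps fall
# inside the DURATION_MS trace window. The count of 4 is an assumption;
# pick enough iterations to fill the window.
for _ in range(4):
    llm.generate(dummy_prompts,
                 sampling_params=sampling_params,
                 use_tqdm=False)
```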
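On the Ruff F841 failure flagged in the diff: the binding `server` is assigned but never read. Two conventional ways to silence it, sketched here rather than taken from any follow-up commit:

```python
import torch_xla.debug.profiler as xp  # the usual alias for xp in TPU scripts

# Option 1: revert to the pre-commit throwaway binding.
_ = xp.start_server(9012)

# Option 2: keep the name but mark the non-use as deliberate for Ruff.
# server = xp.start_server(9012)  # noqa: F841
```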