Move SGLang related tests #601

Merged · 4 commits · Nov 25, 2024
4 changes: 2 additions & 2 deletions .github/workflows/ci-sglang-benchmark.yml
@@ -28,7 +28,7 @@ jobs:
matrix:
version: [3.11]
fail-fast: false
runs-on: llama-mi300x-3
runs-on: mi300x-4
defaults:
run:
shell: bash
@@ -78,7 +78,7 @@ jobs:
run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"

- name: Launch Shortfin Server
run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html
run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html

- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
2 changes: 1 addition & 1 deletion .github/workflows/ci-sglang-integration-tests.yml
@@ -29,7 +29,7 @@ jobs:
matrix:
version: [3.11]
fail-fast: false
runs-on: llama-mi300x-3
runs-on: mi300x-4
defaults:
run:
shell: bash
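Both workflow edits make the same pair of changes: the jobs move from the llama-mi300x-3 runner to mi300x-4, and the benchmark workflow now points pytest at the test's new location inside the sglang_benchmarks package. The sketch below is a hypothetical way to reproduce that benchmark step locally; it assumes a repository checkout with shortfin, the nod-ai sglang fork, and an MI300X/ROCm environment already set up, just as the workflow prepares them.

```python
# Hypothetical local reproduction of the "Launch Shortfin Server" CI step.
# The pytest arguments mirror the updated workflow; the surrounding
# environment (ROCm, shortfin, sglang fork) is assumed to be in place.
import subprocess

cmd = [
    "pytest",
    "-v",
    "app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py",
    "--log-cli-level=INFO",
    "--html=out/llm/sglang/index.html",
]
subprocess.run(cmd, check=True)
```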
5 changes: 5 additions & 0 deletions app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py
@@ -0,0 +1,5 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
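The new __init__.py above (just the license header) makes sglang_benchmarks an importable package, which is what allows the relative `from .utils import ...` used by the moved test module further down. A quick smoke test of the new layout might look like the following; it assumes the repository root is on sys.path and that the intermediate directories resolve as packages (or are handled by pytest's rootdir-based importing).

```python
# Hypothetical smoke test for the new package layout; assumes the repository
# root is on sys.path and the intermediate directories are importable.
import importlib

utils = importlib.import_module(
    "app_tests.benchmark_tests.llm.sglang_benchmarks.utils"
)
print(utils.SGLangBenchmarkArgs, utils.log_jsonl_result)
```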
@@ -9,22 +9,32 @@
import pytest
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from integration_tests.llm.utils import compile_model, export_paged_llm_v1
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
)
from integration_tests.llm.utils import (
compile_model,
export_paged_llm_v1,
download_with_hf_datasets,
)


@pytest.fixture(scope="module")
def pre_process_model(request, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")

model_path = request.param["model_path"]
model_name = request.param["model_name"]
model_param_file_name = request.param["model_param_file_name"]
settings = request.param["settings"]
batch_sizes = request.param["batch_sizes"]

mlir_path = tmp_dir / "model.mlir"
config_path = tmp_dir / "config.json"
vmfb_path = tmp_dir / "model.vmfb"

model_path = tmp_dir / model_param_file_name
download_with_hf_datasets(tmp_dir, model_name)

export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes)

config = {
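The reworked pre_process_model fixture above no longer assumes a GGUF sitting under /data: it takes model_name and model_param_file_name from the test parametrization, downloads the weights into a pytest temp directory with download_with_hf_datasets, and exports and compiles from there. Below is a minimal sketch of driving the fixture through indirect parametrization, using the same keys the fixture reads; the empty device_flags list is a placeholder rather than the real settings.

```python
import pytest

# Placeholder settings; the real flags live in the test module's
# device_settings dict.
device_settings = {"device_flags": [], "device": "hip"}


@pytest.mark.parametrize(
    "pre_process_model",
    [
        {
            "model_name": "llama3_8B_fp16",
            "model_param_file_name": "meta-llama-3.1-8b-instruct.f16.gguf",
            "settings": device_settings,
            "batch_sizes": [1, 4],
        }
    ],
    indirect=True,
)
def test_exported_artifacts_exist(pre_process_model):
    # The fixture hands back the temp directory it populated.
    tmp_dir = pre_process_model
    assert (tmp_dir / "model.mlir").exists()
```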
@@ -4,7 +4,6 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import json
import logging
import multiprocessing
import os
@@ -16,14 +15,14 @@
pytest.importorskip("sglang")
from sglang import bench_serving

from utils import SGLangBenchmarkArgs
from .utils import SGLangBenchmarkArgs, log_jsonl_result

from integration_tests.llm.utils import (
find_available_port,
start_llm_server,
)

logger = logging.getLogger("__name__")
logger = logging.getLogger(__name__)

device_settings = {
"device_flags": [
@@ -33,46 +32,40 @@
"device": "hip",
}

# TODO: Download on demand instead of assuming files exist at this path
MODEL_PATH = Path("/data/llama3.1/8b/llama8b_f16.irpa")
TOKENIZER_DIR = Path("/data/llama3.1/8b/")


def log_jsonl_result(file_path):
with open(file_path, "r") as file:
json_string = file.readline().strip()

json_data = json.loads(json_string)
for key, val in json_data.items():
logger.info(f"{key.upper()}: {val}")


@pytest.mark.parametrize(
"request_rate",
[1, 2, 4, 8, 16, 32],
"request_rate,model_param_file_name",
[
(req_rate, "meta-llama-3.1-8b-instruct.f16.gguf")
for req_rate in [1, 2, 4, 8, 16, 32]
],
)
@pytest.mark.parametrize(
"pre_process_model",
[
(
{
"model_path": MODEL_PATH,
"model_name": "llama3_8B_fp16",
"model_param_file_name": "meta-llama-3.1-8b-instruct.f16.gguf",
"settings": device_settings,
"batch_sizes": [1, 4],
}
)
],
indirect=True,
)
def test_sglang_benchmark_server(request_rate, pre_process_model):
def test_sglang_benchmark_server(
request_rate, model_param_file_name, pre_process_model
):
# TODO: Remove when multi-device is fixed
os.environ["ROCR_VISIBLE_DEVICES"] = "1"

tmp_dir = pre_process_model

config_path = tmp_dir / "config.json"
vmfb_path = tmp_dir / "model.vmfb"
tokenizer_path = TOKENIZER_DIR / "tokenizer.json"
tokenizer_path = tmp_dir / "tokenizer.json"
model_path = tmp_dir / model_param_file_name

# Start shortfin llm server
port = find_available_port()
@@ -81,7 +74,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model):
tokenizer_path,
config_path,
vmfb_path,
MODEL_PATH,
model_path,
device_settings,
timeout=30,
)
@@ -91,7 +84,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model):
backend="shortfin",
num_prompt=10,
base_url=f"http://localhost:{port}",
tokenizer=TOKENIZER_DIR,
tokenizer=tmp_dir,
request_rate=request_rate,
)
output_file = (
@@ -116,7 +109,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model):
logger.info("======== RESULTS ========")
log_jsonl_result(benchmark_args.output_file)
except Exception as e:
logger.info(e)
logger.error(e)

server_process.terminate()
server_process.wait()
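A small but worthwhile fix in this module (mirrored in app_tests/integration_tests/llm/utils.py further down) is replacing logging.getLogger("__name__") with logging.getLogger(__name__). The quoted form creates a logger literally named "__name__", shared by every module that repeats the typo; the module attribute gives the conventional per-module logger hierarchy. A quick illustration:

```python
import logging

# The quoted string creates a logger literally called "__name__", so every
# module with the same typo logs under one indistinguishable name.
typo_logger = logging.getLogger("__name__")
print(typo_logger.name)  # -> __name__

# The module attribute yields the usual dotted per-module name, which keeps
# log records filterable and attributable by module.
module_logger = logging.getLogger(__name__)
print(module_logger.name)  # -> "__main__" as a script, else the module's dotted path
```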
@@ -6,8 +6,12 @@

from argparse import Namespace
from dataclasses import dataclass
import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class SGLangBenchmarkArgs:
@@ -54,3 +58,12 @@ def __repr__(self):
f"Tokenizer: {self.tokenizer}\n"
f"Request Rate: {self.request_rate}"
)


def log_jsonl_result(file_path):
with open(file_path, "r") as file:
json_string = file.readline().strip()

json_data = json.loads(json_string)
for key, val in json_data.items():
logger.info(f"{key.upper()}: {val}")
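log_jsonl_result now lives next to SGLangBenchmarkArgs in the package's utils module instead of being private to the benchmark test, with json/logging imports added here to match. A hedged usage sketch follows; the import path and the result keys are illustrative stand-ins for whatever bench_serving actually writes out.

```python
import json
import logging
import tempfile
from pathlib import Path

# Assumed import path for the relocated helper; adjust to however the
# sglang_benchmarks package is reachable on sys.path in your environment.
from app_tests.benchmark_tests.llm.sglang_benchmarks.utils import log_jsonl_result

logging.basicConfig(level=logging.INFO)

# Illustrative keys only; the real file is the single-line JSONL that
# bench_serving emits.
results_file = Path(tempfile.mkdtemp()) / "benchmark_results.jsonl"
results_file.write_text(json.dumps({"request_rate": 1, "median_ttft_ms": 123.4}) + "\n")

log_jsonl_result(results_file)  # logs REQUEST_RATE: 1, MEDIAN_TTFT_MS: 123.4
```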
2 changes: 1 addition & 1 deletion app_tests/integration_tests/llm/utils.py
@@ -15,7 +15,7 @@
import requests
from transformers import AutoTokenizer

logger = logging.getLogger("__name__")
logger = logging.getLogger(__name__)


class AccuracyValidationException(RuntimeError):