Fixes for SGLang Benchmark Test #910

Merged: 16 commits, Feb 5, 2025
39 changes: 32 additions & 7 deletions .github/workflows/ci-sglang-benchmark.yml
@@ -80,6 +80,10 @@ jobs:

pip freeze

- name: Login to huggingface
continue-on-error: true
run: huggingface-cli login --token ${{ secrets.HF_TOKEN }}

- name: Run Shortfin Benchmark Tests
run: |
source ${VENV_DIR}/bin/activate
@@ -110,11 +114,25 @@ jobs:
with:
python-version: ${{matrix.version}}

- name: Install SGLang
- name: Install deps
run: |
python -m pip install --no-compile --upgrade pip

pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"

pip install pytest

# Use newest possible releases to be able to track commits that may
# cause errors or performance changes.
pip install -r requirements-iree-unpinned.txt

pip install --no-compile \
-r sharktank/requirements-tests.txt \
-r shortfin/requirements-tests.txt \
-e sharktank/ shortfin/

pip freeze

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

@@ -158,19 +176,26 @@ jobs:
run: |
pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --port 30000 --log-cli-level=INFO --html=sglang_index.html --self-contained-html

- name: Upload pytest report
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
with:
name: sglang_benchmark
path: sglang_index.html

# Ensure that the container is always cleaned up after job
container_cleanup:
needs: benchmark_sglang
name: "Docker Cleanup"
if: always()
runs-on: mi300x-3
steps:
- name: Stop sglang-server
run: docker stop sglang-server || true # Stop container if it's running

# Deleting image after run due to large disk space requirement (83 GB)
- name: Cleanup SGLang Image
run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620

- name: Upload pytest report
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
with:
name: sglang_benchmark
path: sglang_index.html

merge_and_upload_reports:
name: "Merge and upload benchmark reports"
needs: [benchmark_shortfin, benchmark_sglang]
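For local runs outside CI, the new Hugging Face login step can be mirrored from Python. The sketch below is illustrative only and is not part of this PR; it assumes huggingface_hub is installed and that HF_TOKEN is exported in the environment, matching the secret used by the workflow.

# Sketch: authenticate to Hugging Face before running the benchmark tests locally,
# mirroring the workflow's `huggingface-cli login --token ${{ secrets.HF_TOKEN }}` step.
import os
from huggingface_hub import login

token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)  # allows gated model/tokenizer downloads to succeed
else:
    print("HF_TOKEN not set; downloads of gated repos may fail.")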
114 changes: 50 additions & 64 deletions app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
@@ -18,88 +18,74 @@
ModelConfig,
ModelProcessor,
ModelSource,
ModelArtifacts,
)
from integration_tests.llm.server_management import ServerInstance, ServerConfig

from integration_tests.llm import device_settings
from integration_tests.llm.logging_utils import start_log_group, end_log_group

logger = logging.getLogger(__name__)

MODEL_DIR_CACHE = {}


@pytest.fixture(scope="module")
def pre_process_model(request, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
# we can replace this with an import after #890 merges
TEST_MODELS = {
"llama3.1_8b": ModelConfig(
source=ModelSource.HUGGINGFACE,
repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF",
model_file="meta-llama-3.1-8b-instruct.f16.gguf",
tokenizer_id="NousResearch/Meta-Llama-3.1-8B",
batch_sizes=(1, 4),
device_settings=device_settings.GFX942,
),
}

logger.info(
"Preparing model artifacts..." + start_log_group("Preparing model artifacts")
)

param_key = hashlib.md5(str(request.param).encode()).hexdigest()
if (directory := MODEL_DIR_CACHE.get(param_key)) is not None:
logger.info(
f"Reusing existing model artifacts directory: {directory}" + end_log_group()
@pytest.fixture(scope="module")
def model_artifacts(tmp_path_factory, request):
"""Prepares model artifacts in a cached directory."""
model_config = TEST_MODELS[request.param]
cache_key = hashlib.md5(str(model_config).encode()).hexdigest()

cache_dir = tmp_path_factory.mktemp("model_cache")
model_dir = cache_dir / cache_key

# Return cached artifacts if available
if model_dir.exists():
return ModelArtifacts(
weights_path=model_dir / model_config.model_file,
tokenizer_path=model_dir / "tokenizer.json",
mlir_path=model_dir / "model.mlir",
vmfb_path=model_dir / "model.vmfb",
config_path=model_dir / "config.json",
)
return MODEL_DIR_CACHE[param_key]

model_name = request.param["model_name"]
model_param_file_name = request.param["model_param_file_name"]
settings = request.param["settings"]
batch_sizes = request.param["batch_sizes"]

# Configure model
config = ModelConfig(
model_file=model_param_file_name,
tokenizer_id=model_name, # Using model_name as tokenizer_id, adjust if needed
batch_sizes=batch_sizes,
device_settings=settings,
source=ModelSource.HUGGINGFACE,
repo_id=model_name, # Using model_name as repo_id, adjust if needed
)

# Process model through all stages
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)

logger.info("Model artifacts setup successfully" + end_log_group())
MODEL_DIR_CACHE[param_key] = tmp_dir
return tmp_dir
# Process model and create artifacts
processor = ModelProcessor(cache_dir)
return processor.process_model(model_config)


@pytest.fixture(scope="module")
def write_config(request, pre_process_model):
batch_sizes = request.param["batch_sizes"]
prefix_sharing_algorithm = request.param["prefix_sharing_algorithm"]

# Construct the new config filename
config_path = (
pre_process_model
/ f"{'_'.join(str(bs) for bs in batch_sizes)}_{prefix_sharing_algorithm}.json"
def server(model_artifacts, request):
"""Starts and manages the test server."""
model_id = request.param["model"]
model_config = TEST_MODELS[model_id]

server_config = ServerConfig(
artifacts=model_artifacts,
device_settings=model_config.device_settings,
prefix_sharing_algorithm=request.param.get("prefix_sharing", "none"),
)

# Read the base config file
base_config_path = pre_process_model / "config.json"
with open(base_config_path, "r") as f:
config = json.load(f)

# Override specific fields
config.update(
{
"prefill_batch_sizes": batch_sizes,
"decode_batch_sizes": batch_sizes,
"paged_kv_cache": {
**config.get(
"paged_kv_cache", {}
), # Preserve other paged_kv_cache settings
"prefix_sharing_algorithm": prefix_sharing_algorithm,
},
}
)
server_instance = ServerInstance(server_config)
server_instance.start()
process, port = server_instance.process, server_instance.port
yield process, port

logger.info(f"Saving edited config to: {config_path}\n")
logger.info(f"Config: {json.dumps(config, indent=2)}")
with open(config_path, "w") as f:
json.dump(config, f)
yield config_path
process.terminate()
process.wait()


def pytest_addoption(parser):
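The new model_artifacts and server fixtures are intended to be driven by indirect parametrization against the TEST_MODELS table above. The following usage sketch is hypothetical (the test name and parameter values are illustrative, and it assumes ServerInstance.process is a subprocess.Popen, as the terminate()/wait() teardown suggests):

# Hypothetical consumer of the new fixtures via pytest indirect parametrization.
import pytest

@pytest.mark.parametrize(
    "model_artifacts,server",
    [("llama3.1_8b", {"model": "llama3.1_8b", "prefix_sharing": "none"})],
    indirect=True,
)
def test_server_starts(model_artifacts, server):
    process, port = server           # the server fixture yields (process, port)
    assert process.poll() is None    # server process should still be running
    assert port > 0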
app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -8,6 +8,9 @@
import pytest
import time
from unittest.mock import patch
from transformers import AutoTokenizer
import os
import requests

pytest.importorskip("sglang")
from sglang import bench_serving
@@ -23,38 +26,56 @@
logger = logging.getLogger(__name__)


def download_tokenizer(local_dir, tokenizer_id):
# Set up tokenizer if it doesn't exist
tokenizer_path = local_dir / "tokenizer.json"
logger.info(f"Preparing tokenizer_path: {tokenizer_path}...")
if not os.path.exists(tokenizer_path):
logger.info(f"Downloading tokenizer {tokenizer_id} from Hugging Face...")
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_id,
)
tokenizer.save_pretrained(local_dir)
logger.info(f"Tokenizer saved to {tokenizer_path}")
else:
logger.info("Using cached tokenizer")


def wait_for_server(url, timeout):
logger.info(f"Waiting for server to start at {url}...")
start = time.time()
elapsed = 0
while elapsed <= timeout:
try:
requests.get(f"{url}/health")
logger.info("Server successfully started")
return
except requests.exceptions.ConnectionError:
logger.info(
f"Server has not started yet; waited {elapsed} seconds; timeout: {timeout} seconds."
)
time.sleep(1)
elapsed = time.time() - start
raise TimeoutError(f"Server did not start within {timeout} seconds at {url}")


@pytest.mark.parametrize(
"request_rate,tokenizer_id",
[(req_rate, "NousResearch/Meta-Llama-3-8B") for req_rate in [1, 2, 4, 8, 16, 32]],
)
def test_sglang_benchmark(request_rate, tokenizer_id, sglang_args, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")

# Download tokenizer using ModelProcessor
config = ModelConfig(
model_file="tokenizer.json", # Only need tokenizer
tokenizer_id=tokenizer_id,
batch_sizes=(1,), # Not relevant for tokenizer only
device_settings=None, # Not relevant for tokenizer only
source=ModelSource.HUGGINGFACE,
repo_id=tokenizer_id,
)
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)
download_tokenizer(tmp_dir, tokenizer_id)

logger.info("Beginning SGLang benchmark test...")

port = sglang_args
base_url = f"http://localhost:{port}"

# Wait for server using ServerInstance's method
server = ServerInstance(
None
) # We don't need config since we're just using wait_for_ready
server.port = int(port) # Set port manually since we didn't start the server
server.wait_for_ready(
timeout=600
) # High timeout for model artifacts download and server startup
# Setting a high timeout gives enough time for downloading model artifacts
# and starting up server... Takes a little longer than shortfin.
wait_for_server(base_url, timeout=600)

benchmark_args = SGLangBenchmarkArgs(
backend="sglang",
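The new wait_for_server helper only needs an HTTP endpoint that answers GET /health, so its polling and timeout behavior can be checked without SGLang at all. A minimal sketch (not part of the PR; the port is arbitrary and wait_for_server is assumed to be in scope from sglang_benchmark_test.py):

# Sketch: point wait_for_server() at a throwaway local HTTP server that answers /health.
import http.server
import threading

class _HealthHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()

    def log_message(self, *args):  # silence per-request logging
        pass

httpd = http.server.HTTPServer(("localhost", 30123), _HealthHandler)
threading.Thread(target=httpd.serve_forever, daemon=True).start()

wait_for_server("http://localhost:30123", timeout=10)  # returns once /health responds
httpd.shutdown()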