Fixes for SGLang Benchmark Test #910

Merged: 16 commits, Feb 5, 2025
39 changes: 32 additions & 7 deletions .github/workflows/ci-sglang-benchmark.yml
@@ -80,6 +80,10 @@ jobs:

pip freeze

- name: Login to huggingface
continue-on-error: true
run: huggingface-cli login --token ${{ secrets.HF_TOKEN }}

- name: Run Shortfin Benchmark Tests
run: |
source ${VENV_DIR}/bin/activate
@@ -110,11 +114,25 @@ jobs:
with:
python-version: ${{matrix.version}}

- name: Install SGLang
- name: Install deps
run: |
python -m pip install --no-compile --upgrade pip

pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"

pip install pytest

# Use newest possible releases to be able to track commits that may
# cause errors or performance changes.
pip install -r requirements-iree-unpinned.txt

pip install --no-compile \
-r sharktank/requirements-tests.txt \
-r shortfin/requirements-tests.txt \
-e sharktank/ shortfin/

pip freeze

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

@@ -158,19 +176,26 @@ jobs:
run: |
pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --port 30000 --log-cli-level=INFO --html=sglang_index.html --self-contained-html

- name: Upload pytest report
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
with:
name: sglang_benchmark
path: sglang_index.html

# Ensure that the container is always cleaned up after job
container_cleanup:
needs: benchmark_sglang
name: "Docker Cleanup"
if: always()
runs-on: mi300x-3
steps:
- name: Stop sglang-server
run: docker stop sglang-server || true # Stop container if it's running

# Deleting image after run due to large disk space requirement (83 GB)
- name: Cleanup SGLang Image
run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620

- name: Upload pytest report
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
with:
name: sglang_benchmark
path: sglang_index.html

merge_and_upload_reports:
name: "Merge and upload benchmark reports"
needs: [benchmark_shortfin, benchmark_sglang]
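For local runs outside CI, the new Hugging Face login step can be mirrored from Python. The sketch below is illustrative only and is not part of this PR; it assumes huggingface_hub is installed and that HF_TOKEN is exported in the environment, matching the secret used by the workflow.

# Sketch: authenticate to Hugging Face before running the benchmark tests locally,
# mirroring the workflow's `huggingface-cli login --token ${{ secrets.HF_TOKEN }}` step.
import os
from huggingface_hub import login

token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)  # allows gated model/tokenizer downloads to succeed
else:
    print("HF_TOKEN not set; downloads of gated repos may fail.")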
114 changes: 50 additions & 64 deletions app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
@@ -18,88 +18,74 @@
ModelConfig,
ModelProcessor,
ModelSource,
ModelArtifacts,
)
from integration_tests.llm.server_management import ServerInstance, ServerConfig

from integration_tests.llm import device_settings
from integration_tests.llm.logging_utils import start_log_group, end_log_group

logger = logging.getLogger(__name__)

MODEL_DIR_CACHE = {}


@pytest.fixture(scope="module")
def pre_process_model(request, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
# we can replace this with an import after #890 merges
TEST_MODELS = {
"llama3.1_8b": ModelConfig(
source=ModelSource.HUGGINGFACE,
repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF",
model_file="meta-llama-3.1-8b-instruct.f16.gguf",
tokenizer_id="NousResearch/Meta-Llama-3.1-8B",
batch_sizes=(1, 4),
device_settings=device_settings.GFX942,
),
}

logger.info(
"Preparing model artifacts..." + start_log_group("Preparing model artifacts")
)

param_key = hashlib.md5(str(request.param).encode()).hexdigest()
if (directory := MODEL_DIR_CACHE.get(param_key)) is not None:
logger.info(
f"Reusing existing model artifacts directory: {directory}" + end_log_group()
@pytest.fixture(scope="module")
def model_artifacts(tmp_path_factory, request):
"""Prepares model artifacts in a cached directory."""
model_config = TEST_MODELS[request.param]
cache_key = hashlib.md5(str(model_config).encode()).hexdigest()

cache_dir = tmp_path_factory.mktemp("model_cache")
model_dir = cache_dir / cache_key

# Return cached artifacts if available
if model_dir.exists():
return ModelArtifacts(
weights_path=model_dir / model_config.model_file,
tokenizer_path=model_dir / "tokenizer.json",
mlir_path=model_dir / "model.mlir",
vmfb_path=model_dir / "model.vmfb",
config_path=model_dir / "config.json",
)
return MODEL_DIR_CACHE[param_key]

model_name = request.param["model_name"]
model_param_file_name = request.param["model_param_file_name"]
settings = request.param["settings"]
batch_sizes = request.param["batch_sizes"]

# Configure model
config = ModelConfig(
model_file=model_param_file_name,
tokenizer_id=model_name, # Using model_name as tokenizer_id, adjust if needed
batch_sizes=batch_sizes,
device_settings=settings,
source=ModelSource.HUGGINGFACE,
repo_id=model_name, # Using model_name as repo_id, adjust if needed
)

# Process model through all stages
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)

logger.info("Model artifacts setup successfully" + end_log_group())
MODEL_DIR_CACHE[param_key] = tmp_dir
return tmp_dir
# Process model and create artifacts
processor = ModelProcessor(cache_dir)
return processor.process_model(model_config)


@pytest.fixture(scope="module")
def write_config(request, pre_process_model):
batch_sizes = request.param["batch_sizes"]
prefix_sharing_algorithm = request.param["prefix_sharing_algorithm"]

# Construct the new config filename
config_path = (
pre_process_model
/ f"{'_'.join(str(bs) for bs in batch_sizes)}_{prefix_sharing_algorithm}.json"
def server(model_artifacts, request):
"""Starts and manages the test server."""
model_id = request.param["model"]
model_config = TEST_MODELS[model_id]

server_config = ServerConfig(
artifacts=model_artifacts,
device_settings=model_config.device_settings,
prefix_sharing_algorithm=request.param.get("prefix_sharing", "none"),
)

# Read the base config file
base_config_path = pre_process_model / "config.json"
with open(base_config_path, "r") as f:
config = json.load(f)

# Override specific fields
config.update(
{
"prefill_batch_sizes": batch_sizes,
"decode_batch_sizes": batch_sizes,
"paged_kv_cache": {
**config.get(
"paged_kv_cache", {}
), # Preserve other paged_kv_cache settings
"prefix_sharing_algorithm": prefix_sharing_algorithm,
},
}
)
server_instance = ServerInstance(server_config)
server_instance.start()
process, port = server_instance.process, server_instance.port
yield process, port

logger.info(f"Saving edited config to: {config_path}\n")
logger.info(f"Config: {json.dumps(config, indent=2)}")
with open(config_path, "w") as f:
json.dump(config, f)
yield config_path
process.terminate()
process.wait()


def pytest_addoption(parser):
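The new model_artifacts and server fixtures are intended to be driven by indirect parametrization against the TEST_MODELS table above. The following usage sketch is hypothetical (the test name and parameter values are illustrative, and it assumes ServerInstance.process is a subprocess.Popen, as the terminate()/wait() teardown suggests):

# Hypothetical consumer of the new fixtures via pytest indirect parametrization.
import pytest

@pytest.mark.parametrize(
    "model_artifacts,server",
    [("llama3.1_8b", {"model": "llama3.1_8b", "prefix_sharing": "none"})],
    indirect=True,
)
def test_server_starts(model_artifacts, server):
    process, port = server           # the server fixture yields (process, port)
    assert process.poll() is None    # server process should still be running
    assert port > 0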
app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -8,6 +8,9 @@
import pytest
import time
from unittest.mock import patch
from transformers import AutoTokenizer
import os
import requests

pytest.importorskip("sglang")
from sglang import bench_serving
@@ -23,38 +26,56 @@
logger = logging.getLogger(__name__)


def download_tokenizer(local_dir, tokenizer_id):
# Set up tokenizer if it doesn't exist
tokenizer_path = local_dir / "tokenizer.json"
logger.info(f"Preparing tokenizer_path: {tokenizer_path}...")
if not os.path.exists(tokenizer_path):
logger.info(f"Downloading tokenizer {tokenizer_id} from Hugging Face...")
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_id,
)
tokenizer.save_pretrained(local_dir)
logger.info(f"Tokenizer saved to {tokenizer_path}")
else:
logger.info("Using cached tokenizer")


def wait_for_server(url, timeout):
logger.info(f"Waiting for server to start at {url}...")
start = time.time()
elapsed = 0
while elapsed <= timeout:
try:
requests.get(f"{url}/health")
logger.info("Server successfully started")
return
except requests.exceptions.ConnectionError:
logger.info(
f"Server has not started yet; waited {elapsed} seconds; timeout: {timeout} seconds."
)
time.sleep(1)
elapsed = time.time() - start
raise TimeoutError(f"Server did not start within {timeout} seconds at {url}")


@pytest.mark.parametrize(
"request_rate,tokenizer_id",
[(req_rate, "NousResearch/Meta-Llama-3-8B") for req_rate in [1, 2, 4, 8, 16, 32]],
)
def test_sglang_benchmark(request_rate, tokenizer_id, sglang_args, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")

# Download tokenizer using ModelProcessor
config = ModelConfig(
model_file="tokenizer.json", # Only need tokenizer
tokenizer_id=tokenizer_id,
batch_sizes=(1,), # Not relevant for tokenizer only
device_settings=None, # Not relevant for tokenizer only
source=ModelSource.HUGGINGFACE,
repo_id=tokenizer_id,
)
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)
download_tokenizer(tmp_dir, tokenizer_id)

logger.info("Beginning SGLang benchmark test...")

port = sglang_args
base_url = f"http://localhost:{port}"

# Wait for server using ServerInstance's method
server = ServerInstance(
None
) # We don't need config since we're just using wait_for_ready
server.port = int(port) # Set port manually since we didn't start the server
server.wait_for_ready(
timeout=600
) # High timeout for model artifacts download and server startup
# Setting a high timeout gives enough time for downloading model artifacts
# and starting up server... Takes a little longer than shortfin.
wait_for_server(base_url, timeout=600)

benchmark_args = SGLangBenchmarkArgs(
backend="sglang",
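The new wait_for_server helper only needs an HTTP endpoint that answers GET /health, so its polling and timeout behavior can be checked without SGLang at all. A minimal sketch (not part of the PR; the port is arbitrary and wait_for_server is assumed to be in scope from sglang_benchmark_test.py):

# Sketch: point wait_for_server() at a throwaway local HTTP server that answers /health.
import http.server
import threading

class _HealthHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()

    def log_message(self, *args):  # silence per-request logging
        pass

httpd = http.server.HTTPServer(("localhost", 30123), _HealthHandler)
threading.Thread(target=httpd.serve_forever, daemon=True).start()

wait_for_server("http://localhost:30123", timeout=10)  # returns once /health responds
httpd.shutdown()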