Adding support for the vLLM OpenAI entrypoint to the benchmarking script (#793)
* changes to support openai entrypoint

* cat command to show results

* defaulted entrypoint in vllm case, updated docs and versions

* readded comment

* fixed lint issue

* ran terraform fmt

* fixed comments

* changes to support jetstream
Edwinhr716 committed Sep 3, 2024
1 parent b22273b commit c872599
Showing 4 changed files with 29 additions and 10 deletions.
@@ -114,13 +114,15 @@ async def send_request(
     top_k: int,
     tokenizer: PreTrainedTokenizerBase,
     sax_model: str,
+    model: str,
 ) -> None:
   """Sends request to server."""
   request_start_time = time.time()
 
   headers = {"User-Agent": "Benchmark Client"}
   if backend == "vllm":
     pload = {
+        "model": model,
         "prompt": prompt,
         "n": 1,
         "best_of": best_of,
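For context, the vLLM OpenAI-compatible entrypoint expects the served model name alongside the usual sampling fields. A minimal, hedged sketch of the payload this branch now builds for the vllm backend; only "model", "prompt", "n", and "best_of" appear in the hunk above, and the model id and remaining sampling fields below are illustrative assumptions based on the OpenAI completions schema that vLLM accepts:

# Hedged sketch of an OpenAI-style completions request body (not the script's exact dict).
pload = {
    "model": "meta-llama/Llama-2-7b-hf",  # assumption: must match what the server was launched with
    "prompt": "Explain paged attention in one sentence.",
    "n": 1,
    "best_of": 1,
    "temperature": 1.0,   # assumed sampling defaults
    "top_p": 1.0,
    "max_tokens": 128,
    "stream": False,
}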
@@ -179,7 +181,7 @@ async def send_request(
   elif backend == "jetstream":
     pload = {
         "prompt": prompt,
-        "max_tokens": 1,
+        "max_tokens": output_len,
     }
   else:
     raise ValueError(f"Unknown backend: {backend}")
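The old value capped every JetStream request at a single generated token, so measured latency barely exercised decoding; requesting the sampled output length makes the numbers comparable with the other backends. A tiny sketch of the corrected request body, with an illustrative prompt and length (output_len is sampled per request from the dataset elsewhere in the script):

# Sketch only: values are illustrative, not taken from the script.
output_len = 256
pload = {"prompt": "Tell me a story about a robot.", "max_tokens": output_len}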
@@ -219,9 +221,8 @@ async def send_request(
     output_token_ids = tokenizer(output["generated_text"]).input_ids
     output_len = len(output_token_ids)
   elif backend == "vllm":
-    total_token_ids = tokenizer(output["text"][0]).input_ids
-    new_total_len = len(total_token_ids)
-    output_len = new_total_len - prompt_len
+    output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids
+    output_len = len(output_token_ids)
   elif backend == "jetstream":
     output_token_ids = tokenizer(output["response"]).input_ids
     output_len = len(output_token_ids)
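The legacy vLLM /generate endpoint echoed the prompt plus completion in output["text"][0], hence the prompt-length subtraction; the OpenAI-style response returns only the generated text under choices, so the new code tokenizes it directly. A self-contained sketch of that parsing path, where the response body and the tokenizer choice are illustrative assumptions:

from transformers import AutoTokenizer

# Illustrative OpenAI-style completions response; only choices[0]["text"] is read above.
output = {
    "id": "cmpl-123",
    "object": "text_completion",
    "choices": [
        {"index": 0,
         "text": " Paged attention stores the KV cache in fixed-size blocks.",
         "finish_reason": "stop"},
    ],
}

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the sketch
output_len = len(tokenizer(output["choices"][0]["text"]).input_ids)
print(output_len)  # counts generated tokens only; no prompt_len subtraction needed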
@@ -240,6 +241,7 @@ async def benchmark(
     top_k: int,
     tokenizer: PreTrainedTokenizerBase,
     sax_model: str,
+    model: str,
 ) -> None:
   """Runs benchmark with asynchronous requests."""
   tasks: List[asyncio.Task] = []
@@ -257,6 +259,7 @@ async def benchmark(
             top_k,
             tokenizer,
             sax_model,
+            model,
         )
     )
     tasks.append(task)
@@ -268,7 +271,13 @@ def main(args: argparse.Namespace):
   random.seed(args.seed)
   np.random.seed(args.seed)
 
-  api_url = f"http://{args.host}:{args.port}/{args.endpoint}"
+  endpoint = (
+      "v1/completions"
+      if args.backend == "vllm"
+      else args.endpoint
+  )
+
+  api_url = f"http://{args.host}:{args.port}/{endpoint}"
   tokenizer = AutoTokenizer.from_pretrained(
       args.tokenizer, trust_remote_code=args.trust_remote_code
   )
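With this change the vllm backend always targets the OpenAI-compatible completions route, while every other backend keeps honoring --endpoint. A small sketch of the resulting URL under the argparse defaults shown further down; the non-vllm endpoint value here is just a stand-in for whatever --endpoint was given:

backend = "vllm"
endpoint = "v1/completions" if backend == "vllm" else "generate"  # "generate" is a stand-in for args.endpoint
api_url = f"http://localhost:7080/{endpoint}"  # host/port are the argparse defaults below
print(api_url)  # -> http://localhost:7080/v1/completions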
@@ -293,6 +302,7 @@ def main(args: argparse.Namespace):
           args.top_k,
           tokenizer,
           args.sax_model,
+          args.model,
       )
   )
   benchmark_end_time = time.time()
@@ -388,6 +398,11 @@ def main(args: argparse.Namespace):
   parser.add_argument("--host", type=str, default="localhost")
   parser.add_argument("--port", type=int, default=7080)
   parser.add_argument("--dataset", type=str, help="Path to the dataset.")
+  parser.add_argument(
+      "--model",
+      type=str,
+      help="Name of the model.",
+  )
   parser.add_argument(
       "--tokenizer",
       type=str,
@@ -23,5 +23,8 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
-done
+  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
+  cat $output_file
+  sleep 5 # wait 5 seconds before next run
+done
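The script reuses $TOKENIZER for --model, which works when the vLLM OpenAI server was launched with the same Hugging Face model id that it tokenizes with. A hedged way to double-check the served model name before a run; this assumes the requests package, that the server is reachable on the same host/port the script uses, and that it exposes the standard OpenAI /v1/models route:

import requests

# List the model ids the OpenAI-compatible server reports; the --model value
# passed by the script (here $TOKENIZER) should match one of them exactly.
resp = requests.get("http://localhost:7080/v1/models", timeout=10)
for m in resp.json().get("data", []):
    print(m["id"])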

5 changes: 3 additions & 2 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
@@ -29,7 +29,8 @@ output_bucket = "your_project_id-benchmark-output-b
 k8s_hf_secret = "hf-token"
 
 # Benchmark configuration for Locust Docker accessing inference server
-request_rates = [5, 10, 15, 20]
+request_rates     = [5, 10, 15, 20]
+artifact_registry = "your_project_artifact_registry"
 
 # Model server configuration information
 targets = {
@@ -39,4 +40,4 @@ targets = {
     service_port = "your_model_service_service_port"
     tokenizer = "your_tokenizer"
   }
-}
+}
@@ -51,7 +51,7 @@ spec:
       - name: vllm
         ports:
         - containerPort: 80
-        image: "vllm/vllm-openai:v0.3.3"
+        image: "vllm/vllm-openai:v0.5.5"
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
         env:
