diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index f7d67692f697b..3243bb94f787c 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -24,6 +24,7 @@ class RequestFuncInput: model: str best_of: int = 1 use_beam_search: bool = False + logprobs: Optional[int] = None @dataclass @@ -236,6 +237,7 @@ async def async_request_openai_completions( "temperature": 0.0, "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, "stream": True, } headers = { diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 84f366bdba387..bdfa81be4208e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -318,6 +318,7 @@ async def benchmark( model_id: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], + logprobs: Optional[int], best_of: int, use_beam_search: bool, request_rate: float, @@ -339,6 +340,7 @@ async def benchmark( api_url=api_url, prompt_len=test_prompt_len, output_len=test_output_len, + logprobs=logprobs, best_of=best_of, use_beam_search=use_beam_search, ) @@ -358,6 +360,7 @@ async def benchmark( api_url=base_url + "/start_profile", prompt_len=test_prompt_len, output_len=test_output_len, + logprobs=logprobs, best_of=best_of, use_beam_search=use_beam_search, ) @@ -379,6 +382,7 @@ async def benchmark( api_url=api_url, prompt_len=prompt_len, output_len=output_len, + logprobs=logprobs, best_of=best_of, use_beam_search=use_beam_search, ) @@ -396,6 +400,7 @@ async def benchmark( api_url=base_url + "/stop_profile", prompt_len=test_prompt_len, output_len=test_output_len, + logprobs=logprobs, best_of=best_of, use_beam_search=use_beam_search, ) @@ -580,6 +585,7 @@ def main(args: argparse.Namespace): model_id=model_id, tokenizer=tokenizer, input_requests=input_requests, + logprobs=args.logprobs, best_of=args.best_of, use_beam_search=args.use_beam_search, request_rate=args.request_rate, @@ -721,6 +727,16 @@ def main(args: argparse.Namespace): help= "Number of output tokens per request, used only for sonnet dataset.", ) + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=("Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed"), + ) parser.add_argument( "--sonnet-prefix-len", type=int, diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 50c85df932e25..24ebb60a9cbfd 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -57,7 +57,7 @@ def test_multi_step_llm( GPU -> CPU output transfer num_prompts: number of example prompts under test num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs + completions endpoint; `None` -> 1 logprob returned. """ prompts = example_prompts