From eade5d65e03b5650334bd1a47b4757e43dbe7c17 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Tue, 5 Nov 2024 14:29:48 -0800
Subject: [PATCH] [Frontend] Adjust try/except blocks in API impl

These were split into separate blocks in
https://github.com/vllm-project/vllm/pull/9759, but I feel it's
cleaner/clearer as a single block. It actually doesn't matter which parts
of the block raise the specific exceptions caught by the except clauses;
we still want to handle them in the same way.
---
 benchmarks/backend_request_func.py            | 11 ++++++++---
 vllm/entrypoints/openai/serving_completion.py |  8 ++------
 vllm/entrypoints/openai/serving_embedding.py  |  8 +++-----
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 0a903877f000d..ec6d6ad9647b4 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -230,6 +230,8 @@ async def async_request_openai_completions(
         ("completions", "profile")
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
+    stream = False
+
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
             "model": request_func_input.model,
@@ -238,7 +240,7 @@ async def async_request_openai_completions(
             "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
-            "stream": True,
+            "stream": stream,
             "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
@@ -263,9 +265,10 @@ async def async_request_openai_completions(
 
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                               "data: ")
-                        if chunk == "[DONE]":
+                        stream_is_done = stream and chunk == "[DONE]"
+                        if not stream or stream_is_done:
                             latency = time.perf_counter() - st
-                        else:
+                        if not stream_is_done:
                             data = json.loads(chunk)
 
                             # NOTE: Some completion API might have a last
@@ -379,10 +382,12 @@ async def async_request_openai_chat_completions(
                 else:
                     output.error = response.reason or ""
                     output.success = False
+                    print("Error reason", response.reason)
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
             output.error = "".join(traceback.format_exception(*exc_info))
+            traceback.print_exc()
 
     if pbar:
         pbar.update(1)
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 570232be38379..db31b1153d97e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -189,13 +189,7 @@ async def create_completion(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
-        except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
 
-        try:
             for i, final_res in enumerate(final_res_batch):
                 assert final_res is not None
 
@@ -217,6 +211,8 @@ async def create_completion(
                 tokenizer,
                 request_metadata,
             )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 917856cd2b2dd..bbe7db8f13231 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -205,12 +205,8 @@ async def create_embedding(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
 
-        try:
-            for final_res in final_res_batch:
-                assert final_res is not None
+            assert all(final_res is not None for final_res in final_res_batch)
 
             final_res_batch_checked = cast(List[EmbeddingRequestOutput],
                                            final_res_batch)
@@ -218,6 +214,8 @@ async def create_embedding(
             response = request_output_to_embedding_response(
                 final_res_batch_checked, request_id, created_time, model_name,
                 encoding_format)
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
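
For reviewers, here is a minimal sketch of the pattern the serving handlers
converge on. This is plain Python, not vLLM code: the names
(consume_and_respond, result_generator, _demo) are invented, and the real
handlers build protocol response objects rather than strings. The point is
only the control flow: result collection, validation, and response
construction share one try block, and asyncio.CancelledError and ValueError
are handled identically no matter which statement raised them.

import asyncio
from typing import AsyncIterator, List


async def consume_and_respond(result_generator: AsyncIterator[int]) -> str:
    # Hypothetical stand-in for an API handler: collect results, validate
    # them, and build a response, all inside a single try block.
    final_res_batch: List[int] = []
    try:
        async for res in result_generator:
            final_res_batch.append(res)

        # Validation can raise ValueError just as the collection loop can
        # raise CancelledError; both land in the shared except clauses.
        if any(res < 0 for res in final_res_batch):
            raise ValueError("negative result in batch")

        return f"ok: {final_res_batch}"
    except asyncio.CancelledError:
        # The client disconnected while results were still being consumed.
        return "error: Client disconnected"
    except ValueError as e:
        return f"error: {e}"


async def _demo() -> None:
    async def gen() -> AsyncIterator[int]:
        for i in range(3):
            yield i

    print(await consume_and_respond(gen()))  # -> ok: [0, 1, 2]


if __name__ == "__main__":
    asyncio.run(_demo())

Because both except clauses apply to the whole block, splitting the block in
two (as in the earlier change) does not alter behavior; it only duplicates the
error handling.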