From eade5d65e03b5650334bd1a47b4757e43dbe7c17 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Tue, 5 Nov 2024 14:29:48 -0800
Subject: [PATCH] [Frontend] Adjust try/except blocks in API impl

These were split into separate blocks in
https://github.com/vllm-project/vllm/pull/9759, but I feel it's
cleaner/clearer as a single block. It actually doesn't matter which parts
of the block raise the specific exceptions caught by the except clauses;
we still want to handle them in the same way.
---
 benchmarks/backend_request_func.py            | 11 ++++++++---
 vllm/entrypoints/openai/serving_completion.py |  8 ++------
 vllm/entrypoints/openai/serving_embedding.py  |  8 +++-----
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 0a903877f000d..ec6d6ad9647b4 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -230,6 +230,8 @@ async def async_request_openai_completions(
         ("completions", "profile")
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
+    stream = False
+
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
             "model": request_func_input.model,
@@ -238,7 +240,7 @@ async def async_request_openai_completions(
             "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
-            "stream": True,
+            "stream": stream,
             "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
@@ -263,9 +265,10 @@ async def async_request_openai_completions(
 
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                               "data: ")
-                        if chunk == "[DONE]":
+                        stream_is_done = stream and chunk == "[DONE]"
+                        if not stream or stream_is_done:
                             latency = time.perf_counter() - st
-                        else:
+                        if not stream_is_done:
                             data = json.loads(chunk)
 
                             # NOTE: Some completion API might have a last
@@ -379,10 +382,12 @@ async def async_request_openai_chat_completions(
                 else:
                     output.error = response.reason or ""
                     output.success = False
+                    print("Error reason", response.reason)
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
             output.error = "".join(traceback.format_exception(*exc_info))
+            traceback.print_exc()
 
     if pbar:
         pbar.update(1)
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 570232be38379..db31b1153d97e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -189,13 +189,7 @@ async def create_completion(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
-        except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
 
-        try:
             for i, final_res in enumerate(final_res_batch):
                 assert final_res is not None
 
@@ -217,6 +211,8 @@ async def create_completion(
                 tokenizer,
                 request_metadata,
             )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 917856cd2b2dd..bbe7db8f13231 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -205,12 +205,8 @@ async def create_embedding(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
 
-        try:
-            for final_res in final_res_batch:
-                assert final_res is not None
+            assert all(final_res is not None for final_res in final_res_batch)
 
             final_res_batch_checked = cast(List[EmbeddingRequestOutput],
                                            final_res_batch)
@@ -218,6 +214,8 @@ async def create_embedding(
             response = request_output_to_embedding_response(
                 final_res_batch_checked, request_id, created_time, model_name,
                 encoding_format)
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
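
For reviewers, here is a minimal sketch of the pattern the serving handlers
converge on. This is plain Python, not vLLM code: the names
(consume_and_respond, result_generator, _demo) are invented, and the real
handlers build protocol response objects rather than strings. The point is
only the control flow: result collection, validation, and response
construction share one try block, and asyncio.CancelledError and ValueError
are handled identically no matter which statement raised them.

import asyncio
from typing import AsyncIterator, List


async def consume_and_respond(result_generator: AsyncIterator[int]) -> str:
    # Hypothetical stand-in for an API handler: collect results, validate
    # them, and build a response, all inside a single try block.
    final_res_batch: List[int] = []
    try:
        async for res in result_generator:
            final_res_batch.append(res)

        # Validation can raise ValueError just as the collection loop can
        # raise CancelledError; both land in the shared except clauses.
        if any(res < 0 for res in final_res_batch):
            raise ValueError("negative result in batch")

        return f"ok: {final_res_batch}"
    except asyncio.CancelledError:
        # The client disconnected while results were still being consumed.
        return "error: Client disconnected"
    except ValueError as e:
        return f"error: {e}"


async def _demo() -> None:
    async def gen() -> AsyncIterator[int]:
        for i in range(3):
            yield i

    print(await consume_and_respond(gen()))  # -> ok: [0, 1, 2]


if __name__ == "__main__":
    asyncio.run(_demo())

Because both except clauses apply to the whole block, splitting the block in
two (as in the earlier change) does not alter behavior; it only duplicates the
error handling.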