[Frontend] Adjust try/except blocks in API impl
These were changed to separate blocks in vllm-project#9759, but I feel it's cleaner/clearer as a single block. It actually doesn't matter which parts of the block raise the specific exceptions caught in the except clauses; we still want to handle them in the same way.
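To illustrate the reasoning above, here is a minimal, self-contained sketch of the single-block pattern (not vLLM code; `result_generator` and `build_response` are hypothetical stand-ins introduced only for this example): whether an exception is raised while consuming the generator or while building the response, the same except clauses handle it.

```python
import asyncio


# Hypothetical stand-ins for the result generator and the response-building
# step; they exist only to make the sketch runnable.
async def result_generator(n):
    for i in range(n):
        yield i, f"result-{i}"


def build_response(batch):
    if any(item is None for item in batch):
        raise ValueError("incomplete result batch")
    return {"results": batch}


async def create_completion_sketch(n=3):
    final_res_batch = [None] * n
    # A single try block: it does not matter whether CancelledError or
    # ValueError is raised while consuming the generator or while building
    # the response; either way they are handled identically below.
    try:
        async for i, res in result_generator(n):
            final_res_batch[i] = res
        return build_response(final_res_batch)
    except asyncio.CancelledError:
        return {"error": "Client disconnected"}
    except ValueError as e:
        return {"error": str(e)}


print(asyncio.run(create_completion_sketch()))
```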
njhill committed Nov 5, 2024
1 parent 235366f commit eade5d6
Showing 3 changed files with 13 additions and 14 deletions.
11 changes: 8 additions & 3 deletions benchmarks/backend_request_func.py
@@ -230,6 +230,8 @@ async def async_request_openai_completions(
         ("completions", "profile")
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
+    stream = False
+
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
             "model": request_func_input.model,
@@ -238,7 +240,7 @@
             "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
-            "stream": True,
+            "stream": stream,
             "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
@@ -263,9 +265,10 @@
 
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                               "data: ")
-                        if chunk == "[DONE]":
+                        stream_is_done = stream and chunk == "[DONE]"
+                        if not stream or stream_is_done:
                             latency = time.perf_counter() - st
-                        else:
+                        if not stream_is_done:
                             data = json.loads(chunk)
 
                             # NOTE: Some completion API might have a last
@@ -379,10 +382,12 @@ async def async_request_openai_chat_completions(
                 else:
                     output.error = response.reason or ""
                     output.success = False
+                    print("Error reason", response.reason)
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
             output.error = "".join(traceback.format_exception(*exc_info))
+            traceback.print_exc()
 
         if pbar:
             pbar.update(1)
8 changes: 2 additions & 6 deletions vllm/entrypoints/openai/serving_completion.py
@@ -189,13 +189,7 @@ async def create_completion(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
-        except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
 
-        try:
             for i, final_res in enumerate(final_res_batch):
                 assert final_res is not None
 
@@ -217,6 +211,8 @@
                 tokenizer,
                 request_metadata,
             )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
8 changes: 3 additions & 5 deletions vllm/entrypoints/openai/serving_embedding.py
@@ -205,19 +205,17 @@ async def create_embedding(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
 
-        try:
-            for final_res in final_res_batch:
-                assert final_res is not None
+            assert all(final_res is not None for final_res in final_res_batch)
 
             final_res_batch_checked = cast(List[EmbeddingRequestOutput],
                                            final_res_batch)
 
             response = request_output_to_embedding_response(
                 final_res_batch_checked, request_id, created_time, model_name,
                 encoding_format)
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
