diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 570232be38379..db31b1153d97e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -189,13 +189,7 @@ async def create_completion(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
-        except ValueError as e:
-            # TODO: Use a vllm-specific Validation Error
-            return self.create_error_response(str(e))
 
-        try:
             for i, final_res in enumerate(final_res_batch):
                 assert final_res is not None
 
@@ -217,6 +211,8 @@ async def create_completion(
                 tokenizer,
                 request_metadata,
             )
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 917856cd2b2dd..bbe7db8f13231 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -205,12 +205,8 @@ async def create_embedding(
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
-        except asyncio.CancelledError:
-            return self.create_error_response("Client disconnected")
 
-        try:
-            for final_res in final_res_batch:
-                assert final_res is not None
+            assert all(final_res is not None for final_res in final_res_batch)
 
             final_res_batch_checked = cast(List[EmbeddingRequestOutput],
                                            final_res_batch)
@@ -218,6 +214,8 @@ async def create_embedding(
             response = request_output_to_embedding_response(
                 final_res_batch_checked, request_id, created_time, model_name,
                 encoding_format)
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
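
Both files get the same treatment: the two separate try blocks (one around result collection, one around response construction) are merged, and the asyncio.CancelledError handler moves to the end of the combined block. The practical effect is that a client disconnect or a ValueError raised while building the response, not just while collecting results, is converted into an error response instead of escaping the handler. Below is a minimal self-contained sketch of the resulting control flow; fake_results, build_response, and ErrorResponse are hypothetical stand-ins, not vLLM APIs.

import asyncio
from typing import AsyncIterator, List, Optional, Tuple, Union


class ErrorResponse:
    """Hypothetical stand-in for vLLM's error-response object."""

    def __init__(self, message: str) -> None:
        self.message = message


async def fake_results(n: int) -> AsyncIterator[Tuple[int, str]]:
    """Hypothetical stand-in for result_generator: yields (index, result)."""
    for i in range(n):
        await asyncio.sleep(0)  # yield control, as a real engine iteration would
        yield i, f"result-{i}"


def build_response(batch: List[str]) -> str:
    """Hypothetical response builder; may raise ValueError during validation."""
    if not batch:
        raise ValueError("empty batch")
    return ", ".join(batch)


async def create_response(n: int) -> Union[str, ErrorResponse]:
    final_res_batch: List[Optional[str]] = [None] * n
    try:
        # Result collection and response construction share one try block,
        # mirroring the shape both hunks produce.
        async for i, res in fake_results(n):
            final_res_batch[i] = res

        assert all(res is not None for res in final_res_batch)

        # A ValueError (or a cancellation) raised here, while building the
        # response, is now caught below as well; previously only the
        # collection loop above was covered.
        return build_response([r for r in final_res_batch if r is not None])
    except asyncio.CancelledError:
        return ErrorResponse("Client disconnected")
    except ValueError as e:
        return ErrorResponse(str(e))


if __name__ == "__main__":
    print(asyncio.run(create_response(3)))  # result-0, result-1, result-2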