From a52610fb19ca32c39fa431559e674ea50c803568 Mon Sep 17 00:00:00 2001
From: randoentity <random>
Date: Sun, 24 Nov 2024 13:40:33 +0100
Subject: [PATCH 1/2] OAI: Workaround for tool calling (unpack apply_chat_template tuple)

---
 endpoints/OAI/utils/chat_completion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 14a2243..9fe9aa8 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,11 +454,11 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, _ = await apply_chat_template(data, gen["text"])
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, _ = await apply_chat_template(data, current_generations)
 
             gen_tasks.append(
                 asyncio.create_task(

From 2e06fb01d3703e71075ade2a5c202309f68d52ad Mon Sep 17 00:00:00 2001
From: kingbri <bdashore3@proton.me>
Date: Thu, 28 Nov 2024 23:27:59 -0500
Subject: [PATCH 2/2] OAI: Pass mm_embeddings to tool call generation

Don't exclude the vision embeddings when regenerating for a tool call.

Signed-off-by: kingbri <bdashore3@proton.me>
---
 endpoints/OAI/utils/chat_completion.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 9fe9aa8..a646924 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,16 +454,23 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt, _ = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, gen["text"]
+                )
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt, _ = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, current_generations
+                )
 
             gen_tasks.append(
                 asyncio.create_task(
                     model.container.generate(
-                        pre_tool_prompt, request.state.id, **gen_params
+                        pre_tool_prompt,
+                        request.state.id,
+                        embeddings=mm_embeddings,
+                        **gen_params,
                     )
                 )
             )