From 77303da5516341bbdc78572c0ad83628b0ee5889 Mon Sep 17 00:00:00 2001
From: Pamela Fox
Date: Tue, 21 May 2024 16:26:21 -0700
Subject: [PATCH] Improve token counting for messages with package (#1577)

* Disable openai key access

* Use message token helper instead

* Update to latest package

* Revert launch change

* Improve typing
---
 app/backend/approaches/approach.py            |   7 +-
 app/backend/approaches/chatapproach.py        |  76 ++------
 .../approaches/chatreadretrieveread.py        |  39 +++--
 .../approaches/chatreadretrievereadvision.py  |  37 ++--
 app/backend/approaches/retrievethenread.py    |  30 ++--
 .../approaches/retrievethenreadvision.py      |  26 +--
 app/backend/core/imageshelper.py              |  52 ------
 app/backend/core/messagebuilder.py            |  66 -------
 app/backend/core/modelhelper.py               |  68 --------
 app/backend/requirements.in                   |   1 +
 app/backend/requirements.txt                  |  17 +-
 tests/test_chatapproach.py                    | 164 ------------------
 tests/test_imageshelper.py                    |  34 ----
 tests/test_messagebuilder.py                  |  49 ------
 tests/test_modelhelper.py                     |  93 ----------
 15 files changed, 109 insertions(+), 650 deletions(-)
 delete mode 100644 app/backend/core/messagebuilder.py
 delete mode 100644 app/backend/core/modelhelper.py
 delete mode 100644 tests/test_imageshelper.py
 delete mode 100644 tests/test_messagebuilder.py
 delete mode 100644 tests/test_modelhelper.py

diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py
index 7ff25b449b..b5a71e33e6 100644
--- a/app/backend/approaches/approach.py
+++ b/app/backend/approaches/approach.py
@@ -23,6 +23,7 @@
     VectorQuery,
 )
 from openai import AsyncOpenAI
+from openai.types.chat import ChatCompletionMessageParam

 from core.authentication import AuthenticationHelper
 from text import nonewlines
@@ -254,6 +255,10 @@ async def compute_image_embedding(self, q: str):
         return VectorizedQuery(vector=image_query_vector, k_nearest_neighbors=50, fields="imageEmbedding")

     async def run(
-        self, messages: list[dict], stream: bool = False, session_state: Any = None, context: dict[str, Any] = {}
+        self,
+        messages: list[ChatCompletionMessageParam],
+        stream: bool = False,
+        session_state: Any = None,
+        context: dict[str, Any] = {},
     ) -> Union[dict[str, Any], AsyncGenerator[dict[str, Any], None]]:
         raise NotImplementedError

diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py
index e3f2e492f3..4bad676716 100644
--- a/app/backend/approaches/chatapproach.py
+++ b/app/backend/approaches/chatapproach.py
@@ -1,30 +1,19 @@
 import json
-import logging
 import re
 from abc import ABC, abstractmethod
 from typing import Any, AsyncGenerator, Optional, Union

-from openai.types.chat import (
-    ChatCompletion,
-    ChatCompletionContentPartParam,
-    ChatCompletionMessageParam,
-)
+from openai.types.chat import ChatCompletion, ChatCompletionMessageParam

 from approaches.approach import Approach
-from core.messagebuilder import MessageBuilder


 class ChatApproach(Approach, ABC):
-    # Chat roles
-    SYSTEM = "system"
-    USER = "user"
-    ASSISTANT = "assistant"
-
-    query_prompt_few_shots = [
-        {"role": USER, "content": "How did crypto do last year?"},
-        {"role": ASSISTANT, "content": "Summarize Cryptocurrency Market Dynamics from last year"},
-        {"role": USER, "content": "What are my health plans?"},
-        {"role": ASSISTANT, "content": "Show available health plans"},
+    query_prompt_few_shots: list[ChatCompletionMessageParam] = [
+        {"role": "user", "content": "How did crypto do last year?"},
+        {"role": "assistant", "content": "Summarize Cryptocurrency Market Dynamics from last year"},
+        {"role": "user", "content": "What are my health plans?"},
"content": "What are my health plans?"}, + {"role": "assistant", "content": "Show available health plans"}, ] NO_RESPONSE = "0" @@ -53,7 +42,7 @@ def system_message_chat_conversation(self) -> str: pass @abstractmethod - async def run_until_final_call(self, history, overrides, auth_claims, should_stream) -> tuple: + async def run_until_final_call(self, messages, overrides, auth_claims, should_stream) -> tuple: pass def get_system_prompt(self, override_prompt: Optional[str], follow_up_questions_prompt: str) -> str: @@ -89,48 +78,15 @@ def get_search_query(self, chat_completion: ChatCompletion, user_query: str): def extract_followup_questions(self, content: str): return content.split("<<")[0], re.findall(r"<<([^>>]+)>>", content) - def get_messages_from_history( - self, - system_prompt: str, - model_id: str, - history: list[dict[str, str]], - user_content: Union[str, list[ChatCompletionContentPartParam]], - max_tokens: int, - few_shots=[], - ) -> list[ChatCompletionMessageParam]: - message_builder = MessageBuilder(system_prompt, model_id) - - # Add examples to show the chat what responses we want. It will try to mimic any responses and make sure they match the rules laid out in the system message. - for shot in reversed(few_shots): - message_builder.insert_message(shot.get("role"), shot.get("content")) - - append_index = len(few_shots) + 1 - - message_builder.insert_message(self.USER, user_content, index=append_index) - - total_token_count = 0 - for existing_message in message_builder.messages: - total_token_count += message_builder.count_tokens_for_message(existing_message) - - newest_to_oldest = list(reversed(history[:-1])) - for message in newest_to_oldest: - potential_message_count = message_builder.count_tokens_for_message(message) - if (total_token_count + potential_message_count) > max_tokens: - logging.info("Reached max tokens of %d, history will be truncated", max_tokens) - break - message_builder.insert_message(message["role"], message["content"], index=append_index) - total_token_count += potential_message_count - return message_builder.messages - async def run_without_streaming( self, - history: list[dict[str, str]], + messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any], session_state: Any = None, ) -> dict[str, Any]: extra_info, chat_coroutine = await self.run_until_final_call( - history, overrides, auth_claims, should_stream=False + messages, overrides, auth_claims, should_stream=False ) chat_completion_response: ChatCompletion = await chat_coroutine chat_resp = chat_completion_response.model_dump() # Convert to dict to make it JSON serializable @@ -144,18 +100,18 @@ async def run_without_streaming( async def run_with_streaming( self, - history: list[dict[str, str]], + messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any], session_state: Any = None, ) -> AsyncGenerator[dict, None]: extra_info, chat_coroutine = await self.run_until_final_call( - history, overrides, auth_claims, should_stream=True + messages, overrides, auth_claims, should_stream=True ) yield { "choices": [ { - "delta": {"role": self.ASSISTANT}, + "delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state, "finish_reason": None, @@ -190,7 +146,7 @@ async def run_with_streaming( yield { "choices": [ { - "delta": {"role": self.ASSISTANT}, + "delta": {"role": "assistant"}, "context": {"followup_questions": followup_questions}, "finish_reason": None, "index": 0, @@ -200,7 +156,11 @@ async def 
         }

     async def run(
-        self, messages: list[dict], stream: bool = False, session_state: Any = None, context: dict[str, Any] = {}
+        self,
+        messages: list[ChatCompletionMessageParam],
+        stream: bool = False,
+        session_state: Any = None,
+        context: dict[str, Any] = {},
     ) -> Union[dict[str, Any], AsyncGenerator[dict[str, Any], None]]:
         overrides = context.get("overrides", {})
         auth_claims = context.get("auth_claims", {})

diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py
index f7159a453a..e5d3719152 100644
--- a/app/backend/approaches/chatreadretrieveread.py
+++ b/app/backend/approaches/chatreadretrieveread.py
@@ -6,13 +6,14 @@
 from openai.types.chat import (
     ChatCompletion,
     ChatCompletionChunk,
+    ChatCompletionMessageParam,
     ChatCompletionToolParam,
 )
+from openai_messages_token_helper import build_messages, get_token_limit

 from approaches.approach import ThoughtStep
 from approaches.chatapproach import ChatApproach
 from core.authentication import AuthenticationHelper
-from core.modelhelper import get_token_limit


 class ChatReadRetrieveReadApproach(ChatApproach):
@@ -65,7 +66,7 @@ def system_message_chat_conversation(self):
     @overload
     async def run_until_final_call(
         self,
-        history: list[dict[str, str]],
+        messages: list[ChatCompletionMessageParam],
         overrides: dict[str, Any],
         auth_claims: dict[str, Any],
         should_stream: Literal[False],
@@ -74,7 +75,7 @@ async def run_until_final_call(
     @overload
     async def run_until_final_call(
         self,
-        history: list[dict[str, str]],
+        messages: list[ChatCompletionMessageParam],
         overrides: dict[str, Any],
         auth_claims: dict[str, Any],
         should_stream: Literal[True],
@@ -82,7 +83,7 @@ async def run_until_final_call(

     async def run_until_final_call(
         self,
-        history: list[dict[str, str]],
+        messages: list[ChatCompletionMessageParam],
         overrides: dict[str, Any],
         auth_claims: dict[str, Any],
         should_stream: bool = False,
@@ -97,7 +98,9 @@ async def run_until_final_call(
         filter = self.build_filter(overrides, auth_claims)
         use_semantic_ranker = True if overrides.get("semantic_ranker") and has_text else False

-        original_user_query = history[-1]["content"]
+        original_user_query = messages[-1]["content"]
+        if not isinstance(original_user_query, str):
+            raise ValueError("The most recent message content must be a string.")
         user_query_request = "Generate search query for: " + original_user_query

         tools: List[ChatCompletionToolParam] = [
@@ -121,13 +124,15 @@ async def run_until_final_call(
         ]

         # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
-        query_messages = self.get_messages_from_history(
+        query_response_token_limit = 100
+        query_messages = build_messages(
+            model=self.chatgpt_model,
             system_prompt=self.query_prompt_template,
-            model_id=self.chatgpt_model,
-            history=history,
-            user_content=user_query_request,
-            max_tokens=self.chatgpt_token_limit - len(user_query_request),
+            tools=tools,
             few_shots=self.query_prompt_few_shots,
+            past_messages=messages[:-1],
+            new_user_content=user_query_request,
+            max_tokens=self.chatgpt_token_limit - query_response_token_limit,
         )

         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
@@ -135,10 +140,9 @@ async def run_until_final_call(
             # Azure OpenAI takes the deployment name as the model name
             model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model,
             temperature=0.0,  # Minimize creativity for search query generation
-            max_tokens=100,  # Setting too low risks malformed JSON, setting too high may affect performance
+            max_tokens=query_response_token_limit,  # Setting too low risks malformed JSON, setting too high may affect performance
             n=1,
             tools=tools,
-            tool_choice="auto",
         )

         query_text = self.get_search_query(chat_completion, original_user_query)
@@ -177,14 +181,13 @@ async def run_until_final_call(
         )

         response_token_limit = 1024
-        messages_token_limit = self.chatgpt_token_limit - response_token_limit
-        messages = self.get_messages_from_history(
+        messages = build_messages(
+            model=self.chatgpt_model,
             system_prompt=system_message,
-            model_id=self.chatgpt_model,
-            history=history,
+            past_messages=messages[:-1],
             # Model does not handle lengthy system messages well. Moving sources to latest user conversation to solve follow up questions prompt.
-            user_content=original_user_query + "\n\nSources:\n" + content,
-            max_tokens=messages_token_limit,
+            new_user_content=original_user_query + "\n\nSources:\n" + content,
+            max_tokens=self.chatgpt_token_limit - response_token_limit,
         )

         data_points = {"text": sources_content}

diff --git a/app/backend/approaches/chatreadretrievereadvision.py b/app/backend/approaches/chatreadretrievereadvision.py
index 65218839c5..899cdf58eb 100644
--- a/app/backend/approaches/chatreadretrievereadvision.py
+++ b/app/backend/approaches/chatreadretrievereadvision.py
@@ -8,13 +8,14 @@
     ChatCompletionChunk,
     ChatCompletionContentPartImageParam,
     ChatCompletionContentPartParam,
+    ChatCompletionMessageParam,
 )
+from openai_messages_token_helper import build_messages, get_token_limit

 from approaches.approach import ThoughtStep
 from approaches.chatapproach import ChatApproach
 from core.authentication import AuthenticationHelper
 from core.imageshelper import fetch_image
-from core.modelhelper import get_token_limit


 class ChatReadRetrieveReadVisionApproach(ChatApproach):
@@ -79,7 +80,7 @@ def system_message_chat_conversation(self):

     async def run_until_final_call(
         self,
-        history: list[dict[str, str]],
+        messages: list[ChatCompletionMessageParam],
         overrides: dict[str, Any],
         auth_claims: dict[str, Any],
         should_stream: bool = False,
@@ -97,25 +98,29 @@ async def run_until_final_call(
         include_gtpV_text = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
         include_gtpV_images = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

-        original_user_query = history[-1]["content"]
+        original_user_query = messages[-1]["content"]
+        if not isinstance(original_user_query, str):
+            raise ValueError("The most recent message content must be a string.")
+        past_messages: list[ChatCompletionMessageParam] = messages[:-1]

         # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
         user_query_request = "Generate search query for: " + original_user_query
-        query_messages = self.get_messages_from_history(
+        query_response_token_limit = 100
+        query_messages = build_messages(
+            model=self.gpt4v_model,
             system_prompt=self.query_prompt_template,
-            model_id=self.gpt4v_model,
-            history=history,
-            user_content=user_query_request,
-            max_tokens=self.chatgpt_token_limit - len(" ".join(user_query_request)),
             few_shots=self.query_prompt_few_shots,
+            past_messages=past_messages,
+            new_user_content=user_query_request,
+            max_tokens=self.chatgpt_token_limit - query_response_token_limit,
         )

         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
             model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
             messages=query_messages,
             temperature=0.0,  # Minimize creativity for search query generation
-            max_tokens=100,
+            max_tokens=query_response_token_limit,
             n=1,
         )
@@ -159,9 +164,6 @@ async def run_until_final_call(
             self.follow_up_questions_prompt_content if overrides.get("suggest_followup_questions") else "",
         )

-        response_token_limit = 1024
-        messages_token_limit = self.chatgpt_token_limit - response_token_limit
-
         user_content: list[ChatCompletionContentPartParam] = [{"text": original_user_query, "type": "text"}]
         image_list: list[ChatCompletionContentPartImageParam] = []
@@ -174,12 +176,13 @@ async def run_until_final_call(
                 image_list.append({"image_url": url, "type": "image_url"})
         user_content.extend(image_list)

-        messages = self.get_messages_from_history(
+        response_token_limit = 1024
+        messages = build_messages(
+            model=self.gpt4v_model,
             system_prompt=system_message,
-            model_id=self.gpt4v_model,
-            history=history,
-            user_content=user_content,
-            max_tokens=messages_token_limit,
+            past_messages=messages[:-1],
+            new_user_content=user_content,
+            max_tokens=self.chatgpt_token_limit - response_token_limit,
         )

         data_points = {

diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py
index ec985974c2..ca2117a9ea 100644
--- a/app/backend/approaches/retrievethenread.py
+++ b/app/backend/approaches/retrievethenread.py
@@ -3,10 +3,11 @@
 from azure.search.documents.aio import SearchClient
 from azure.search.documents.models import VectorQuery
 from openai import AsyncOpenAI
+from openai.types.chat import ChatCompletionMessageParam
+from openai_messages_token_helper import build_messages, get_token_limit

 from approaches.approach import Approach, ThoughtStep
 from core.authentication import AuthenticationHelper
-from core.messagebuilder import MessageBuilder


 class RetrieveThenReadApproach(Approach):
@@ -66,15 +67,18 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
+        self.chatgpt_token_limit = get_token_limit(chatgpt_model)

     async def run(
         self,
-        messages: list[dict],
+        messages: list[ChatCompletionMessageParam],
         stream: bool = False,  # Stream is not used in this approach
         session_state: Any = None,
         context: dict[str, Any] = {},
     ) -> Union[dict[str, Any], AsyncGenerator[dict[str, Any], None]]:
         q = messages[-1]["content"]
+        if not isinstance(q, str):
+            raise ValueError("The most recent message content must be a string.")
         overrides = context.get("overrides", {})
         auth_claims = context.get("auth_claims", {})
         has_text = overrides.get("retrieval_mode") in ["text", "hybrid", None]
@@ -105,29 +109,29 @@ async def run(
             minimum_reranker_score,
         )

-        user_content = [q]
-
-        template = overrides.get("prompt_template", self.system_chat_template)
-        model = self.chatgpt_model
-        message_builder = MessageBuilder(template, model)
-
         # Process results
         sources_content = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)

         # Append user message
         content = "\n".join(sources_content)
         user_content = q + "\n" + f"Sources:\n {content}"
-        message_builder.insert_message("user", user_content)
-        message_builder.insert_message("assistant", self.answer)
-        message_builder.insert_message("user", self.question)
-        updated_messages = message_builder.messages
+
+        response_token_limit = 1024
+        updated_messages = build_messages(
+            model=self.chatgpt_model,
+            system_prompt=overrides.get("prompt_template", self.system_chat_template),
+            few_shots=[{"role": "user", "content": self.question}, {"role": "assistant", "content": self.answer}],
+            new_user_content=user_content,
+            max_tokens=self.chatgpt_token_limit - response_token_limit,
+        )
+
         chat_completion = (
             await self.openai_client.chat.completions.create(
                 # Azure OpenAI takes the deployment name as the model name
                 model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model,
                 messages=updated_messages,
                 temperature=overrides.get("temperature", 0.3),
-                max_tokens=1024,
+                max_tokens=response_token_limit,
                 n=1,
             )
         ).model_dump()

diff --git a/app/backend/approaches/retrievethenreadvision.py b/app/backend/approaches/retrievethenreadvision.py
index 5f1bcd18b1..9609277430 100644
--- a/app/backend/approaches/retrievethenreadvision.py
+++ b/app/backend/approaches/retrievethenreadvision.py
@@ -6,12 +6,13 @@
 from openai.types.chat import (
     ChatCompletionContentPartImageParam,
     ChatCompletionContentPartParam,
+    ChatCompletionMessageParam,
 )
+from openai_messages_token_helper import build_messages, get_token_limit

 from approaches.approach import Approach, ThoughtStep
 from core.authentication import AuthenticationHelper
 from core.imageshelper import fetch_image
-from core.messagebuilder import MessageBuilder


 class RetrieveThenReadVisionApproach(Approach):
@@ -66,15 +67,19 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
+        self.gpt4v_token_limit = get_token_limit(gpt4v_model)

     async def run(
         self,
-        messages: list[dict],
+        messages: list[ChatCompletionMessageParam],
         stream: bool = False,  # Stream is not used in this approach
         session_state: Any = None,
         context: dict[str, Any] = {},
     ) -> Union[dict[str, Any], AsyncGenerator[dict[str, Any], None]]:
         q = messages[-1]["content"]
+        if not isinstance(q, str):
+            raise ValueError("The most recent message content must be a string.")
+
         overrides = context.get("overrides", {})
         auth_claims = context.get("auth_claims", {})
         has_text = overrides.get("retrieval_mode") in ["text", "hybrid", None]
@@ -120,12 +125,7 @@ async def run(
         image_list: list[ChatCompletionContentPartImageParam] = []
         user_content: list[ChatCompletionContentPartParam] = [{"text": q, "type": "text"}]

-        template = overrides.get("prompt_template", self.system_chat_template_gpt4v)
-        model = self.gpt4v_model
-        message_builder = MessageBuilder(template, model)
-
         # Process results
-
         sources_content = self.get_sources_content(results, use_semantic_captions, use_image_citation=True)

         if include_gtpV_text:
@@ -138,15 +138,19 @@ async def run(
                 image_list.append({"image_url": url, "type": "image_url"})
             user_content.extend(image_list)

-        # Append user message
-        message_builder.insert_message("user", user_content)
-        updated_messages = message_builder.messages
+        response_token_limit = 1024
+        updated_messages = build_messages(
+            model=self.gpt4v_model,
+            system_prompt=overrides.get("prompt_template", self.system_chat_template_gpt4v),
+            new_user_content=user_content,
+            max_tokens=self.gpt4v_token_limit - response_token_limit,
+        )

         chat_completion = (
             await self.openai_client.chat.completions.create(
                 model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
                 messages=updated_messages,
                 temperature=overrides.get("temperature", 0.3),
-                max_tokens=1024,
+                max_tokens=response_token_limit,
                 n=1,
             )
         ).model_dump()

diff --git a/app/backend/core/imageshelper.py b/app/backend/core/imageshelper.py
index cf99a4d815..d35f659766 100644
--- a/app/backend/core/imageshelper.py
+++ b/app/backend/core/imageshelper.py
@@ -1,14 +1,10 @@
 import base64
 import logging
-import math
 import os
-import re
-from io import BytesIO
 from typing import Optional

 from azure.core.exceptions import ResourceNotFoundError
 from azure.storage.blob.aio import ContainerClient
-from PIL import Image
 from typing_extensions import Literal, Required, TypedDict

 from approaches.approach import Document
@@ -45,51 +41,3 @@ async def fetch_image(blob_container_client: ContainerClient, result: Document)
     else:
         return None
     return None
-
-
-def get_image_dims(image_uri: str) -> tuple[int, int]:
-    # From https://github.com/openai/openai-cookbook/pull/881/files
-    if re.match(r"data:image\/\w+;base64", image_uri):
-        image_uri = re.sub(r"data:image\/\w+;base64,", "", image_uri)
-        image = Image.open(BytesIO(base64.b64decode(image_uri)))
-        return image.size
-    else:
-        raise ValueError("Image must be a base64 string.")
-
-
-def calculate_image_token_cost(image_uri: str, detail: str = "auto") -> int:
-    # From https://github.com/openai/openai-cookbook/pull/881/files
-    # Based on https://platform.openai.com/docs/guides/vision
-    LOW_DETAIL_COST = 85
-    HIGH_DETAIL_COST_PER_TILE = 170
-    ADDITIONAL_COST = 85
-
-    if detail == "auto":
-        # assume high detail for now
-        detail = "high"
-
-    if detail == "low":
-        # Low detail images have a fixed cost
-        return LOW_DETAIL_COST
-    elif detail == "high":
-        # Calculate token cost for high detail images
-        width, height = get_image_dims(image_uri)
-        # Check if resizing is needed to fit within a 2048 x 2048 square
-        if max(width, height) > 2048:
-            # Resize dimensions to fit within a 2048 x 2048 square
-            ratio = 2048 / max(width, height)
-            width = int(width * ratio)
-            height = int(height * ratio)
-        # Further scale down to 768px on the shortest side
-        if min(width, height) > 768:
-            ratio = 768 / min(width, height)
-            width = int(width * ratio)
-            height = int(height * ratio)
-        # Calculate the number of 512px squares
-        num_squares = math.ceil(width / 512) * math.ceil(height / 512)
-        # Calculate the total token cost
-        total_cost = num_squares * HIGH_DETAIL_COST_PER_TILE + ADDITIONAL_COST
-        return total_cost
-    else:
-        # Invalid detail_option
-        raise ValueError("Invalid value for detail parameter. Use 'low' or 'high'.")

diff --git a/app/backend/core/messagebuilder.py b/app/backend/core/messagebuilder.py
deleted file mode 100644
index 719f8c8b39..0000000000
--- a/app/backend/core/messagebuilder.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import unicodedata
-from collections.abc import Mapping
-from typing import List, Union
-
-from openai.types.chat import (
-    ChatCompletionAssistantMessageParam,
-    ChatCompletionContentPartParam,
-    ChatCompletionMessageParam,
-    ChatCompletionSystemMessageParam,
-    ChatCompletionUserMessageParam,
-)
-
-from .modelhelper import num_tokens_from_messages
-
-
-class MessageBuilder:
-    """
-    A class for building and managing messages in a chat conversation.
-    Attributes:
-        message (list): A list of dictionaries representing chat messages.
-        model (str): The name of the ChatGPT model.
-        token_count (int): The total number of tokens in the conversation.
-    Methods:
-        __init__(self, system_content: str, chatgpt_model: str): Initializes the MessageBuilder instance.
-        insert_message(self, role: str, content: str, index: int = 1): Inserts a new message to the conversation.
- """ - - def __init__(self, system_content: str, chatgpt_model: str): - self.messages: list[ChatCompletionMessageParam] = [ - ChatCompletionSystemMessageParam(role="system", content=unicodedata.normalize("NFC", system_content)) - ] - self.model = chatgpt_model - - def insert_message(self, role: str, content: Union[str, List[ChatCompletionContentPartParam]], index: int = 1): - """ - Inserts a message into the conversation at the specified index, - or at index 1 (after system message) if no index is specified. - Args: - role (str): The role of the message sender (either "user", "system", or "assistant"). - content (str | List[ChatCompletionContentPartParam]): The content of the message. - index (int): The index at which to insert the message. - """ - message: ChatCompletionMessageParam - if role == "user": - message = ChatCompletionUserMessageParam(role="user", content=self.normalize_content(content)) - elif role == "system" and isinstance(content, str): - message = ChatCompletionSystemMessageParam(role="system", content=unicodedata.normalize("NFC", content)) - elif role == "assistant" and isinstance(content, str): - message = ChatCompletionAssistantMessageParam( - role="assistant", content=unicodedata.normalize("NFC", content) - ) - else: - raise ValueError(f"Invalid role: {role}") - self.messages.insert(index, message) - - def count_tokens_for_message(self, message: Mapping[str, object]): - return num_tokens_from_messages(message, self.model) - - def normalize_content(self, content: Union[str, List[ChatCompletionContentPartParam]]): - if isinstance(content, str): - return unicodedata.normalize("NFC", content) - elif isinstance(content, list): - for part in content: - if "image_url" not in part: - part["text"] = unicodedata.normalize("NFC", part["text"]) - return content diff --git a/app/backend/core/modelhelper.py b/app/backend/core/modelhelper.py deleted file mode 100644 index 47871f9ecb..0000000000 --- a/app/backend/core/modelhelper.py +++ /dev/null @@ -1,68 +0,0 @@ -from __future__ import annotations - -from collections.abc import Mapping - -import tiktoken - -from .imageshelper import calculate_image_token_cost - -MODELS_2_TOKEN_LIMITS = { - "gpt-35-turbo": 4000, - "gpt-3.5-turbo": 4000, - "gpt-35-turbo-16k": 16000, - "gpt-3.5-turbo-16k": 16000, - "gpt-4": 8100, - "gpt-4-32k": 32000, - "gpt-4v": 128000, -} - - -AOAI_2_OAI = {"gpt-35-turbo": "gpt-3.5-turbo", "gpt-35-turbo-16k": "gpt-3.5-turbo-16k", "gpt-4v": "gpt-4-turbo-vision"} - - -def get_token_limit(model_id: str) -> int: - if model_id not in MODELS_2_TOKEN_LIMITS: - raise ValueError(f"Expected model gpt-35-turbo and above. Received: {model_id}") - return MODELS_2_TOKEN_LIMITS[model_id] - - -def num_tokens_from_messages(message: Mapping[str, object], model: str) -> int: - """ - Calculate the number of tokens required to encode a message. - Args: - message (Mapping): The message to encode, in a dictionary-like object. - model (str): The name of the model to use for encoding. - Returns: - int: The total number of tokens required to encode the message. 
-    Example:
-        message = {'role': 'user', 'content': 'Hello, how are you?'}
-        model = 'gpt-3.5-turbo'
-        num_tokens_from_messages(message, model)
-        output: 11
-    """
-
-    encoding = tiktoken.encoding_for_model(get_oai_chatmodel_tiktok(model))
-    num_tokens = 2  # For "role" and "content" keys
-    for value in message.values():
-        if isinstance(value, list):
-            # For GPT-4-vision support, based on https://github.com/openai/openai-cookbook/pull/881/files
-            for item in value:
-                num_tokens += len(encoding.encode(item["type"]))
-                if item["type"] == "text":
-                    num_tokens += len(encoding.encode(item["text"]))
-                elif item["type"] == "image_url":
-                    num_tokens += calculate_image_token_cost(item["image_url"]["url"], item["image_url"]["detail"])
-        elif isinstance(value, str):
-            num_tokens += len(encoding.encode(value))
-        else:
-            raise ValueError(f"Could not encode unsupported message value type: {type(value)}")
-    return num_tokens
-
-
-def get_oai_chatmodel_tiktok(aoaimodel: str) -> str:
-    message = "Expected Azure OpenAI ChatGPT model name"
-    if aoaimodel == "" or aoaimodel is None:
-        raise ValueError(message)
-    if aoaimodel not in AOAI_2_OAI and aoaimodel not in MODELS_2_TOKEN_LIMITS:
-        raise ValueError(message)
-    return AOAI_2_OAI.get(aoaimodel, aoaimodel)

diff --git a/app/backend/requirements.in b/app/backend/requirements.in
index 18ff854f92..7c7100266b 100644
--- a/app/backend/requirements.in
+++ b/app/backend/requirements.in
@@ -27,3 +27,4 @@ PyMuPDF
 beautifulsoup4
 types-beautifulsoup4
 msgraph-sdk==1.1.0
+openai-messages-token-helper

diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
index f562474760..355c7a3f33 100644
--- a/app/backend/requirements.txt
+++ b/app/backend/requirements.txt
@@ -197,6 +197,10 @@ numpy==1.26.4
 oauthlib==3.2.2
     # via requests-oauthlib
 openai[datalib]==1.30.1
+    # via
+    #   -r requirements.in
+    #   openai-messages-token-helper
+openai-messages-token-helper==0.1.4
     # via -r requirements.in
 opentelemetry-api==1.24.0
     # via
@@ -251,7 +255,7 @@ opentelemetry-instrumentation-flask==0.45b0
     # via azure-monitor-opentelemetry
 opentelemetry-instrumentation-httpx==0.45b0
     # via -r requirements.in
-opentelemetry-instrumentation-openai==0.18.0
+opentelemetry-instrumentation-openai==0.18.2
     # via -r requirements.in
 opentelemetry-instrumentation-psycopg2==0.45b0
     # via azure-monitor-opentelemetry
@@ -267,7 +271,7 @@ opentelemetry-instrumentation-wsgi==0.45b0
     # via
     #   opentelemetry-instrumentation-django
     #   opentelemetry-instrumentation-flask
-opentelemetry-resource-detector-azure==0.1.4
+opentelemetry-resource-detector-azure==0.1.5
     # via azure-monitor-opentelemetry
 opentelemetry-sdk==1.24.0
     # via
@@ -316,7 +320,9 @@ pandas-stubs==2.2.2.240514
 pendulum==3.0.0
     # via microsoft-kiota-serialization-json
 pillow==10.3.0
-    # via -r requirements.in
+    # via
+    #   -r requirements.in
+    #   openai-messages-token-helper
 portalocker==2.8.2
     # via msal-extensions
 priority==2.0.0
@@ -334,9 +340,7 @@ pydantic==2.7.1
 pydantic-core==2.18.2
     # via pydantic
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pymupdf==1.24.4
     # via -r requirements.in
 pymupdfb==1.24.3
@@ -392,6 +396,7 @@ tenacity==8.3.0
 tiktoken==0.7.0
     # via
     #   -r requirements.in
+    #   openai-messages-token-helper
     #   opentelemetry-instrumentation-openai
 time-machine==2.14.1
     # via pendulum

diff --git a/tests/test_chatapproach.py b/tests/test_chatapproach.py
index f917122a6b..f8f66c55ec 100644
--- a/tests/test_chatapproach.py
+++ b/tests/test_chatapproach.py
@@ -112,150 +112,6 @@ def test_get_search_query_returns_default(chat_approach):
     assert query == default_query


-def test_get_messages_from_history(chat_approach):
-    messages = chat_approach.get_messages_from_history(
-        system_prompt="You are a bot.",
-        model_id="gpt-35-turbo",
-        history=[
-            {"role": "user", "content": "What happens in a performance review?"},
-            {
-                "role": "assistant",
-                "content": "During the performance review at Contoso Electronics, the supervisor will discuss the employee's performance over the past year and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals and objectives for the upcoming year [employee_handbook-3.pdf].",
-            },
-            {"role": "user", "content": "What does a Product Manager do?"},
-        ],
-        user_content="What does a Product Manager do?",
-        max_tokens=3000,
-    )
-    assert messages == [
-        {"role": "system", "content": "You are a bot."},
-        {"role": "user", "content": "What happens in a performance review?"},
-        {
-            "role": "assistant",
-            "content": "During the performance review at Contoso Electronics, the supervisor will discuss the employee's performance over the past year and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals and objectives for the upcoming year [employee_handbook-3.pdf].",
-        },
-        {"role": "user", "content": "What does a Product Manager do?"},
-    ]
-
-
-def test_get_messages_from_history_truncated(chat_approach):
-    messages = chat_approach.get_messages_from_history(
-        system_prompt="You are a bot.",
-        model_id="gpt-35-turbo",
-        history=[
-            {"role": "user", "content": "What happens in a performance review?"},
-            {
-                "role": "assistant",
-                "content": "During the performance review at Contoso Electronics, the supervisor will discuss the employee's performance over the past year and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals and objectives for the upcoming year [employee_handbook-3.pdf].",
-            },
-            {"role": "user", "content": "What does a Product Manager do?"},
-        ],
-        user_content="What does a Product Manager do?",
-        max_tokens=10,
-    )
-    assert messages == [
-        {"role": "system", "content": "You are a bot."},
-        {"role": "user", "content": "What does a Product Manager do?"},
-    ]
-
-
-def test_get_messages_from_history_truncated_longer(chat_approach):
-    messages = chat_approach.get_messages_from_history(
-        system_prompt="You are a bot.",  # 8 tokens
-        model_id="gpt-35-turbo",
-        history=[
-            {"role": "user", "content": "What happens in a performance review?"},  # 10 tokens
-            {
-                "role": "assistant",
-                "content": "During the performance review at Contoso Electronics, the supervisor will discuss the employee's performance over the past year and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals and objectives for the upcoming year [employee_handbook-3.pdf].",
-            },  # 102 tokens
-            {"role": "user", "content": "Is there a dress code?"},  # 9 tokens
-            {
-                "role": "assistant",
-                "content": "Yes, there is a dress code at Contoso Electronics. Look sharp! [employee_handbook-1.pdf]",
-            },  # 26 tokens
-            {"role": "user", "content": "What does a Product Manager do?"},  # 10 tokens
-        ],
-        user_content="What does a Product Manager do?",
-        max_tokens=55,
-    )
-    assert messages == [
-        {"role": "system", "content": "You are a bot."},
-        {"role": "user", "content": "Is there a dress code?"},
-        {
-            "role": "assistant",
-            "content": "Yes, there is a dress code at Contoso Electronics. Look sharp! [employee_handbook-1.pdf]",
-        },
-        {"role": "user", "content": "What does a Product Manager do?"},
-    ]
-
-
-def test_get_messages_from_history_truncated_break_pair(chat_approach):
-    """Tests that the truncation breaks the pair of messages."""
-    messages = chat_approach.get_messages_from_history(
-        system_prompt="You are a bot.",  # 8 tokens
-        model_id="gpt-35-turbo",
-        history=[
-            {"role": "user", "content": "What happens in a performance review?"},  # 10 tokens
-            {
-                "role": "assistant",
-                "content": "The supervisor will discuss the employee's performance and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals for the upcoming year [employee_handbook-3.pdf].",
-            },  # 87 tokens
-            {"role": "user", "content": "Is there a dress code?"},  # 9 tokens
-            {
-                "role": "assistant",
-                "content": "Yes, there is a dress code at Contoso Electronics. Look sharp! [employee_handbook-1.pdf]",
-            },  # 26 tokens
-            {"role": "user", "content": "What does a Product Manager do?"},  # 10 tokens
-        ],
-        user_content="What does a Product Manager do?",
-        max_tokens=147,
-    )
-    assert messages == [
-        {"role": "system", "content": "You are a bot."},
-        {
-            "role": "assistant",
-            "content": "The supervisor will discuss the employee's performance and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals for the upcoming year [employee_handbook-3.pdf].",
-        },
-        {"role": "user", "content": "Is there a dress code?"},
-        {
-            "role": "assistant",
[employee_handbook-1.pdf]", - }, - {"role": "user", "content": "What does a Product Manager do?"}, - ] - - -def test_get_messages_from_history_system_message(chat_approach): - """Tests that the system message token count is considered.""" - messages = chat_approach.get_messages_from_history( - system_prompt="Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.", # 24 tokens - model_id="gpt-35-turbo", - history=[ - {"role": "user", "content": "What happens in a performance review?"}, # 10 tokens - { - "role": "assistant", - "content": "During the performance review at Contoso Electronics, the supervisor will discuss the employee's performance over the past year and provide feedback on areas for improvement. They will also provide an opportunity for the employee to discuss their goals and objectives for the upcoming year. The review is a two-way dialogue between managers and employees, and employees will receive a written summary of their performance review which will include a rating of their performance, feedback, and goals and objectives for the upcoming year [employee_handbook-3.pdf].", - }, # 102 tokens - {"role": "user", "content": "Is there a dress code?"}, # 9 tokens - { - "role": "assistant", - "content": "Yes, there is a dress code at Contoso Electronics. Look sharp! [employee_handbook-1.pdf]", - }, # 26 tokens - {"role": "user", "content": "What does a Product Manager do?"}, # 10 tokens - ], - user_content="What does a Product Manager do?", - max_tokens=36, - ) - assert messages == [ - { - "role": "system", - "content": "Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.", - }, - {"role": "user", "content": "What does a Product Manager do?"}, - ] - - def test_extract_followup_questions(chat_approach): content = "Here is answer to your question.<>" pre_content, followup_questions = chat_approach.extract_followup_questions(content) @@ -292,26 +148,6 @@ def test_extract_followup_questions_no_pre_content(chat_approach): assert followup_questions == ["What is the dress code?"] -def test_get_messages_from_history_few_shots(chat_approach): - user_query_request = "What does a Product manager do?" 
-    messages = chat_approach.get_messages_from_history(
-        system_prompt=chat_approach.query_prompt_template,
-        model_id=chat_approach.chatgpt_model,
-        user_content=user_query_request,
-        history=[],
-        max_tokens=chat_approach.chatgpt_token_limit - len(user_query_request),
-        few_shots=chat_approach.query_prompt_few_shots,
-    )
-    # Make sure messages are in the right order
-    assert messages[0]["role"] == "system"
-    assert messages[1]["role"] == "user"
-    assert messages[2]["role"] == "assistant"
-    assert messages[3]["role"] == "user"
-    assert messages[4]["role"] == "assistant"
-    assert messages[5]["role"] == "user"
-    assert messages[5]["content"] == user_query_request
-
-
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "minimum_search_score,minimum_reranker_score,expected_result_count",

diff --git a/tests/test_imageshelper.py b/tests/test_imageshelper.py
deleted file mode 100644
index 50f1e98d31..0000000000
--- a/tests/test_imageshelper.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import base64
-
-import pytest
-
-from core.imageshelper import calculate_image_token_cost, get_image_dims
-
-
-@pytest.fixture
-def small_image():
-    return "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z/C/HgAGgwJ/lK3Q6wAAAABJRU5ErkJggg=="
-
-
-@pytest.fixture
-def large_image():
-    large_image = open("tests/image_large.png", "rb").read()
-    img = base64.b64encode(large_image).decode("utf-8")
-    return f"data:image/png;base64,{img}"
-
-
-def test_calculate_image_token_cost(small_image, large_image):
-    assert calculate_image_token_cost(small_image, "low") == 85
-    assert calculate_image_token_cost(small_image, "high") == 255
-    assert calculate_image_token_cost(small_image) == 255
-    assert calculate_image_token_cost(large_image, "low") == 85
-    assert calculate_image_token_cost(large_image, "high") == 1105
-    with pytest.raises(ValueError, match="Invalid value for detail parameter."):
-        assert calculate_image_token_cost(large_image, "medium")
-
-
-def test_get_image_dims_small(small_image, large_image):
-    assert get_image_dims(small_image) == (1, 1)
-    assert get_image_dims(large_image) == (2050, 1238)
-    with pytest.raises(ValueError, match="Image must be a base64 string."):
-        assert get_image_dims("http://domain.com/image.png")

diff --git a/tests/test_messagebuilder.py b/tests/test_messagebuilder.py
deleted file mode 100644
index fbc09fa3ba..0000000000
--- a/tests/test_messagebuilder.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from core.messagebuilder import MessageBuilder
-
-
-def test_messagebuilder():
-    builder = MessageBuilder("You are a bot.", "gpt-35-turbo")
-    assert builder.messages == [
-        # 1 token, 1 token, 1 token, 5 tokens
-        {"role": "system", "content": "You are a bot."}
-    ]
-    assert builder.model == "gpt-35-turbo"
-    assert builder.count_tokens_for_message(builder.messages[0]) == 8
-
-
-def test_messagebuilder_append():
-    builder = MessageBuilder("You are a bot.", "gpt-35-turbo")
-    builder.insert_message("user", "Hello, how are you?")
-    assert builder.messages == [
-        # 1 token, 1 token, 1 token, 5 tokens
-        {"role": "system", "content": "You are a bot."},
-        # 1 token, 1 token, 1 token, 6 tokens
-        {"role": "user", "content": "Hello, how are you?"},
-    ]
-    assert builder.model == "gpt-35-turbo"
-    assert builder.count_tokens_for_message(builder.messages[0]) == 8
-    assert builder.count_tokens_for_message(builder.messages[1]) == 9
-
-
-def test_messagebuilder_unicode():
-    builder = MessageBuilder("a\u0301", "gpt-35-turbo")
-    assert builder.messages == [
-        # 1 token, 1 token, 1 token, 1 token
"system", "content": "á"} - ] - assert builder.model == "gpt-35-turbo" - assert builder.count_tokens_for_message(builder.messages[0]) == 4 - - -def test_messagebuilder_unicode_append(): - builder = MessageBuilder("a\u0301", "gpt-35-turbo") - builder.insert_message("user", "a\u0301") - assert builder.messages == [ - # 1 token, 1 token, 1 token, 1 token - {"role": "system", "content": "á"}, - # 1 token, 1 token, 1 token, 1 token - {"role": "user", "content": "á"}, - ] - assert builder.model == "gpt-35-turbo" - assert builder.count_tokens_for_message(builder.messages[0]) == 4 - assert builder.count_tokens_for_message(builder.messages[1]) == 4 diff --git a/tests/test_modelhelper.py b/tests/test_modelhelper.py deleted file mode 100644 index 6884e961a6..0000000000 --- a/tests/test_modelhelper.py +++ /dev/null @@ -1,93 +0,0 @@ -import pytest - -from core.modelhelper import ( - get_oai_chatmodel_tiktok, - get_token_limit, - num_tokens_from_messages, -) - - -def test_get_token_limit(): - assert get_token_limit("gpt-35-turbo") == 4000 - assert get_token_limit("gpt-3.5-turbo") == 4000 - assert get_token_limit("gpt-35-turbo-16k") == 16000 - assert get_token_limit("gpt-3.5-turbo-16k") == 16000 - assert get_token_limit("gpt-4") == 8100 - assert get_token_limit("gpt-4-32k") == 32000 - - -def test_get_token_limit_error(): - with pytest.raises(ValueError, match="Expected model gpt-35-turbo and above"): - get_token_limit("gpt-3") - - -def test_num_tokens_from_messages(): - message = { - # 1 token : 1 token - "role": "user", - # 1 token : 5 tokens - "content": "Hello, how are you?", - } - model = "gpt-35-turbo" - assert num_tokens_from_messages(message, model) == 9 - - -def test_num_tokens_from_messages_gpt4(): - message = { - # 1 token : 1 token - "role": "user", - # 1 token : 5 tokens - "content": "Hello, how are you?", - } - model = "gpt-4" - assert num_tokens_from_messages(message, model) == 9 - - -def test_num_tokens_from_messages_list(): - message = { - # 1 token : 1 token - "role": "user", - # 1 token : 262 tokens - "content": [ - {"type": "text", "text": "Describe this picture:"}, # 1 token # 4 tokens - { - "type": "image_url", # 2 tokens - "image_url": { - "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z/C/HgAGgwJ/lK3Q6wAAAABJRU5ErkJggg==", # 255 tokens - "detail": "auto", - }, - }, - ], - } - model = "gpt-4" - assert num_tokens_from_messages(message, model) == 265 - - -def test_num_tokens_from_messages_error(): - message = { - # 1 token : 1 token - "role": "user", - # 1 token : 5 tokens - "content": {"key": "value"}, - } - model = "gpt-35-turbo" - with pytest.raises(ValueError, match="Could not encode unsupported message value type"): - num_tokens_from_messages(message, model) - - -def test_get_oai_chatmodel_tiktok_mapped(): - assert get_oai_chatmodel_tiktok("gpt-35-turbo") == "gpt-3.5-turbo" - assert get_oai_chatmodel_tiktok("gpt-35-turbo-16k") == "gpt-3.5-turbo-16k" - - -def test_get_oai_chatmodel_tiktok_unmapped(): - assert get_oai_chatmodel_tiktok("gpt-4") == "gpt-4" - - -def test_get_oai_chatmodel_tiktok_error(): - with pytest.raises(ValueError, match="Expected Azure OpenAI ChatGPT model name"): - get_oai_chatmodel_tiktok("") - with pytest.raises(ValueError, match="Expected Azure OpenAI ChatGPT model name"): - get_oai_chatmodel_tiktok(None) - with pytest.raises(ValueError, match="Expected Azure OpenAI ChatGPT model name"): - get_oai_chatmodel_tiktok("gpt-3")