diff --git a/litellm/litellm_core_utils/README.md b/litellm/litellm_core_utils/README.md
new file mode 100644
index 000000000000..9cd3514536bc
--- /dev/null
+++ b/litellm/litellm_core_utils/README.md
@@ -0,0 +1,11 @@
+## Folder Contents
+
+This folder contains general-purpose utilities that are used in multiple places in the codebase.
+
+Core files:
+- `streaming_handler.py`: Core streaming logic + streaming-related helper utils.
+- `core_helpers.py`: Helpers used in `types/` - e.g. `map_finish_reason`.
+- `exception_mapping_utils.py`: Utils for mapping provider exceptions to openai-compatible error types.
+- `default_encoding.py`: Loads the default tokenizer encoding (tiktoken).
+- `get_llm_provider_logic.py`: Infers the LLM provider from a given model name.
+
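For orientation, a minimal sketch of how two of the helpers listed above are typically used (illustrative only; the `map_finish_reason` return value shown is an assumption):

```python
# Illustrative usage of the helpers described above (not part of this diff).
from litellm.litellm_core_utils.default_encoding import encoding
from litellm.litellm_core_utils.core_helpers import map_finish_reason

# count tokens with the bundled tiktoken encoding
num_tokens = len(encoding.encode("Hey, how's it going?"))

# normalize a provider-specific finish reason to an openai-compatible one
finish_reason = map_finish_reason(finish_reason="eos_token")  # assumed to map to "stop"

print(num_tokens, finish_reason)
```
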
diff --git a/litellm/litellm_core_utils/core_helpers.py b/litellm/litellm_core_utils/core_helpers.py
index cddca61eec4a..816dff81ee92 100644
--- a/litellm/litellm_core_utils/core_helpers.py
+++ b/litellm/litellm_core_utils/core_helpers.py
@@ -3,6 +3,8 @@
import os
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union
+import httpx
+
from litellm._logging import verbose_logger
if TYPE_CHECKING:
@@ -99,3 +101,28 @@ def _get_parent_otel_span_from_kwargs(
"Error in _get_parent_otel_span_from_kwargs: " + str(e)
)
return None
+
+
+def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict:
+ from litellm.types.utils import OPENAI_RESPONSE_HEADERS
+
+ openai_headers = {}
+ processed_headers = {}
+ additional_headers = {}
+
+ for k, v in response_headers.items():
+ if k in OPENAI_RESPONSE_HEADERS: # return openai-compatible headers
+ openai_headers[k] = v
+ if k.startswith(
+ "llm_provider-"
+ ): # return raw provider headers (incl. openai-compatible ones)
+ processed_headers[k] = v
+ else:
+ additional_headers["{}-{}".format("llm_provider", k)] = v
+
+ additional_headers = {
+ **openai_headers,
+ **processed_headers,
+ **additional_headers,
+ }
+ return additional_headers
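A quick sketch of what `process_response_headers` returns; whether `x-ratelimit-remaining-requests` is actually listed in `OPENAI_RESPONSE_HEADERS` is an assumption here:

```python
# Illustrative only - header names are assumptions; the merge logic mirrors the code above.
from litellm.litellm_core_utils.core_helpers import process_response_headers

raw_headers = {
    "x-ratelimit-remaining-requests": "99",  # assumed to be in OPENAI_RESPONSE_HEADERS
    "anthropic-version": "2023-06-01",       # provider-specific header
}

headers = process_response_headers(raw_headers)
# openai-compatible headers come back under their original name *and* with the
# "llm_provider-" prefix; everything else only gets the prefix:
# {
#     "x-ratelimit-remaining-requests": "99",
#     "llm_provider-x-ratelimit-remaining-requests": "99",
#     "llm_provider-anthropic-version": "2023-06-01",
# }
```
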
diff --git a/litellm/litellm_core_utils/default_encoding.py b/litellm/litellm_core_utils/default_encoding.py
new file mode 100644
index 000000000000..e093325829b7
--- /dev/null
+++ b/litellm/litellm_core_utils/default_encoding.py
@@ -0,0 +1,21 @@
+import os
+
+import litellm
+
+try:
+ # New and recommended way to access resources
+ from importlib import resources
+
+ filename = str(resources.files(litellm).joinpath("llms/tokenizers"))
+except (ImportError, AttributeError):
+ # Old way to access resources, which setuptools deprecated some time ago
+ import pkg_resources # type: ignore
+
+ filename = pkg_resources.resource_filename(__name__, "llms/tokenizers")
+
+os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
+ "CUSTOM_TIKTOKEN_CACHE_DIR", filename
+) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
+import tiktoken
+
+encoding = tiktoken.get_encoding("cl100k_base")
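A minimal sketch of using the shared encoding for token counting (this mirrors how the rest of the codebase imports it; `CUSTOM_TIKTOKEN_CACHE_DIR` can be set beforehand to point at a different tokenizer cache):

```python
# Minimal token-counting sketch using the module above.
from litellm.litellm_core_utils.default_encoding import encoding

text = "Hello world"
tokens = encoding.encode(text)          # cl100k_base token ids
print(len(tokens))                      # token count
print(encoding.decode(tokens) == text)  # round-trips back to the original string
```
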
diff --git a/litellm/litellm_core_utils/rules.py b/litellm/litellm_core_utils/rules.py
new file mode 100644
index 000000000000..beeb012d032f
--- /dev/null
+++ b/litellm/litellm_core_utils/rules.py
@@ -0,0 +1,50 @@
+from typing import Optional
+
+import litellm
+
+
+class Rules:
+ """
+    Fail calls based on the input or the LLM API output
+
+ Example usage:
+ import litellm
+ def my_custom_rule(input): # receives the model response
+ if "i don't think i can answer" in input: # trigger fallback if the model refuses to answer
+ return False
+ return True
+
+    litellm.post_call_rules = [my_custom_rule] # post-call rules are functions that can fail a call
+
+ response = litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user",
+ "content": "Hey, how's it going?"}], fallbacks=["openrouter/mythomax"])
+ """
+
+ def __init__(self) -> None:
+ pass
+
+ def pre_call_rules(self, input: str, model: str):
+ for rule in litellm.pre_call_rules:
+ if callable(rule):
+ decision = rule(input)
+ if decision is False:
+                    raise litellm.APIResponseValidationError(message="Input failed pre-call-rule check", llm_provider="", model=model)  # type: ignore
+ return True
+
+ def post_call_rules(self, input: Optional[str], model: str) -> bool:
+ if input is None:
+ return True
+ for rule in litellm.post_call_rules:
+ if callable(rule):
+ decision = rule(input)
+ if isinstance(decision, bool):
+ if decision is False:
+ raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore
+ elif isinstance(decision, dict):
+ decision_val = decision.get("decision", True)
+ decision_message = decision.get(
+ "message", "LLM Response failed post-call-rule check"
+ )
+ if decision_val is False:
+ raise litellm.APIResponseValidationError(message=decision_message, llm_provider="", model=model) # type: ignore
+ return True
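Besides plain booleans, `post_call_rules` also accepts a dict with a `decision` and an optional `message`, as handled above. A hedged sketch (the rule name and message are made up):

```python
# Sketch of a dict-returning post-call rule (hypothetical rule, not part of this diff).
import litellm

def no_refusals(output: str):
    if "i don't think i can answer" in output.lower():
        return {"decision": False, "message": "Model refused to answer"}
    return {"decision": True}

litellm.post_call_rules = [no_refusals]

# Any completion whose text trips the rule now raises
# litellm.APIResponseValidationError with the custom message.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
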
diff --git a/litellm/litellm_core_utils/streaming_handler.py b/litellm/litellm_core_utils/streaming_handler.py
new file mode 100644
index 000000000000..5c18ff512b6c
--- /dev/null
+++ b/litellm/litellm_core_utils/streaming_handler.py
@@ -0,0 +1,2020 @@
+import asyncio
+import json
+import threading
+import time
+import traceback
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Callable, List, Optional
+
+import httpx
+from pydantic import BaseModel
+
+import litellm
+from litellm import verbose_logger
+from litellm.litellm_core_utils.redact_messages import (
+ LiteLLMLoggingObject,
+ redact_message_input_output_from_logging,
+)
+from litellm.types.utils import Delta
+from litellm.types.utils import GenericStreamingChunk as GChunk
+from litellm.types.utils import (
+ ModelResponse,
+ ModelResponseStream,
+ StreamingChoices,
+ Usage,
+)
+
+from ..exceptions import OpenAIError
+from .core_helpers import map_finish_reason, process_response_headers
+from .default_encoding import encoding
+from .exception_mapping_utils import exception_type
+from .rules import Rules
+
+MAX_THREADS = 100
+
+# Create a ThreadPoolExecutor
+executor = ThreadPoolExecutor(max_workers=MAX_THREADS)
+
+
+def print_verbose(print_statement):
+ try:
+ if litellm.set_verbose:
+ print(print_statement) # noqa
+ except Exception:
+ pass
+
+
+class CustomStreamWrapper:
+ def __init__(
+ self,
+ completion_stream,
+ model,
+ logging_obj: Any,
+ custom_llm_provider: Optional[str] = None,
+ stream_options=None,
+ make_call: Optional[Callable] = None,
+ _response_headers: Optional[dict] = None,
+ ):
+ self.model = model
+ self.make_call = make_call
+ self.custom_llm_provider = custom_llm_provider
+ self.logging_obj: LiteLLMLoggingObject = logging_obj
+ self.completion_stream = completion_stream
+ self.sent_first_chunk = False
+ self.sent_last_chunk = False
+ self.system_fingerprint: Optional[str] = None
+ self.received_finish_reason: Optional[str] = None
+ self.intermittent_finish_reason: Optional[str] = (
+ None # finish reasons that show up mid-stream
+ )
+ self.special_tokens = [
+ "<|assistant|>",
+ "<|system|>",
+ "<|user|>",
+            "<s>",
+            "</s>",
+ "<|im_end|>",
+ "<|im_start|>",
+ ]
+ self.holding_chunk = ""
+ self.complete_response = ""
+ self.response_uptil_now = ""
+ _model_info = (
+ self.logging_obj.model_call_details.get("litellm_params", {}).get(
+ "model_info", {}
+ )
+ or {}
+ )
+ self._hidden_params = {
+ "model_id": (_model_info.get("id", None)),
+ } # returned as x-litellm-model-id response header in proxy
+
+ self._hidden_params["additional_headers"] = process_response_headers(
+ _response_headers or {}
+ ) # GUARANTEE OPENAI HEADERS IN RESPONSE
+
+ self._response_headers = _response_headers
+ self.response_id = None
+ self.logging_loop = None
+ self.rules = Rules()
+ self.stream_options = stream_options or getattr(
+ logging_obj, "stream_options", None
+ )
+ self.messages = getattr(logging_obj, "messages", None)
+ self.sent_stream_usage = False
+        self.send_stream_usage = self.check_send_stream_usage(self.stream_options)
+ self.tool_call = False
+ self.chunks: List = (
+ []
+ ) # keep track of the returned chunks - used for calculating the input/output tokens for stream options
+ self.is_function_call = self.check_is_function_call(logging_obj=logging_obj)
+
+ def __iter__(self):
+ return self
+
+ def __aiter__(self):
+ return self
+
+ def check_send_stream_usage(self, stream_options: Optional[dict]):
+ return (
+ stream_options is not None
+ and stream_options.get("include_usage", False) is True
+ )
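Usage chunks are only forwarded when the caller opts in via `stream_options`, which is what `check_send_stream_usage` gates on. A hedged sketch from the caller's side (model name illustrative):

```python
# Opting in to a final usage chunk, mirroring the OpenAI streaming API.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in response:
    # when include_usage is True, the final chunk carries a `usage` object
    if getattr(chunk, "usage", None) is not None:
        print(chunk.usage)
```
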
+
+ def check_is_function_call(self, logging_obj) -> bool:
+ if hasattr(logging_obj, "optional_params") and isinstance(
+ logging_obj.optional_params, dict
+ ):
+ if (
+ "litellm_param_is_function_call" in logging_obj.optional_params
+ and logging_obj.optional_params["litellm_param_is_function_call"]
+ is True
+ ):
+ return True
+
+ return False
+
+ def process_chunk(self, chunk: str):
+ """
+        NLP Cloud streaming returns the entire response for each chunk. Process this to return only the new delta.
+ """
+ try:
+ chunk = chunk.strip()
+ self.complete_response = self.complete_response.strip()
+
+ if chunk.startswith(self.complete_response):
+ # Remove last_sent_chunk only if it appears at the start of the new chunk
+ chunk = chunk[len(self.complete_response) :]
+
+ self.complete_response += chunk
+ return chunk
+ except Exception as e:
+ raise e
+
+ def safety_checker(self) -> None:
+ """
+ Fixes - https://github.com/BerriAI/litellm/issues/5158
+
+        If the model enters a loop and starts repeating the same chunk, break out of the loop and raise an InternalServerError - this allows for retries.
+
+        Raises - InternalServerError, if the LLM enters an infinite loop while streaming
+ """
+ if len(self.chunks) >= litellm.REPEATED_STREAMING_CHUNK_LIMIT:
+ # Get the last n chunks
+ last_chunks = self.chunks[-litellm.REPEATED_STREAMING_CHUNK_LIMIT :]
+
+ # Extract the relevant content from the chunks
+ last_contents = [chunk.choices[0].delta.content for chunk in last_chunks]
+
+ # Check if all extracted contents are identical
+ if all(content == last_contents[0] for content in last_contents):
+ if (
+ last_contents[0] is not None
+ and isinstance(last_contents[0], str)
+ and len(last_contents[0]) > 2
+ ): # ignore empty content - https://github.com/BerriAI/litellm/issues/5158#issuecomment-2287156946
+ # All last n chunks are identical
+ raise litellm.InternalServerError(
+ message="The model is repeating the same chunk = {}.".format(
+ last_contents[0]
+ ),
+ model="",
+ llm_provider="",
+ )
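The guard above compares the last `REPEATED_STREAMING_CHUNK_LIMIT` deltas and errors out if they are all identical and non-trivial. A standalone sketch of the same check (the limit value is an assumption, not read from litellm):

```python
# Standalone illustration of the repetition guard in safety_checker (not the litellm implementation).
REPEATED_STREAMING_CHUNK_LIMIT = 100  # assumed value; litellm exposes litellm.REPEATED_STREAMING_CHUNK_LIMIT

def is_looping(chunk_contents: list) -> bool:
    if len(chunk_contents) < REPEATED_STREAMING_CHUNK_LIMIT:
        return False
    last = chunk_contents[-REPEATED_STREAMING_CHUNK_LIMIT:]
    # identical, non-empty content across the whole window -> likely an infinite loop
    return all(c == last[0] for c in last) and isinstance(last[0], str) and len(last[0]) > 2

print(is_looping(["abc"] * 100))         # True  -> would raise InternalServerError upstream
print(is_looping(["a", "b", "c"] * 40))  # False -> normal streaming
```
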
+
+ def check_special_tokens(self, chunk: str, finish_reason: Optional[str]):
+ """
+ Output parse / special tokens for sagemaker + hf streaming.
+ """
+ hold = False
+ if (
+ self.custom_llm_provider != "huggingface"
+ and self.custom_llm_provider != "sagemaker"
+ ):
+ return hold, chunk
+
+ if finish_reason:
+ for token in self.special_tokens:
+ if token in chunk:
+ chunk = chunk.replace(token, "")
+ return hold, chunk
+
+ if self.sent_first_chunk is True:
+ return hold, chunk
+
+ curr_chunk = self.holding_chunk + chunk
+ curr_chunk = curr_chunk.strip()
+
+ for token in self.special_tokens:
+ if len(curr_chunk) < len(token) and curr_chunk in token:
+ hold = True
+ self.holding_chunk = curr_chunk
+ elif len(curr_chunk) >= len(token):
+ if token in curr_chunk:
+ self.holding_chunk = curr_chunk.replace(token, "")
+ hold = True
+ else:
+ pass
+
+ if hold is False: # reset
+ self.holding_chunk = ""
+ return hold, curr_chunk
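The `holding_chunk` buffer above exists so that a partial special token at the end of one chunk is not emitted before the next chunk arrives. A toy version of that prefix check (token list trimmed for brevity):

```python
# Toy illustration of the hold/strip logic in check_special_tokens (not the litellm implementation).
SPECIAL_TOKENS = ["<|im_end|>", "<|im_start|>"]

def should_hold(holding_chunk: str, chunk: str):
    curr = (holding_chunk + chunk).strip()
    for token in SPECIAL_TOKENS:
        if len(curr) < len(token) and curr in token:
            return True, curr                     # could still grow into a special token -> buffer it
        if token in curr:
            return True, curr.replace(token, "")  # strip the token, keep the remainder buffered
    return False, curr                            # safe to emit

print(should_hold("", "<|im_"))         # (True, '<|im_')  - prefix of a special token, held back
print(should_hold("<|im_", "end|>hi"))  # (True, 'hi')     - completed token stripped out
print(should_hold("", "hello"))         # (False, 'hello') - emitted as-is
```
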
+
+ def handle_anthropic_text_chunk(self, chunk):
+ """
+ For old anthropic models - claude-1, claude-2.
+
+ Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator()
+ """
+ str_line = chunk
+ if isinstance(chunk, bytes): # Handle binary data
+ str_line = chunk.decode("utf-8") # Convert bytes to string
+ text = ""
+ is_finished = False
+ finish_reason = None
+ if str_line.startswith("data:"):
+ data_json = json.loads(str_line[5:])
+ type_chunk = data_json.get("type", None)
+ if type_chunk == "completion":
+ text = data_json.get("completion")
+ finish_reason = data_json.get("stop_reason")
+ if finish_reason is not None:
+ is_finished = True
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ elif "error" in str_line:
+ raise ValueError(f"Unable to parse response. Original response: {str_line}")
+ else:
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+
+ def handle_predibase_chunk(self, chunk):
+ try:
+ if not isinstance(chunk, str):
+ chunk = chunk.decode(
+ "utf-8"
+ ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
+ text = ""
+ is_finished = False
+ finish_reason = ""
+ print_verbose(f"chunk: {chunk}")
+ if chunk.startswith("data:"):
+ data_json = json.loads(chunk[5:])
+ print_verbose(f"data json: {data_json}")
+ if "token" in data_json and "text" in data_json["token"]:
+ text = data_json["token"]["text"]
+ if data_json.get("details", False) and data_json["details"].get(
+ "finish_reason", False
+ ):
+ is_finished = True
+ finish_reason = data_json["details"]["finish_reason"]
+ elif data_json.get(
+ "generated_text", False
+ ): # if full generated text exists, then stream is complete
+ text = "" # don't return the final bos token
+ is_finished = True
+ finish_reason = "stop"
+ elif data_json.get("error", False):
+ raise Exception(data_json.get("error"))
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ elif "error" in chunk:
+ raise ValueError(chunk)
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception as e:
+ raise e
+
+ def handle_huggingface_chunk(self, chunk):
+ try:
+ if not isinstance(chunk, str):
+ chunk = chunk.decode(
+ "utf-8"
+ ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
+ text = ""
+ is_finished = False
+ finish_reason = ""
+ print_verbose(f"chunk: {chunk}")
+ if chunk.startswith("data:"):
+ data_json = json.loads(chunk[5:])
+ print_verbose(f"data json: {data_json}")
+ if "token" in data_json and "text" in data_json["token"]:
+ text = data_json["token"]["text"]
+ if data_json.get("details", False) and data_json["details"].get(
+ "finish_reason", False
+ ):
+ is_finished = True
+ finish_reason = data_json["details"]["finish_reason"]
+ elif data_json.get(
+ "generated_text", False
+ ): # if full generated text exists, then stream is complete
+ text = "" # don't return the final bos token
+ is_finished = True
+ finish_reason = "stop"
+ elif data_json.get("error", False):
+ raise Exception(data_json.get("error"))
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ elif "error" in chunk:
+ raise ValueError(chunk)
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception as e:
+ raise e
+
+ def handle_ai21_chunk(self, chunk): # fake streaming
+ chunk = chunk.decode("utf-8")
+ data_json = json.loads(chunk)
+ try:
+ text = data_json["completions"][0]["data"]["text"]
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_maritalk_chunk(self, chunk): # fake streaming
+ chunk = chunk.decode("utf-8")
+ data_json = json.loads(chunk)
+ try:
+ text = data_json["answer"]
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_nlp_cloud_chunk(self, chunk):
+ text = ""
+ is_finished = False
+ finish_reason = ""
+ try:
+ if "dolphin" in self.model:
+ chunk = self.process_chunk(chunk=chunk)
+ else:
+ data_json = json.loads(chunk)
+ chunk = data_json["generated_text"]
+ text = chunk
+ if "[DONE]" in text:
+ text = text.replace("[DONE]", "")
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_aleph_alpha_chunk(self, chunk):
+ chunk = chunk.decode("utf-8")
+ data_json = json.loads(chunk)
+ try:
+ text = data_json["completions"][0]["completion"]
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_cohere_chunk(self, chunk):
+ chunk = chunk.decode("utf-8")
+ data_json = json.loads(chunk)
+ try:
+ text = ""
+ is_finished = False
+ finish_reason = ""
+ index: Optional[int] = None
+ if "index" in data_json:
+ index = data_json.get("index")
+ if "text" in data_json:
+ text = data_json["text"]
+ elif "is_finished" in data_json:
+ is_finished = data_json["is_finished"]
+ finish_reason = data_json["finish_reason"]
+ else:
+ raise Exception(data_json)
+ return {
+ "index": index,
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_cohere_chat_chunk(self, chunk):
+ chunk = chunk.decode("utf-8")
+ data_json = json.loads(chunk)
+ print_verbose(f"chunk: {chunk}")
+ try:
+ text = ""
+ is_finished = False
+ finish_reason = ""
+ if "text" in data_json:
+ text = data_json["text"]
+ elif "is_finished" in data_json and data_json["is_finished"] is True:
+ is_finished = data_json["is_finished"]
+ finish_reason = data_json["finish_reason"]
+ else:
+ return
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_azure_chunk(self, chunk):
+ is_finished = False
+ finish_reason = ""
+ text = ""
+ print_verbose(f"chunk: {chunk}")
+ if "data: [DONE]" in chunk:
+ text = ""
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ elif chunk.startswith("data:"):
+ data_json = json.loads(chunk[5:]) # chunk.startswith("data:"):
+ try:
+ if len(data_json["choices"]) > 0:
+ delta = data_json["choices"][0]["delta"]
+ text = "" if delta is None else delta.get("content", "")
+ if data_json["choices"][0].get("finish_reason", None):
+ is_finished = True
+ finish_reason = data_json["choices"][0]["finish_reason"]
+ print_verbose(
+ f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}"
+ )
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(
+ f"Unable to parse response. Original response: {chunk}"
+ )
+ elif "error" in chunk:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+ else:
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+
+ def handle_replicate_chunk(self, chunk):
+ try:
+ text = ""
+ is_finished = False
+ finish_reason = ""
+ if "output" in chunk:
+ text = chunk["output"]
+ if "status" in chunk:
+ if chunk["status"] == "succeeded":
+ is_finished = True
+ finish_reason = "stop"
+ elif chunk.get("error", None):
+ raise Exception(chunk["error"])
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ except Exception:
+ raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+ def handle_openai_chat_completion_chunk(self, chunk):
+ try:
+ print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+ str_line = chunk
+ text = ""
+ is_finished = False
+ finish_reason = None
+ logprobs = None
+ usage = None
+ if str_line and str_line.choices and len(str_line.choices) > 0:
+ if (
+ str_line.choices[0].delta is not None
+ and str_line.choices[0].delta.content is not None
+ ):
+ text = str_line.choices[0].delta.content
+ else: # function/tool calling chunk - when content is None. in this case we just return the original chunk from openai
+ pass
+ if str_line.choices[0].finish_reason:
+ is_finished = True
+ finish_reason = str_line.choices[0].finish_reason
+
+ # checking for logprobs
+ if (
+ hasattr(str_line.choices[0], "logprobs")
+ and str_line.choices[0].logprobs is not None
+ ):
+ logprobs = str_line.choices[0].logprobs
+ else:
+ logprobs = None
+
+ usage = getattr(str_line, "usage", None)
+
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ "logprobs": logprobs,
+ "original_chunk": str_line,
+ "usage": usage,
+ }
+ except Exception as e:
+ raise e
+
+ def handle_azure_text_completion_chunk(self, chunk):
+ try:
+ print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+ text = ""
+ is_finished = False
+ finish_reason = None
+ choices = getattr(chunk, "choices", [])
+ if len(choices) > 0:
+ text = choices[0].text
+ if choices[0].finish_reason is not None:
+ is_finished = True
+ finish_reason = choices[0].finish_reason
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+
+ except Exception as e:
+ raise e
+
+ def handle_openai_text_completion_chunk(self, chunk):
+ try:
+ print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+ text = ""
+ is_finished = False
+ finish_reason = None
+ usage = None
+ choices = getattr(chunk, "choices", [])
+ if len(choices) > 0:
+ text = choices[0].text
+ if choices[0].finish_reason is not None:
+ is_finished = True
+ finish_reason = choices[0].finish_reason
+ usage = getattr(chunk, "usage", None)
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ "usage": usage,
+ }
+
+ except Exception as e:
+ raise e
+
+ def handle_baseten_chunk(self, chunk):
+ try:
+ chunk = chunk.decode("utf-8")
+ if len(chunk) > 0:
+ if chunk.startswith("data:"):
+ data_json = json.loads(chunk[5:])
+ if "token" in data_json and "text" in data_json["token"]:
+ return data_json["token"]["text"]
+ else:
+ return ""
+ data_json = json.loads(chunk)
+ if "model_output" in data_json:
+ if (
+ isinstance(data_json["model_output"], dict)
+ and "data" in data_json["model_output"]
+ and isinstance(data_json["model_output"]["data"], list)
+ ):
+ return data_json["model_output"]["data"][0]
+ elif isinstance(data_json["model_output"], str):
+ return data_json["model_output"]
+ elif "completion" in data_json and isinstance(
+ data_json["completion"], str
+ ):
+ return data_json["completion"]
+ else:
+ raise ValueError(
+ f"Unable to parse response. Original response: {chunk}"
+ )
+ else:
+ return ""
+ else:
+ return ""
+ except Exception as e:
+ verbose_logger.exception(
+                "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occurred - {}".format(
+ str(e)
+ )
+ )
+ return ""
+
+ def handle_cloudlfare_stream(self, chunk):
+ try:
+ print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+ chunk = chunk.decode("utf-8")
+ str_line = chunk
+ text = ""
+ is_finished = False
+ finish_reason = None
+
+ if "[DONE]" in chunk:
+ return {"text": text, "is_finished": True, "finish_reason": "stop"}
+ elif str_line.startswith("data:"):
+ data_json = json.loads(str_line[5:])
+ print_verbose(f"delta content: {data_json}")
+ text = data_json["response"]
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ else:
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+
+ except Exception as e:
+ raise e
+
+ def handle_ollama_stream(self, chunk):
+ try:
+ if isinstance(chunk, dict):
+ json_chunk = chunk
+ else:
+ json_chunk = json.loads(chunk)
+ if "error" in json_chunk:
+ raise Exception(f"Ollama Error - {json_chunk}")
+
+ text = ""
+ is_finished = False
+ finish_reason = None
+ if json_chunk["done"] is True:
+ text = ""
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ elif json_chunk["response"]:
+ print_verbose(f"delta content: {json_chunk}")
+ text = json_chunk["response"]
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ else:
+ raise Exception(f"Ollama Error - {json_chunk}")
+ except Exception as e:
+ raise e
+
+ def handle_ollama_chat_stream(self, chunk):
+ # for ollama_chat/ provider
+ try:
+ if isinstance(chunk, dict):
+ json_chunk = chunk
+ else:
+ json_chunk = json.loads(chunk)
+ if "error" in json_chunk:
+ raise Exception(f"Ollama Error - {json_chunk}")
+
+ text = ""
+ is_finished = False
+ finish_reason = None
+ if json_chunk["done"] is True:
+ text = ""
+ is_finished = True
+ finish_reason = "stop"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ elif "message" in json_chunk:
+ print_verbose(f"delta content: {json_chunk}")
+ text = json_chunk["message"]["content"]
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ }
+ else:
+ raise Exception(f"Ollama Error - {json_chunk}")
+ except Exception as e:
+ raise e
+
+ def handle_watsonx_stream(self, chunk):
+ try:
+ if isinstance(chunk, dict):
+ parsed_response = chunk
+ elif isinstance(chunk, (str, bytes)):
+ if isinstance(chunk, bytes):
+ chunk = chunk.decode("utf-8")
+ if "generated_text" in chunk:
+ response = chunk.replace("data: ", "").strip()
+ parsed_response = json.loads(response)
+ else:
+ return {
+ "text": "",
+ "is_finished": False,
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ }
+ else:
+ print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+ raise ValueError(
+ f"Unable to parse response. Original response: {chunk}"
+ )
+ results = parsed_response.get("results", [])
+ if len(results) > 0:
+ text = results[0].get("generated_text", "")
+ finish_reason = results[0].get("stop_reason")
+ is_finished = finish_reason != "not_finished"
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ "prompt_tokens": results[0].get("input_token_count", 0),
+ "completion_tokens": results[0].get("generated_token_count", 0),
+ }
+ return {"text": "", "is_finished": False}
+ except Exception as e:
+ raise e
+
+ def handle_triton_stream(self, chunk):
+ try:
+ if isinstance(chunk, dict):
+ parsed_response = chunk
+ elif isinstance(chunk, (str, bytes)):
+ if isinstance(chunk, bytes):
+ chunk = chunk.decode("utf-8")
+ if "text_output" in chunk:
+ response = chunk.replace("data: ", "").strip()
+ parsed_response = json.loads(response)
+ else:
+ return {
+ "text": "",
+ "is_finished": False,
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ }
+ else:
+ print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+ raise ValueError(
+ f"Unable to parse response. Original response: {chunk}"
+ )
+ text = parsed_response.get("text_output", "")
+ finish_reason = parsed_response.get("stop_reason")
+ is_finished = parsed_response.get("is_finished", False)
+ return {
+ "text": text,
+ "is_finished": is_finished,
+ "finish_reason": finish_reason,
+ "prompt_tokens": parsed_response.get("input_token_count", 0),
+ "completion_tokens": parsed_response.get("generated_token_count", 0),
+ }
+ return {"text": "", "is_finished": False}
+ except Exception as e:
+ raise e
+
+ def handle_clarifai_completion_chunk(self, chunk):
+ try:
+ if isinstance(chunk, dict):
+ parsed_response = chunk
+ elif isinstance(chunk, (str, bytes)):
+ if isinstance(chunk, bytes):
+ parsed_response = chunk.decode("utf-8")
+ else:
+ parsed_response = chunk
+ else:
+ raise ValueError("Unable to parse streaming chunk")
+ if isinstance(parsed_response, dict):
+ data_json = parsed_response
+ else:
+ data_json = json.loads(parsed_response)
+ text = (
+ data_json.get("outputs", "")[0]
+ .get("data", "")
+ .get("text", "")
+ .get("raw", "")
+ )
+ len(
+ encoding.encode(
+ data_json.get("outputs", "")[0]
+ .get("input", "")
+ .get("data", "")
+ .get("text", "")
+ .get("raw", "")
+ )
+ )
+ len(encoding.encode(text))
+ return {
+ "text": text,
+ "is_finished": True,
+ }
+ except Exception as e:
+ verbose_logger.exception(
+                "litellm.CustomStreamWrapper.handle_clarifai_completion_chunk(): Exception occurred - {}".format(
+ str(e)
+ )
+ )
+ return ""
+
+ def model_response_creator(
+ self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None
+ ):
+ _model = self.model
+ _received_llm_provider = self.custom_llm_provider
+ _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore
+ if (
+ _received_llm_provider == "openai"
+ and _received_llm_provider != _logging_obj_llm_provider
+ ):
+ _model = "{}/{}".format(_logging_obj_llm_provider, _model)
+ if chunk is None:
+ chunk = {}
+ else:
+ # pop model keyword
+ chunk.pop("model", None)
+
+ model_response = ModelResponse(
+ stream=True, model=_model, stream_options=self.stream_options, **chunk
+ )
+ if self.response_id is not None:
+ model_response.id = self.response_id
+ else:
+ self.response_id = model_response.id # type: ignore
+ if self.system_fingerprint is not None:
+ model_response.system_fingerprint = self.system_fingerprint
+ if hidden_params is not None:
+ model_response._hidden_params = hidden_params
+ model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
+ model_response._hidden_params["created_at"] = time.time()
+ model_response._hidden_params = {
+ **model_response._hidden_params,
+ **self._hidden_params,
+ }
+
+ if (
+ len(model_response.choices) > 0
+ and getattr(model_response.choices[0], "delta") is not None
+ ):
+ # do nothing, if object instantiated
+ pass
+ else:
+ model_response.choices = [StreamingChoices(finish_reason=None)]
+ return model_response
+
+ def is_delta_empty(self, delta: Delta) -> bool:
+ is_empty = True
+ if delta.content is not None:
+ is_empty = False
+ elif delta.tool_calls is not None:
+ is_empty = False
+ elif delta.function_call is not None:
+ is_empty = False
+ return is_empty
+
+ def return_processed_chunk_logic( # noqa
+ self,
+ completion_obj: dict,
+ model_response: ModelResponseStream,
+ response_obj: dict,
+ ):
+
+ print_verbose(
+ f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
+ )
+ if (
+ "content" in completion_obj
+ and (
+ isinstance(completion_obj["content"], str)
+ and len(completion_obj["content"]) > 0
+ )
+ or (
+ "tool_calls" in completion_obj
+ and completion_obj["tool_calls"] is not None
+ and len(completion_obj["tool_calls"]) > 0
+ )
+ or (
+ "function_call" in completion_obj
+ and completion_obj["function_call"] is not None
+ )
+ ): # cannot set content of an OpenAI Object to be an empty string
+ self.safety_checker()
+ hold, model_response_str = self.check_special_tokens(
+ chunk=completion_obj["content"],
+ finish_reason=model_response.choices[0].finish_reason,
+ ) # filter out bos/eos tokens from openai-compatible hf endpoints
+ print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
+ if hold is False:
+ ## check if openai/azure chunk
+ original_chunk = response_obj.get("original_chunk", None)
+ if original_chunk:
+ model_response.id = original_chunk.id
+ self.response_id = original_chunk.id
+ if len(original_chunk.choices) > 0:
+ choices = []
+ for choice in original_chunk.choices:
+ try:
+ if isinstance(choice, BaseModel):
+ choice_json = choice.model_dump()
+ choice_json.pop(
+ "finish_reason", None
+ ) # for mistral etc. which return a value in their last chunk (not-openai compatible).
+ print_verbose(f"choice_json: {choice_json}")
+ choices.append(StreamingChoices(**choice_json))
+ except Exception:
+ choices.append(StreamingChoices())
+ print_verbose(f"choices in streaming: {choices}")
+ setattr(model_response, "choices", choices)
+ else:
+ return
+ model_response.system_fingerprint = (
+ original_chunk.system_fingerprint
+ )
+ setattr(
+ model_response,
+ "citations",
+ getattr(original_chunk, "citations", None),
+ )
+ print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
+ if self.sent_first_chunk is False:
+ model_response.choices[0].delta["role"] = "assistant"
+ self.sent_first_chunk = True
+ elif self.sent_first_chunk is True and hasattr(
+ model_response.choices[0].delta, "role"
+ ):
+ _initial_delta = model_response.choices[0].delta.model_dump()
+ _initial_delta.pop("role", None)
+ model_response.choices[0].delta = Delta(**_initial_delta)
+ print_verbose(
+ f"model_response.choices[0].delta: {model_response.choices[0].delta}"
+ )
+ else:
+ ## else
+ completion_obj["content"] = model_response_str
+ if self.sent_first_chunk is False:
+ completion_obj["role"] = "assistant"
+ self.sent_first_chunk = True
+
+ model_response.choices[0].delta = Delta(**completion_obj)
+ _index: Optional[int] = completion_obj.get("index")
+ if _index is not None:
+ model_response.choices[0].index = _index
+ print_verbose(f"returning model_response: {model_response}")
+ return model_response
+ else:
+ return
+ elif self.received_finish_reason is not None:
+ if self.sent_last_chunk is True:
+ # Bedrock returns the guardrail trace in the last chunk - we want to return this here
+ if self.custom_llm_provider == "bedrock" and "trace" in model_response:
+ return model_response
+
+ # Default - return StopIteration
+ raise StopIteration
+ # flush any remaining holding chunk
+ if len(self.holding_chunk) > 0:
+ if model_response.choices[0].delta.content is None:
+ model_response.choices[0].delta.content = self.holding_chunk
+ else:
+ model_response.choices[0].delta.content = (
+ self.holding_chunk + model_response.choices[0].delta.content
+ )
+ self.holding_chunk = ""
+ # if delta is None
+ _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta)
+
+ if _is_delta_empty:
+ # get any function call arguments
+ model_response.choices[0].finish_reason = map_finish_reason(
+ finish_reason=self.received_finish_reason
+ ) # ensure consistent output to openai
+
+ self.sent_last_chunk = True
+
+ return model_response
+ elif (
+ model_response.choices[0].delta.tool_calls is not None
+ or model_response.choices[0].delta.function_call is not None
+ ):
+ if self.sent_first_chunk is False:
+ model_response.choices[0].delta["role"] = "assistant"
+ self.sent_first_chunk = True
+ return model_response
+ elif (
+ len(model_response.choices) > 0
+ and hasattr(model_response.choices[0].delta, "audio")
+ and model_response.choices[0].delta.audio is not None
+ ):
+ return model_response
+ else:
+ if hasattr(model_response, "usage"):
+ self.chunks.append(model_response)
+ return
+
+ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
+ model_response = self.model_response_creator()
+ response_obj: dict = {}
+ try:
+ # return this for all models
+ completion_obj = {"content": ""}
+
+ if (
+ isinstance(chunk, dict)
+ and generic_chunk_has_all_required_fields(
+ chunk=chunk
+ ) # check if chunk is a generic streaming chunk
+ ) or (
+ self.custom_llm_provider
+ and (
+ self.custom_llm_provider == "anthropic"
+ or self.custom_llm_provider in litellm._custom_providers
+ )
+ ):
+
+ if self.received_finish_reason is not None:
+ if "provider_specific_fields" not in chunk:
+ raise StopIteration
+ anthropic_response_obj: GChunk = chunk
+ completion_obj["content"] = anthropic_response_obj["text"]
+ if anthropic_response_obj["is_finished"]:
+ self.received_finish_reason = anthropic_response_obj[
+ "finish_reason"
+ ]
+
+ if anthropic_response_obj["finish_reason"]:
+ self.intermittent_finish_reason = anthropic_response_obj[
+ "finish_reason"
+ ]
+
+ if anthropic_response_obj["usage"] is not None:
+ model_response.usage = litellm.Usage(
+ **anthropic_response_obj["usage"]
+ )
+
+ if (
+ "tool_use" in anthropic_response_obj
+ and anthropic_response_obj["tool_use"] is not None
+ ):
+ completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]]
+
+ if (
+ "provider_specific_fields" in anthropic_response_obj
+ and anthropic_response_obj["provider_specific_fields"] is not None
+ ):
+ for key, value in anthropic_response_obj[
+ "provider_specific_fields"
+ ].items():
+ setattr(model_response, key, value)
+
+ response_obj = anthropic_response_obj
+ elif (
+ self.custom_llm_provider
+ and self.custom_llm_provider == "anthropic_text"
+ ):
+ response_obj = self.handle_anthropic_text_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider and self.custom_llm_provider == "clarifai":
+ response_obj = self.handle_clarifai_completion_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.model == "replicate" or self.custom_llm_provider == "replicate":
+ response_obj = self.handle_replicate_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
+ response_obj = self.handle_huggingface_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider and self.custom_llm_provider == "predibase":
+ response_obj = self.handle_predibase_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif (
+ self.custom_llm_provider and self.custom_llm_provider == "baseten"
+ ): # baseten doesn't provide streaming
+ completion_obj["content"] = self.handle_baseten_chunk(chunk)
+ elif (
+ self.custom_llm_provider and self.custom_llm_provider == "ai21"
+ ): # ai21 doesn't provide streaming
+ response_obj = self.handle_ai21_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider and self.custom_llm_provider == "maritalk":
+ response_obj = self.handle_maritalk_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
+ completion_obj["content"] = chunk[0].outputs[0].text
+ elif (
+ self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha"
+ ): # aleph alpha doesn't provide streaming
+ response_obj = self.handle_aleph_alpha_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "nlp_cloud":
+ try:
+ response_obj = self.handle_nlp_cloud_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ except Exception as e:
+ if self.received_finish_reason:
+ raise e
+ else:
+ if self.sent_first_chunk is False:
+ raise Exception("An unknown error occurred with the stream")
+ self.received_finish_reason = "stop"
+ elif self.custom_llm_provider == "vertex_ai":
+ import proto # type: ignore
+
+ if hasattr(chunk, "candidates") is True:
+ try:
+ try:
+ completion_obj["content"] = chunk.text
+ except Exception as e:
+ if "Part has no text." in str(e):
+ ## check for function calling
+ function_call = (
+ chunk.candidates[0].content.parts[0].function_call
+ )
+
+ args_dict = {}
+
+ # Check if it's a RepeatedComposite instance
+ for key, val in function_call.args.items():
+ if isinstance(
+ val,
+ proto.marshal.collections.repeated.RepeatedComposite,
+ ):
+ # If so, convert to list
+ args_dict[key] = [v for v in val]
+ else:
+ args_dict[key] = val
+
+ try:
+ args_str = json.dumps(args_dict)
+ except Exception as e:
+ raise e
+ _delta_obj = litellm.utils.Delta(
+ content=None,
+ tool_calls=[
+ {
+ "id": f"call_{str(uuid.uuid4())}",
+ "function": {
+ "arguments": args_str,
+ "name": function_call.name,
+ },
+ "type": "function",
+ }
+ ],
+ )
+ _streaming_response = StreamingChoices(delta=_delta_obj)
+ _model_response = ModelResponse(stream=True)
+ _model_response.choices = [_streaming_response]
+ response_obj = {"original_chunk": _model_response}
+ else:
+ raise e
+ if (
+ hasattr(chunk.candidates[0], "finish_reason")
+ and chunk.candidates[0].finish_reason.name
+ != "FINISH_REASON_UNSPECIFIED"
+ ): # every non-final chunk in vertex ai has this
+ self.received_finish_reason = chunk.candidates[
+ 0
+ ].finish_reason.name
+ except Exception:
+ if chunk.candidates[0].finish_reason.name == "SAFETY":
+ raise Exception(
+ f"The response was blocked by VertexAI. {str(chunk)}"
+ )
+ else:
+ completion_obj["content"] = str(chunk)
+ elif self.custom_llm_provider == "cohere":
+ response_obj = self.handle_cohere_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "cohere_chat":
+ response_obj = self.handle_cohere_chat_chunk(chunk)
+ if response_obj is None:
+ return
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+
+ elif self.custom_llm_provider == "petals":
+ if len(self.completion_stream) == 0:
+ if self.received_finish_reason is not None:
+ raise StopIteration
+ else:
+ self.received_finish_reason = "stop"
+ chunk_size = 30
+ new_chunk = self.completion_stream[:chunk_size]
+ completion_obj["content"] = new_chunk
+ self.completion_stream = self.completion_stream[chunk_size:]
+ elif self.custom_llm_provider == "palm":
+ # fake streaming
+ response_obj = {}
+ if len(self.completion_stream) == 0:
+ if self.received_finish_reason is not None:
+ raise StopIteration
+ else:
+ self.received_finish_reason = "stop"
+ chunk_size = 30
+ new_chunk = self.completion_stream[:chunk_size]
+ completion_obj["content"] = new_chunk
+ self.completion_stream = self.completion_stream[chunk_size:]
+ elif self.custom_llm_provider == "ollama":
+ response_obj = self.handle_ollama_stream(chunk)
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "ollama_chat":
+ response_obj = self.handle_ollama_chat_stream(chunk)
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "cloudflare":
+ response_obj = self.handle_cloudlfare_stream(chunk)
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "watsonx":
+ response_obj = self.handle_watsonx_stream(chunk)
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "triton":
+ response_obj = self.handle_triton_stream(chunk)
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "text-completion-openai":
+ response_obj = self.handle_openai_text_completion_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ if response_obj["usage"] is not None:
+ model_response.usage = litellm.Usage(
+ prompt_tokens=response_obj["usage"].prompt_tokens,
+ completion_tokens=response_obj["usage"].completion_tokens,
+ total_tokens=response_obj["usage"].total_tokens,
+ )
+ elif self.custom_llm_provider == "text-completion-codestral":
+ response_obj = litellm.MistralTextCompletionConfig()._chunk_parser(
+ chunk
+ )
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+                    if response_obj.get("usage") is not None:
+ model_response.usage = litellm.Usage(
+ prompt_tokens=response_obj["usage"].prompt_tokens,
+ completion_tokens=response_obj["usage"].completion_tokens,
+ total_tokens=response_obj["usage"].total_tokens,
+ )
+ elif self.custom_llm_provider == "azure_text":
+ response_obj = self.handle_azure_text_completion_chunk(chunk)
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ elif self.custom_llm_provider == "cached_response":
+ response_obj = {
+ "text": chunk.choices[0].delta.content,
+ "is_finished": True,
+ "finish_reason": chunk.choices[0].finish_reason,
+ "original_chunk": chunk,
+ "tool_calls": (
+ chunk.choices[0].delta.tool_calls
+ if hasattr(chunk.choices[0].delta, "tool_calls")
+ else None
+ ),
+ }
+
+ completion_obj["content"] = response_obj["text"]
+ if response_obj["tool_calls"] is not None:
+ completion_obj["tool_calls"] = response_obj["tool_calls"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if hasattr(chunk, "id"):
+ model_response.id = chunk.id
+ self.response_id = chunk.id
+ if hasattr(chunk, "system_fingerprint"):
+ self.system_fingerprint = chunk.system_fingerprint
+ if response_obj["is_finished"]:
+ self.received_finish_reason = response_obj["finish_reason"]
+ else: # openai / azure chat model
+ if self.custom_llm_provider == "azure":
+ if hasattr(chunk, "model"):
+                        # for azure, we need to pass the model from the original chunk
+ self.model = chunk.model
+ response_obj = self.handle_openai_chat_completion_chunk(chunk)
+ if response_obj is None:
+ return
+ completion_obj["content"] = response_obj["text"]
+ print_verbose(f"completion obj content: {completion_obj['content']}")
+ if response_obj["is_finished"]:
+ if response_obj["finish_reason"] == "error":
+ raise Exception(
+ "{} raised a streaming error - finish_reason: error, no content string given. Received Chunk={}".format(
+ self.custom_llm_provider, response_obj
+ )
+ )
+ self.received_finish_reason = response_obj["finish_reason"]
+ if response_obj.get("original_chunk", None) is not None:
+ if hasattr(response_obj["original_chunk"], "id"):
+ model_response.id = response_obj["original_chunk"].id
+ self.response_id = model_response.id
+ if hasattr(response_obj["original_chunk"], "system_fingerprint"):
+ model_response.system_fingerprint = response_obj[
+ "original_chunk"
+ ].system_fingerprint
+ self.system_fingerprint = response_obj[
+ "original_chunk"
+ ].system_fingerprint
+ if response_obj["logprobs"] is not None:
+ model_response.choices[0].logprobs = response_obj["logprobs"]
+
+ if response_obj["usage"] is not None:
+ if isinstance(response_obj["usage"], dict):
+ model_response.usage = litellm.Usage(
+ prompt_tokens=response_obj["usage"].get(
+ "prompt_tokens", None
+ )
+ or None,
+ completion_tokens=response_obj["usage"].get(
+ "completion_tokens", None
+ )
+ or None,
+ total_tokens=response_obj["usage"].get("total_tokens", None)
+ or None,
+ )
+ elif isinstance(response_obj["usage"], BaseModel):
+ model_response.usage = litellm.Usage(
+ **response_obj["usage"].model_dump()
+ )
+
+ model_response.model = self.model
+ print_verbose(
+ f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
+ )
+ ## FUNCTION CALL PARSING
+ if (
+ response_obj is not None
+ and response_obj.get("original_chunk", None) is not None
+ ): # function / tool calling branch - only set for openai/azure compatible endpoints
+ # enter this branch when no content has been passed in response
+ original_chunk = response_obj.get("original_chunk", None)
+ model_response.id = original_chunk.id
+ self.response_id = original_chunk.id
+ if original_chunk.choices and len(original_chunk.choices) > 0:
+ delta = original_chunk.choices[0].delta
+ if delta is not None and (
+ delta.function_call is not None or delta.tool_calls is not None
+ ):
+ try:
+ model_response.system_fingerprint = (
+ original_chunk.system_fingerprint
+ )
+ ## AZURE - check if arguments is not None
+ if (
+ original_chunk.choices[0].delta.function_call
+ is not None
+ ):
+ if (
+ getattr(
+ original_chunk.choices[0].delta.function_call,
+ "arguments",
+ )
+ is None
+ ):
+ original_chunk.choices[
+ 0
+ ].delta.function_call.arguments = ""
+ elif original_chunk.choices[0].delta.tool_calls is not None:
+ if isinstance(
+ original_chunk.choices[0].delta.tool_calls, list
+ ):
+ for t in original_chunk.choices[0].delta.tool_calls:
+                                        if hasattr(t, "function") and hasattr(
+                                            t.function, "arguments"
+                                        ):
+ if (
+ getattr(
+ t.function,
+ "arguments",
+ )
+ is None
+ ):
+ t.function.arguments = ""
+ _json_delta = delta.model_dump()
+ print_verbose(f"_json_delta: {_json_delta}")
+ if "role" not in _json_delta or _json_delta["role"] is None:
+ _json_delta["role"] = (
+ "assistant" # mistral's api returns role as None
+ )
+ if "tool_calls" in _json_delta and isinstance(
+ _json_delta["tool_calls"], list
+ ):
+ for tool in _json_delta["tool_calls"]:
+ if (
+ isinstance(tool, dict)
+ and "function" in tool
+ and isinstance(tool["function"], dict)
+ and ("type" not in tool or tool["type"] is None)
+ ):
+ # if function returned but type set to None - mistral's api returns type: None
+ tool["type"] = "function"
+ model_response.choices[0].delta = Delta(**_json_delta)
+ except Exception as e:
+ verbose_logger.exception(
+                                "litellm.CustomStreamWrapper.chunk_creator(): Exception occurred - {}".format(
+ str(e)
+ )
+ )
+ model_response.choices[0].delta = Delta()
+ elif (
+ delta is not None and getattr(delta, "audio", None) is not None
+ ):
+ model_response.choices[0].delta.audio = delta.audio
+ else:
+ try:
+ delta = (
+ dict()
+ if original_chunk.choices[0].delta is None
+ else dict(original_chunk.choices[0].delta)
+ )
+ print_verbose(f"original delta: {delta}")
+ model_response.choices[0].delta = Delta(**delta)
+ print_verbose(
+ f"new delta: {model_response.choices[0].delta}"
+ )
+ except Exception:
+ model_response.choices[0].delta = Delta()
+ else:
+ if (
+ self.stream_options is not None
+ and self.stream_options["include_usage"] is True
+ ):
+ return model_response
+ return
+ print_verbose(
+ f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"
+ )
+ print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
+
+ ## CHECK FOR TOOL USE
+ if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0:
+ if self.is_function_call is True: # user passed in 'functions' param
+ completion_obj["function_call"] = completion_obj["tool_calls"][0][
+ "function"
+ ]
+ completion_obj["tool_calls"] = None
+
+ self.tool_call = True
+
+ ## RETURN ARG
+ return self.return_processed_chunk_logic(
+ completion_obj=completion_obj,
+ model_response=model_response, # type: ignore
+ response_obj=response_obj,
+ )
+
+ except StopIteration:
+ raise StopIteration
+ except Exception as e:
+ traceback.format_exc()
+ e.message = str(e)
+ raise exception_type(
+ model=self.model,
+ custom_llm_provider=self.custom_llm_provider,
+ original_exception=e,
+ )
+
+ def set_logging_event_loop(self, loop):
+ """
+ import litellm, asyncio
+
+ loop = asyncio.get_event_loop() # 👈 gets the current event loop
+
+ response = litellm.completion(.., stream=True)
+
+ response.set_logging_event_loop(loop=loop) # 👈 enables async_success callbacks for sync logging
+
+ for chunk in response:
+ ...
+ """
+ self.logging_loop = loop
+
+ def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
+ """
+ Runs success logging in a thread and adds the response to the cache
+ """
+ if litellm.disable_streaming_logging is True:
+ """
+ [NOT RECOMMENDED]
+ Set this via `litellm.disable_streaming_logging = True`.
+
+ Disables streaming logging.
+ """
+ return
+ ## ASYNC LOGGING
+ # Create an event loop for the new thread
+ if self.logging_loop is not None:
+ future = asyncio.run_coroutine_threadsafe(
+ self.logging_obj.async_success_handler(
+ processed_chunk, None, None, cache_hit
+ ),
+ loop=self.logging_loop,
+ )
+ future.result()
+ else:
+ asyncio.run(
+ self.logging_obj.async_success_handler(
+ processed_chunk, None, None, cache_hit
+ )
+ )
+ ## SYNC LOGGING
+ self.logging_obj.success_handler(processed_chunk, None, None, cache_hit)
+
+ ## Sync store in cache
+ if self.logging_obj._llm_caching_handler is not None:
+ self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache(
+ processed_chunk
+ )
+
+ def finish_reason_handler(self):
+ model_response = self.model_response_creator()
+ _finish_reason = self.received_finish_reason or self.intermittent_finish_reason
+ if _finish_reason is not None:
+ model_response.choices[0].finish_reason = _finish_reason
+ else:
+ model_response.choices[0].finish_reason = "stop"
+
+ ## if tool use
+ if (
+ model_response.choices[0].finish_reason == "stop" and self.tool_call
+ ): # don't overwrite for other - potential error finish reasons
+ model_response.choices[0].finish_reason = "tool_calls"
+ return model_response
+
+ def __next__(self): # noqa: PLR0915
+ cache_hit = False
+ if (
+ self.custom_llm_provider is not None
+ and self.custom_llm_provider == "cached_response"
+ ):
+ cache_hit = True
+ try:
+ if self.completion_stream is None:
+ self.fetch_sync_stream()
+ while True:
+ if (
+ isinstance(self.completion_stream, str)
+ or isinstance(self.completion_stream, bytes)
+ or isinstance(self.completion_stream, ModelResponse)
+ ):
+ chunk = self.completion_stream
+ else:
+ chunk = next(self.completion_stream)
+ if chunk is not None and chunk != b"":
+ print_verbose(
+ f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}"
+ )
+ response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
+ print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")
+
+ if response is None:
+ continue
+ ## LOGGING
+ threading.Thread(
+ target=self.run_success_logging_and_cache_storage,
+ args=(response, cache_hit),
+ ).start() # log response
+ choice = response.choices[0]
+ if isinstance(choice, StreamingChoices):
+ self.response_uptil_now += choice.delta.get("content", "") or ""
+ else:
+ self.response_uptil_now += ""
+ self.rules.post_call_rules(
+ input=self.response_uptil_now, model=self.model
+ )
+ # HANDLE STREAM OPTIONS
+ self.chunks.append(response)
+ if hasattr(
+ response, "usage"
+ ): # remove usage from chunk, only send on final chunk
+ # Convert the object to a dictionary
+ obj_dict = response.dict()
+
+ # Remove an attribute (e.g., 'attr2')
+ if "usage" in obj_dict:
+ del obj_dict["usage"]
+
+ # Create a new object without the removed attribute
+ response = self.model_response_creator(
+ chunk=obj_dict, hidden_params=response._hidden_params
+ )
+ # add usage as hidden param
+ if self.sent_last_chunk is True and self.stream_options is None:
+ usage = calculate_total_usage(chunks=self.chunks)
+ response._hidden_params["usage"] = usage
+ # RETURN RESULT
+ return response
+
+ except StopIteration:
+ if self.sent_last_chunk is True:
+ complete_streaming_response = litellm.stream_chunk_builder(
+ chunks=self.chunks, messages=self.messages
+ )
+ response = self.model_response_creator()
+ if complete_streaming_response is not None:
+ setattr(
+ response,
+ "usage",
+ getattr(complete_streaming_response, "usage"),
+ )
+
+ ## LOGGING
+ threading.Thread(
+ target=self.logging_obj.success_handler,
+ args=(response, None, None, cache_hit),
+ ).start() # log response
+
+ if self.sent_stream_usage is False and self.send_stream_usage is True:
+ self.sent_stream_usage = True
+ return response
+ raise # Re-raise StopIteration
+ else:
+ self.sent_last_chunk = True
+ processed_chunk = self.finish_reason_handler()
+ if self.stream_options is None: # add usage as hidden param
+ usage = calculate_total_usage(chunks=self.chunks)
+ processed_chunk._hidden_params["usage"] = usage
+ ## LOGGING
+ threading.Thread(
+ target=self.run_success_logging_and_cache_storage,
+ args=(processed_chunk, cache_hit),
+ ).start() # log response
+ return processed_chunk
+ except Exception as e:
+ traceback_exception = traceback.format_exc()
+ # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
+ threading.Thread(
+ target=self.logging_obj.failure_handler, args=(e, traceback_exception)
+ ).start()
+ if isinstance(e, OpenAIError):
+ raise e
+ else:
+ raise exception_type(
+ model=self.model,
+ original_exception=e,
+ custom_llm_provider=self.custom_llm_provider,
+ )
+
+ def fetch_sync_stream(self):
+ if self.completion_stream is None and self.make_call is not None:
+ # Call make_call to get the completion stream
+ self.completion_stream = self.make_call(client=litellm.module_level_client)
+ self._stream_iter = self.completion_stream.__iter__()
+
+ return self.completion_stream
+
+ async def fetch_stream(self):
+ if self.completion_stream is None and self.make_call is not None:
+ # Call make_call to get the completion stream
+ self.completion_stream = await self.make_call(
+ client=litellm.module_level_aclient
+ )
+ self._stream_iter = self.completion_stream.__aiter__()
+
+ return self.completion_stream
+
+ async def __anext__(self): # noqa: PLR0915
+ cache_hit = False
+ if (
+ self.custom_llm_provider is not None
+ and self.custom_llm_provider == "cached_response"
+ ):
+ cache_hit = True
+ try:
+ if self.completion_stream is None:
+ await self.fetch_stream()
+
+ if (
+ self.custom_llm_provider == "openai"
+ or self.custom_llm_provider == "azure"
+ or self.custom_llm_provider == "custom_openai"
+ or self.custom_llm_provider == "text-completion-openai"
+ or self.custom_llm_provider == "text-completion-codestral"
+ or self.custom_llm_provider == "azure_text"
+ or self.custom_llm_provider == "anthropic"
+ or self.custom_llm_provider == "anthropic_text"
+ or self.custom_llm_provider == "huggingface"
+ or self.custom_llm_provider == "ollama"
+ or self.custom_llm_provider == "ollama_chat"
+ or self.custom_llm_provider == "vertex_ai"
+ or self.custom_llm_provider == "vertex_ai_beta"
+ or self.custom_llm_provider == "sagemaker"
+ or self.custom_llm_provider == "sagemaker_chat"
+ or self.custom_llm_provider == "gemini"
+ or self.custom_llm_provider == "replicate"
+ or self.custom_llm_provider == "cached_response"
+ or self.custom_llm_provider == "predibase"
+ or self.custom_llm_provider == "databricks"
+ or self.custom_llm_provider == "bedrock"
+ or self.custom_llm_provider == "triton"
+ or self.custom_llm_provider == "watsonx"
+ or self.custom_llm_provider in litellm.openai_compatible_endpoints
+ or self.custom_llm_provider in litellm._custom_providers
+ ):
+ async for chunk in self.completion_stream:
+ if chunk == "None" or chunk is None:
+ raise Exception
+ elif (
+ self.custom_llm_provider == "gemini"
+ and hasattr(chunk, "parts")
+ and len(chunk.parts) == 0
+ ):
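+                        # skip Gemini chunks that carry no content parts - there is nothing to surface to the caller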
+ continue
+                    # chunk_creator() does logging/stream chunk building. We need to let it know it's being called in an async func, so we don't double-add chunks.
+ # __anext__ also calls async_success_handler, which does logging
+ print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")
+
+ processed_chunk: Optional[ModelResponse] = self.chunk_creator(
+ chunk=chunk
+ )
+ print_verbose(
+ f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}"
+ )
+ if processed_chunk is None:
+ continue
+                    ## LOGGING
+ executor.submit(
+ self.logging_obj.success_handler,
+ result=processed_chunk,
+ start_time=None,
+ end_time=None,
+ cache_hit=cache_hit,
+ )
+
+ asyncio.create_task(
+ self.logging_obj.async_success_handler(
+ processed_chunk, cache_hit=cache_hit
+ )
+ )
+
+ if self.logging_obj._llm_caching_handler is not None:
+ asyncio.create_task(
+ self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
+ processed_chunk=processed_chunk,
+ )
+ )
+
+ choice = processed_chunk.choices[0]
+ if isinstance(choice, StreamingChoices):
+ self.response_uptil_now += choice.delta.get("content", "") or ""
+ else:
+ self.response_uptil_now += ""
+ self.rules.post_call_rules(
+ input=self.response_uptil_now, model=self.model
+ )
+ self.chunks.append(processed_chunk)
+ if hasattr(
+ processed_chunk, "usage"
+ ): # remove usage from chunk, only send on final chunk
+ # Convert the object to a dictionary
+ obj_dict = processed_chunk.dict()
+
+                        # Remove the 'usage' attribute from intermediate chunks
+ if "usage" in obj_dict:
+ del obj_dict["usage"]
+
+ # Create a new object without the removed attribute
+ processed_chunk = self.model_response_creator(chunk=obj_dict)
+ print_verbose(f"final returned processed chunk: {processed_chunk}")
+ return processed_chunk
+ raise StopAsyncIteration
+ else: # temporary patch for non-aiohttp async calls
+ # example - boto3 bedrock llms
+ while True:
+ if isinstance(self.completion_stream, str) or isinstance(
+ self.completion_stream, bytes
+ ):
+ chunk = self.completion_stream
+ else:
+ chunk = next(self.completion_stream)
+ if chunk is not None and chunk != b"":
+ print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
+ processed_chunk: Optional[ModelResponse] = self.chunk_creator(
+ chunk=chunk
+ )
+ print_verbose(
+ f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
+ )
+ if processed_chunk is None:
+ continue
+ ## LOGGING
+ threading.Thread(
+ target=self.logging_obj.success_handler,
+ args=(processed_chunk, None, None, cache_hit),
+ ).start() # log processed_chunk
+ asyncio.create_task(
+ self.logging_obj.async_success_handler(
+ processed_chunk, cache_hit=cache_hit
+ )
+ )
+
+ choice = processed_chunk.choices[0]
+ if isinstance(choice, StreamingChoices):
+ self.response_uptil_now += (
+ choice.delta.get("content", "") or ""
+ )
+ else:
+ self.response_uptil_now += ""
+ self.rules.post_call_rules(
+ input=self.response_uptil_now, model=self.model
+ )
+ # RETURN RESULT
+ self.chunks.append(processed_chunk)
+ return processed_chunk
+ except (StopAsyncIteration, StopIteration):
+ if self.sent_last_chunk is True:
+ # log the final chunk with accurate streaming values
+ complete_streaming_response = litellm.stream_chunk_builder(
+ chunks=self.chunks, messages=self.messages
+ )
+ response = self.model_response_creator()
+ if complete_streaming_response is not None:
+ setattr(
+ response,
+ "usage",
+ getattr(complete_streaming_response, "usage"),
+ )
+ ## LOGGING
+ threading.Thread(
+ target=self.logging_obj.success_handler,
+ args=(response, None, None, cache_hit),
+ ).start() # log response
+ asyncio.create_task(
+ self.logging_obj.async_success_handler(
+ response, cache_hit=cache_hit
+ )
+ )
+ if self.sent_stream_usage is False and self.send_stream_usage is True:
+ self.sent_stream_usage = True
+ return response
+                raise StopAsyncIteration # Re-raise StopAsyncIteration
+ else:
+ self.sent_last_chunk = True
+ processed_chunk = self.finish_reason_handler()
+ ## LOGGING
+ threading.Thread(
+ target=self.logging_obj.success_handler,
+ args=(processed_chunk, None, None, cache_hit),
+ ).start() # log response
+ asyncio.create_task(
+ self.logging_obj.async_success_handler(
+ processed_chunk, cache_hit=cache_hit
+ )
+ )
+ return processed_chunk
+        except httpx.TimeoutException as e: # if an httpx read timeout error occurs
+ traceback_exception = traceback.format_exc()
+ ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT
+ traceback_exception += "\nLiteLLM Default Request Timeout - {}".format(
+ litellm.request_timeout
+ )
+ if self.logging_obj is not None:
+ ## LOGGING
+ threading.Thread(
+ target=self.logging_obj.failure_handler,
+ args=(e, traceback_exception),
+ ).start() # log response
+                # also run the async failure handler to log the streaming failure
+ asyncio.create_task(
+ self.logging_obj.async_failure_handler(e, traceback_exception)
+ )
+ raise e
+ except Exception as e:
+ traceback_exception = traceback.format_exc()
+ if self.logging_obj is not None:
+ ## LOGGING
+ threading.Thread(
+ target=self.logging_obj.failure_handler,
+ args=(e, traceback_exception),
+ ).start() # log response
+            # also run the async failure handler to log the streaming failure
+ asyncio.create_task(
+ self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore
+ )
+ ## Map to OpenAI Exception
+ raise exception_type(
+ model=self.model,
+ custom_llm_provider=self.custom_llm_provider,
+ original_exception=e,
+ completion_kwargs={},
+ extra_kwargs={},
+ )
+
+
+def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
+ """Assume most recent usage chunk has total usage uptil then."""
+ prompt_tokens: int = 0
+ completion_tokens: int = 0
+ for chunk in chunks:
+ if "usage" in chunk:
+ if "prompt_tokens" in chunk["usage"]:
+ prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0
+ if "completion_tokens" in chunk["usage"]:
+ completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0
+
+ returned_usage_chunk = Usage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=prompt_tokens + completion_tokens,
+ )
+
+ return returned_usage_chunk
+
+
+def generic_chunk_has_all_required_fields(chunk: dict) -> bool:
+ """
+    Checks whether the provided chunk dictionary can be treated as a GenericStreamingChunk,
+    i.e. every key it contains is a field defined on GenericStreamingChunk.
+
+    :param chunk: The dictionary to check.
+    :return: True if every key is a known GenericStreamingChunk field, False otherwise.
+ """
+ _all_fields = GChunk.__annotations__
+
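+    # e.g. {"text": "hi", "is_finished": False, "finish_reason": ""} -> True, while a dict
+    # with an unknown key such as {"foo": 1} -> False (illustrative values only)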
+ decision = all(key in _all_fields for key in chunk)
+ return decision
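+
+
+# Illustrative usage sketch (not part of the original change) - how the sync iterator
+# above is typically consumed; argument values below are placeholders:
+#
+#   wrapper = CustomStreamWrapper(
+#       completion_stream=raw_provider_stream,
+#       model="gpt-3.5-turbo",
+#       logging_obj=logging_obj,
+#       custom_llm_provider="openai",
+#   )
+#   for chunk in wrapper:  # __iter__/__next__ -> chunk_creator -> ModelResponse chunks
+#       print(chunk.choices[0].delta.content or "", end="")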
diff --git a/litellm/litellm_core_utils/streaming_utils.py b/litellm/litellm_core_utils/streaming_utils.py
deleted file mode 100644
index c41b4f64c4df..000000000000
--- a/litellm/litellm_core_utils/streaming_utils.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from litellm.types.utils import GenericStreamingChunk as GChunk
-
-
-def generic_chunk_has_all_required_fields(chunk: dict) -> bool:
- """
- Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk.
-
- :param chunk: The dictionary to check.
- :return: True if all required fields are present, False otherwise.
- """
- _all_fields = GChunk.__annotations__
-
- decision = all(key in _all_fields for key in chunk)
- return decision
diff --git a/litellm/llms/databricks/streaming_utils.py b/litellm/llms/databricks/streaming_utils.py
index a87ab39bba6a..502f4a091299 100644
--- a/litellm/llms/databricks/streaming_utils.py
+++ b/litellm/llms/databricks/streaming_utils.py
@@ -1,5 +1,5 @@
import json
-from typing import Optional
+from typing import List, Optional
import litellm
from litellm import verbose_logger
@@ -10,7 +10,7 @@
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
-from litellm.types.utils import GenericStreamingChunk
+from litellm.types.utils import GenericStreamingChunk, ModelResponse, Usage
class ModelResponseIterator:
diff --git a/litellm/proxy/management_endpoints/team_endpoints.py b/litellm/proxy/management_endpoints/team_endpoints.py
index 74289c90a350..8dcd0c7ebb79 100644
--- a/litellm/proxy/management_endpoints/team_endpoints.py
+++ b/litellm/proxy/management_endpoints/team_endpoints.py
@@ -1281,12 +1281,20 @@ async def list_team(
where={"team_id": team.team_id}
)
- returned_responses.append(
- TeamListResponseObject(
- **team.model_dump(),
- team_memberships=_team_memberships,
- keys=keys,
+ try:
+ returned_responses.append(
+ TeamListResponseObject(
+ **team.model_dump(),
+ team_memberships=_team_memberships,
+ keys=keys,
+ )
)
- )
+ except Exception as e:
+ team_exception = """Invalid team object for team_id: {}. team_object={}.
+ Error: {}
+ """.format(
+ team.team_id, team.model_dump(), str(e)
+ )
+ raise HTTPException(status_code=400, detail={"error": team_exception})
return returned_responses
diff --git a/litellm/utils.py b/litellm/utils.py
index efda579d672d..f2360884c093 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -35,6 +35,7 @@
import uuid
from dataclasses import dataclass, field
from functools import lru_cache, wraps
+from importlib import resources
from inspect import iscoroutine
from os.path import abspath, dirname, join
@@ -49,6 +50,7 @@
from openai.lib import _parsing, _pydantic
from openai.types.chat.completion_create_params import ResponseFormat
from pydantic import BaseModel
+from tiktoken import Encoding
from tokenizers import Tokenizer
import litellm
@@ -59,7 +61,11 @@
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
from litellm.integrations.custom_logger import CustomLogger
-from litellm.litellm_core_utils.core_helpers import map_finish_reason
+from litellm.litellm_core_utils.core_helpers import (
+ map_finish_reason,
+ process_response_headers,
+)
+from litellm.litellm_core_utils.default_encoding import encoding
from litellm.litellm_core_utils.exception_mapping_utils import (
_get_response_headers,
exception_type,
@@ -87,6 +93,8 @@
LiteLLMLoggingObject,
redact_message_input_output_from_logging,
)
+from litellm.litellm_core_utils.rules import Rules
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
from litellm.litellm_core_utils.token_counter import get_modified_max_tokens
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.secret_managers.main import get_secret
@@ -123,25 +131,6 @@
Usage,
)
-try:
- # New and recommended way to access resources
- from importlib import resources
-
- filename = str(resources.files(litellm).joinpath("llms/tokenizers"))
-except (ImportError, AttributeError):
- # Old way to access resources, which setuptools deprecated some time ago
- import pkg_resources # type: ignore
-
- filename = pkg_resources.resource_filename(__name__, "llms/tokenizers")
-
-os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
- "CUSTOM_TIKTOKEN_CACHE_DIR", filename
-) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
-from tiktoken import Encoding
-
-encoding = tiktoken.get_encoding("cl100k_base")
-from importlib import resources
-
with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
json_data = json.load(f)
# Convert to str (if necessary)
@@ -276,56 +265,6 @@ def print_verbose(
pass
-####### RULES ###################
-
-
-class Rules:
- """
- Fail calls based on the input or llm api output
-
- Example usage:
- import litellm
- def my_custom_rule(input): # receives the model response
- if "i don't think i can answer" in input: # trigger fallback if the model refuses to answer
- return False
- return True
-
- litellm.post_call_rules = [my_custom_rule] # have these be functions that can be called to fail a call
-
- response = litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user",
- "content": "Hey, how's it going?"}], fallbacks=["openrouter/mythomax"])
- """
-
- def __init__(self) -> None:
- pass
-
- def pre_call_rules(self, input: str, model: str):
- for rule in litellm.pre_call_rules:
- if callable(rule):
- decision = rule(input)
- if decision is False:
- raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore
- return True
-
- def post_call_rules(self, input: Optional[str], model: str) -> bool:
- if input is None:
- return True
- for rule in litellm.post_call_rules:
- if callable(rule):
- decision = rule(input)
- if isinstance(decision, bool):
- if decision is False:
- raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore
- elif isinstance(decision, dict):
- decision_val = decision.get("decision", True)
- decision_message = decision.get(
- "message", "LLM Response failed post-call-rule check"
- )
- if decision_val is False:
- raise litellm.APIResponseValidationError(message=decision_message, llm_provider="", model=model) # type: ignore
- return True
-
-
####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def custom_llm_setup():
@@ -5568,2042 +5507,2025 @@ def get_model_list():
# wraps the completion stream to return the correct format for the model
# replicate/anthropic/cohere
-def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
- """Assume most recent usage chunk has total usage uptil then."""
- prompt_tokens: int = 0
- completion_tokens: int = 0
- for chunk in chunks:
- if "usage" in chunk:
- if "prompt_tokens" in chunk["usage"]:
- prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0
- if "completion_tokens" in chunk["usage"]:
- completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0
-
- returned_usage_chunk = Usage(
- prompt_tokens=prompt_tokens,
- completion_tokens=completion_tokens,
- total_tokens=prompt_tokens + completion_tokens,
- )
-
- return returned_usage_chunk
-
-
-class CustomStreamWrapper:
- def __init__(
- self,
- completion_stream,
- model,
- logging_obj: Any,
- custom_llm_provider: Optional[str] = None,
- stream_options=None,
- make_call: Optional[Callable] = None,
- _response_headers: Optional[dict] = None,
- ):
- self.model = model
- self.make_call = make_call
- self.custom_llm_provider = custom_llm_provider
- self.logging_obj: LiteLLMLoggingObject = logging_obj
- self.completion_stream = completion_stream
- self.sent_first_chunk = False
- self.sent_last_chunk = False
- self.system_fingerprint: Optional[str] = None
- self.received_finish_reason: Optional[str] = None
- self.special_tokens = [
- "<|assistant|>",
- "<|system|>",
- "<|user|>",
- "",
- "",
- "<|im_end|>",
- "<|im_start|>",
- ]
- self.holding_chunk = ""
- self.complete_response = ""
- self.response_uptil_now = ""
- _model_info = (
- self.logging_obj.model_call_details.get("litellm_params", {}).get(
- "model_info", {}
- )
- or {}
- )
- self._hidden_params = {
- "model_id": (_model_info.get("id", None)),
- } # returned as x-litellm-model-id response header in proxy
-
- self._hidden_params["additional_headers"] = process_response_headers(
- _response_headers or {}
- ) # GUARANTEE OPENAI HEADERS IN RESPONSE
-
- self._response_headers = _response_headers
- self.response_id = None
- self.logging_loop = None
- self.rules = Rules()
- self.stream_options = stream_options or getattr(
- logging_obj, "stream_options", None
- )
- self.messages = getattr(logging_obj, "messages", None)
- self.sent_stream_usage = False
- self.send_stream_usage = (
- True if self.check_send_stream_usage(self.stream_options) else False
- )
- self.tool_call = False
- self.chunks: List = (
- []
- ) # keep track of the returned chunks - used for calculating the input/output tokens for stream options
- self.is_function_call = self.check_is_function_call(logging_obj=logging_obj)
-
- def __iter__(self):
- return self
-
- def __aiter__(self):
- return self
-
- def check_send_stream_usage(self, stream_options: Optional[dict]):
- return (
- stream_options is not None
- and stream_options.get("include_usage", False) is True
- )
-
- def check_is_function_call(self, logging_obj) -> bool:
- if hasattr(logging_obj, "optional_params") and isinstance(
- logging_obj.optional_params, dict
- ):
- if (
- "litellm_param_is_function_call" in logging_obj.optional_params
- and logging_obj.optional_params["litellm_param_is_function_call"]
- is True
- ):
- return True
-
- return False
-
- def process_chunk(self, chunk: str):
- """
- NLP Cloud streaming returns the entire response, for each chunk. Process this, to only return the delta.
- """
- try:
- chunk = chunk.strip()
- self.complete_response = self.complete_response.strip()
-
- if chunk.startswith(self.complete_response):
- # Remove last_sent_chunk only if it appears at the start of the new chunk
- chunk = chunk[len(self.complete_response) :]
-
- self.complete_response += chunk
- return chunk
- except Exception as e:
- raise e
-
- def safety_checker(self) -> None:
- """
- Fixes - https://github.com/BerriAI/litellm/issues/5158
-
- if the model enters a loop and starts repeating the same chunk again, break out of loop and raise an internalservererror - allows for retries.
-
- Raises - InternalServerError, if LLM enters infinite loop while streaming
- """
- if len(self.chunks) >= litellm.REPEATED_STREAMING_CHUNK_LIMIT:
- # Get the last n chunks
- last_chunks = self.chunks[-litellm.REPEATED_STREAMING_CHUNK_LIMIT :]
-
- # Extract the relevant content from the chunks
- last_contents = [chunk.choices[0].delta.content for chunk in last_chunks]
-
- # Check if all extracted contents are identical
- if all(content == last_contents[0] for content in last_contents):
- if (
- last_contents[0] is not None
- and isinstance(last_contents[0], str)
- and len(last_contents[0]) > 2
- ): # ignore empty content - https://github.com/BerriAI/litellm/issues/5158#issuecomment-2287156946
- # All last n chunks are identical
- raise litellm.InternalServerError(
- message="The model is repeating the same chunk = {}.".format(
- last_contents[0]
- ),
- model="",
- llm_provider="",
- )
-
- def check_special_tokens(self, chunk: str, finish_reason: Optional[str]):
- """
- Output parse / special tokens for sagemaker + hf streaming.
- """
- hold = False
- if (
- self.custom_llm_provider != "huggingface"
- and self.custom_llm_provider != "sagemaker"
- ):
- return hold, chunk
-
- if finish_reason:
- for token in self.special_tokens:
- if token in chunk:
- chunk = chunk.replace(token, "")
- return hold, chunk
-
- if self.sent_first_chunk is True:
- return hold, chunk
-
- curr_chunk = self.holding_chunk + chunk
- curr_chunk = curr_chunk.strip()
-
- for token in self.special_tokens:
- if len(curr_chunk) < len(token) and curr_chunk in token:
- hold = True
- self.holding_chunk = curr_chunk
- elif len(curr_chunk) >= len(token):
- if token in curr_chunk:
- self.holding_chunk = curr_chunk.replace(token, "")
- hold = True
- else:
- pass
-
- if hold is False: # reset
- self.holding_chunk = ""
- return hold, curr_chunk
-
- def handle_anthropic_text_chunk(self, chunk):
- """
- For old anthropic models - claude-1, claude-2.
-
- Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator()
- """
- str_line = chunk
- if isinstance(chunk, bytes): # Handle binary data
- str_line = chunk.decode("utf-8") # Convert bytes to string
- text = ""
- is_finished = False
- finish_reason = None
- if str_line.startswith("data:"):
- data_json = json.loads(str_line[5:])
- type_chunk = data_json.get("type", None)
- if type_chunk == "completion":
- text = data_json.get("completion")
- finish_reason = data_json.get("stop_reason")
- if finish_reason is not None:
- is_finished = True
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- elif "error" in str_line:
- raise ValueError(f"Unable to parse response. Original response: {str_line}")
- else:
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
-
- def handle_vertexai_anthropic_chunk(self, chunk):
- """
- - MessageStartEvent(message=Message(id='msg_01LeRRgvX4gwkX3ryBVgtuYZ', content=[], model='claude-3-sonnet-20240229', role='assistant', stop_reason=None, stop_sequence=None, type='message', usage=Usage(input_tokens=8, output_tokens=1)), type='message_start'); custom_llm_provider: vertex_ai
- - ContentBlockStartEvent(content_block=ContentBlock(text='', type='text'), index=0, type='content_block_start'); custom_llm_provider: vertex_ai
- - ContentBlockDeltaEvent(delta=TextDelta(text='Hello', type='text_delta'), index=0, type='content_block_delta'); custom_llm_provider: vertex_ai
- """
- text = ""
- prompt_tokens = None
- completion_tokens = None
- is_finished = False
- finish_reason = None
- type_chunk = getattr(chunk, "type", None)
- if type_chunk == "message_start":
- message = getattr(chunk, "message", None)
- text = "" # lets us return a chunk with usage to user
- _usage = getattr(message, "usage", None)
- if _usage is not None:
- prompt_tokens = getattr(_usage, "input_tokens", None)
- completion_tokens = getattr(_usage, "output_tokens", None)
- elif type_chunk == "content_block_delta":
- """
- Anthropic content chunk
- chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}}
- """
- delta = getattr(chunk, "delta", None)
- if delta is not None:
- text = getattr(delta, "text", "")
- else:
- text = ""
- elif type_chunk == "message_delta":
- """
- Anthropic
- chunk = {'type': 'message_delta', 'delta': {'stop_reason': 'max_tokens', 'stop_sequence': None}, 'usage': {'output_tokens': 10}}
- """
- # TODO - get usage from this chunk, set in response
- delta = getattr(chunk, "delta", None)
- if delta is not None:
- finish_reason = getattr(delta, "stop_reason", "stop")
- is_finished = True
- _usage = getattr(chunk, "usage", None)
- if _usage is not None:
- prompt_tokens = getattr(_usage, "input_tokens", None)
- completion_tokens = getattr(_usage, "output_tokens", None)
-
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- "prompt_tokens": prompt_tokens,
- "completion_tokens": completion_tokens,
- }
-
- def handle_predibase_chunk(self, chunk):
- try:
- if not isinstance(chunk, str):
- chunk = chunk.decode(
- "utf-8"
- ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
- text = ""
- is_finished = False
- finish_reason = ""
- print_verbose(f"chunk: {chunk}")
- if chunk.startswith("data:"):
- data_json = json.loads(chunk[5:])
- print_verbose(f"data json: {data_json}")
- if "token" in data_json and "text" in data_json["token"]:
- text = data_json["token"]["text"]
- if data_json.get("details", False) and data_json["details"].get(
- "finish_reason", False
- ):
- is_finished = True
- finish_reason = data_json["details"]["finish_reason"]
- elif data_json.get(
- "generated_text", False
- ): # if full generated text exists, then stream is complete
- text = "" # don't return the final bos token
- is_finished = True
- finish_reason = "stop"
- elif data_json.get("error", False):
- raise Exception(data_json.get("error"))
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- elif "error" in chunk:
- raise ValueError(chunk)
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception as e:
- raise e
-
- def handle_huggingface_chunk(self, chunk):
- try:
- if not isinstance(chunk, str):
- chunk = chunk.decode(
- "utf-8"
- ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
- text = ""
- is_finished = False
- finish_reason = ""
- print_verbose(f"chunk: {chunk}")
- if chunk.startswith("data:"):
- data_json = json.loads(chunk[5:])
- print_verbose(f"data json: {data_json}")
- if "token" in data_json and "text" in data_json["token"]:
- text = data_json["token"]["text"]
- if data_json.get("details", False) and data_json["details"].get(
- "finish_reason", False
- ):
- is_finished = True
- finish_reason = data_json["details"]["finish_reason"]
- elif data_json.get(
- "generated_text", False
- ): # if full generated text exists, then stream is complete
- text = "" # don't return the final bos token
- is_finished = True
- finish_reason = "stop"
- elif data_json.get("error", False):
- raise Exception(data_json.get("error"))
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- elif "error" in chunk:
- raise ValueError(chunk)
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception as e:
- raise e
-
- def handle_ai21_chunk(self, chunk): # fake streaming
- chunk = chunk.decode("utf-8")
- data_json = json.loads(chunk)
- try:
- text = data_json["completions"][0]["data"]["text"]
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_maritalk_chunk(self, chunk): # fake streaming
- chunk = chunk.decode("utf-8")
- data_json = json.loads(chunk)
- try:
- text = data_json["answer"]
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_nlp_cloud_chunk(self, chunk):
- text = ""
- is_finished = False
- finish_reason = ""
- try:
- if "dolphin" in self.model:
- chunk = self.process_chunk(chunk=chunk)
- else:
- data_json = json.loads(chunk)
- chunk = data_json["generated_text"]
- text = chunk
- if "[DONE]" in text:
- text = text.replace("[DONE]", "")
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_aleph_alpha_chunk(self, chunk):
- chunk = chunk.decode("utf-8")
- data_json = json.loads(chunk)
- try:
- text = data_json["completions"][0]["completion"]
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_cohere_chunk(self, chunk):
- chunk = chunk.decode("utf-8")
- data_json = json.loads(chunk)
- try:
- text = ""
- is_finished = False
- finish_reason = ""
- index: Optional[int] = None
- if "index" in data_json:
- index = data_json.get("index")
- if "text" in data_json:
- text = data_json["text"]
- elif "is_finished" in data_json:
- is_finished = data_json["is_finished"]
- finish_reason = data_json["finish_reason"]
- else:
- raise Exception(data_json)
- return {
- "index": index,
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_cohere_chat_chunk(self, chunk):
- chunk = chunk.decode("utf-8")
- data_json = json.loads(chunk)
- print_verbose(f"chunk: {chunk}")
- try:
- text = ""
- is_finished = False
- finish_reason = ""
- if "text" in data_json:
- text = data_json["text"]
- elif "is_finished" in data_json and data_json["is_finished"] is True:
- is_finished = data_json["is_finished"]
- finish_reason = data_json["finish_reason"]
- else:
- return
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_azure_chunk(self, chunk):
- is_finished = False
- finish_reason = ""
- text = ""
- print_verbose(f"chunk: {chunk}")
- if "data: [DONE]" in chunk:
- text = ""
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- elif chunk.startswith("data:"):
- data_json = json.loads(chunk[5:]) # chunk.startswith("data:"):
- try:
- if len(data_json["choices"]) > 0:
- delta = data_json["choices"][0]["delta"]
- text = "" if delta is None else delta.get("content", "")
- if data_json["choices"][0].get("finish_reason", None):
- is_finished = True
- finish_reason = data_json["choices"][0]["finish_reason"]
- print_verbose(
- f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}"
- )
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(
- f"Unable to parse response. Original response: {chunk}"
- )
- elif "error" in chunk:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
- else:
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
-
- def handle_replicate_chunk(self, chunk):
- try:
- text = ""
- is_finished = False
- finish_reason = ""
- if "output" in chunk:
- text = chunk["output"]
- if "status" in chunk:
- if chunk["status"] == "succeeded":
- is_finished = True
- finish_reason = "stop"
- elif chunk.get("error", None):
- raise Exception(chunk["error"])
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- except Exception:
- raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
- def handle_openai_chat_completion_chunk(self, chunk):
- try:
- print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
- str_line = chunk
- text = ""
- is_finished = False
- finish_reason = None
- logprobs = None
- usage = None
- if str_line and str_line.choices and len(str_line.choices) > 0:
- if (
- str_line.choices[0].delta is not None
- and str_line.choices[0].delta.content is not None
- ):
- text = str_line.choices[0].delta.content
- else: # function/tool calling chunk - when content is None. in this case we just return the original chunk from openai
- pass
- if str_line.choices[0].finish_reason:
- is_finished = True
- finish_reason = str_line.choices[0].finish_reason
-
- # checking for logprobs
- if (
- hasattr(str_line.choices[0], "logprobs")
- and str_line.choices[0].logprobs is not None
- ):
- logprobs = str_line.choices[0].logprobs
- else:
- logprobs = None
-
- usage = getattr(str_line, "usage", None)
-
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- "logprobs": logprobs,
- "original_chunk": str_line,
- "usage": usage,
- }
- except Exception as e:
- raise e
-
- def handle_azure_text_completion_chunk(self, chunk):
- try:
- print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
- text = ""
- is_finished = False
- finish_reason = None
- choices = getattr(chunk, "choices", [])
- if len(choices) > 0:
- text = choices[0].text
- if choices[0].finish_reason is not None:
- is_finished = True
- finish_reason = choices[0].finish_reason
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
-
- except Exception as e:
- raise e
-
- def handle_openai_text_completion_chunk(self, chunk):
- try:
- print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
- text = ""
- is_finished = False
- finish_reason = None
- usage = None
- choices = getattr(chunk, "choices", [])
- if len(choices) > 0:
- text = choices[0].text
- if choices[0].finish_reason is not None:
- is_finished = True
- finish_reason = choices[0].finish_reason
- usage = getattr(chunk, "usage", None)
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- "usage": usage,
- }
-
- except Exception as e:
- raise e
-
- def handle_baseten_chunk(self, chunk):
- try:
- chunk = chunk.decode("utf-8")
- if len(chunk) > 0:
- if chunk.startswith("data:"):
- data_json = json.loads(chunk[5:])
- if "token" in data_json and "text" in data_json["token"]:
- return data_json["token"]["text"]
- else:
- return ""
- data_json = json.loads(chunk)
- if "model_output" in data_json:
- if (
- isinstance(data_json["model_output"], dict)
- and "data" in data_json["model_output"]
- and isinstance(data_json["model_output"]["data"], list)
- ):
- return data_json["model_output"]["data"][0]
- elif isinstance(data_json["model_output"], str):
- return data_json["model_output"]
- elif "completion" in data_json and isinstance(
- data_json["completion"], str
- ):
- return data_json["completion"]
- else:
- raise ValueError(
- f"Unable to parse response. Original response: {chunk}"
- )
- else:
- return ""
- else:
- return ""
- except Exception as e:
- verbose_logger.exception(
- "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occured - {}".format(
- str(e)
- )
- )
- return ""
-
- def handle_cloudlfare_stream(self, chunk):
- try:
- print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
- chunk = chunk.decode("utf-8")
- str_line = chunk
- text = ""
- is_finished = False
- finish_reason = None
-
- if "[DONE]" in chunk:
- return {"text": text, "is_finished": True, "finish_reason": "stop"}
- elif str_line.startswith("data:"):
- data_json = json.loads(str_line[5:])
- print_verbose(f"delta content: {data_json}")
- text = data_json["response"]
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- else:
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
-
- except Exception as e:
- raise e
-
- def handle_ollama_stream(self, chunk):
- try:
- if isinstance(chunk, dict):
- json_chunk = chunk
- else:
- json_chunk = json.loads(chunk)
- if "error" in json_chunk:
- raise Exception(f"Ollama Error - {json_chunk}")
-
- text = ""
- is_finished = False
- finish_reason = None
- if json_chunk["done"] is True:
- text = ""
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- elif json_chunk["response"]:
- print_verbose(f"delta content: {json_chunk}")
- text = json_chunk["response"]
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- else:
- raise Exception(f"Ollama Error - {json_chunk}")
- except Exception as e:
- raise e
-
- def handle_ollama_chat_stream(self, chunk):
- # for ollama_chat/ provider
- try:
- if isinstance(chunk, dict):
- json_chunk = chunk
- else:
- json_chunk = json.loads(chunk)
- if "error" in json_chunk:
- raise Exception(f"Ollama Error - {json_chunk}")
-
- text = ""
- is_finished = False
- finish_reason = None
- if json_chunk["done"] is True:
- text = ""
- is_finished = True
- finish_reason = "stop"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- elif "message" in json_chunk:
- print_verbose(f"delta content: {json_chunk}")
- text = json_chunk["message"]["content"]
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- }
- else:
- raise Exception(f"Ollama Error - {json_chunk}")
- except Exception as e:
- raise e
-
- def handle_watsonx_stream(self, chunk):
- try:
- if isinstance(chunk, dict):
- parsed_response = chunk
- elif isinstance(chunk, (str, bytes)):
- if isinstance(chunk, bytes):
- chunk = chunk.decode("utf-8")
- if "generated_text" in chunk:
- response = chunk.replace("data: ", "").strip()
- parsed_response = json.loads(response)
- else:
- return {
- "text": "",
- "is_finished": False,
- "prompt_tokens": 0,
- "completion_tokens": 0,
- }
- else:
- print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
- raise ValueError(
- f"Unable to parse response. Original response: {chunk}"
- )
- results = parsed_response.get("results", [])
- if len(results) > 0:
- text = results[0].get("generated_text", "")
- finish_reason = results[0].get("stop_reason")
- is_finished = finish_reason != "not_finished"
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- "prompt_tokens": results[0].get("input_token_count", 0),
- "completion_tokens": results[0].get("generated_token_count", 0),
- }
- return {"text": "", "is_finished": False}
- except Exception as e:
- raise e
-
- def handle_triton_stream(self, chunk):
- try:
- if isinstance(chunk, dict):
- parsed_response = chunk
- elif isinstance(chunk, (str, bytes)):
- if isinstance(chunk, bytes):
- chunk = chunk.decode("utf-8")
- if "text_output" in chunk:
- response = chunk.replace("data: ", "").strip()
- parsed_response = json.loads(response)
- else:
- return {
- "text": "",
- "is_finished": False,
- "prompt_tokens": 0,
- "completion_tokens": 0,
- }
- else:
- print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
- raise ValueError(
- f"Unable to parse response. Original response: {chunk}"
- )
- text = parsed_response.get("text_output", "")
- finish_reason = parsed_response.get("stop_reason")
- is_finished = parsed_response.get("is_finished", False)
- return {
- "text": text,
- "is_finished": is_finished,
- "finish_reason": finish_reason,
- "prompt_tokens": parsed_response.get("input_token_count", 0),
- "completion_tokens": parsed_response.get("generated_token_count", 0),
- }
- return {"text": "", "is_finished": False}
- except Exception as e:
- raise e
-
- def handle_clarifai_completion_chunk(self, chunk):
- try:
- if isinstance(chunk, dict):
- parsed_response = chunk
- elif isinstance(chunk, (str, bytes)):
- if isinstance(chunk, bytes):
- parsed_response = chunk.decode("utf-8")
- else:
- parsed_response = chunk
- else:
- raise ValueError("Unable to parse streaming chunk")
- if isinstance(parsed_response, dict):
- data_json = parsed_response
- else:
- data_json = json.loads(parsed_response)
- text = (
- data_json.get("outputs", "")[0]
- .get("data", "")
- .get("text", "")
- .get("raw", "")
- )
- len(
- encoding.encode(
- data_json.get("outputs", "")[0]
- .get("input", "")
- .get("data", "")
- .get("text", "")
- .get("raw", "")
- )
- )
- len(encoding.encode(text))
- return {
- "text": text,
- "is_finished": True,
- }
- except Exception as e:
- verbose_logger.exception(
- "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occured - {}".format(
- str(e)
- )
- )
- return ""
-
- def model_response_creator(
- self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None
- ):
- _model = self.model
- _received_llm_provider = self.custom_llm_provider
- _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore
- if (
- _received_llm_provider == "openai"
- and _received_llm_provider != _logging_obj_llm_provider
- ):
- _model = "{}/{}".format(_logging_obj_llm_provider, _model)
- if chunk is None:
- chunk = {}
- else:
- # pop model keyword
- chunk.pop("model", None)
-
- model_response = ModelResponse(
- stream=True, model=_model, stream_options=self.stream_options, **chunk
- )
- if self.response_id is not None:
- model_response.id = self.response_id
- else:
- self.response_id = model_response.id # type: ignore
- if self.system_fingerprint is not None:
- model_response.system_fingerprint = self.system_fingerprint
- if hidden_params is not None:
- model_response._hidden_params = hidden_params
- model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
- model_response._hidden_params["created_at"] = time.time()
- model_response._hidden_params = {
- **model_response._hidden_params,
- **self._hidden_params,
- }
-
- if (
- len(model_response.choices) > 0
- and getattr(model_response.choices[0], "delta") is not None
- ):
- # do nothing, if object instantiated
- pass
- else:
- model_response.choices = [StreamingChoices(finish_reason=None)]
- return model_response
-
- def is_delta_empty(self, delta: Delta) -> bool:
- is_empty = True
- if delta.content is not None:
- is_empty = False
- elif delta.tool_calls is not None:
- is_empty = False
- elif delta.function_call is not None:
- is_empty = False
- return is_empty
-
- def return_processed_chunk_logic( # noqa
- self,
- completion_obj: dict,
- model_response: ModelResponseStream,
- response_obj: dict,
- ):
-
- print_verbose(
- f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
- )
- if (
- "content" in completion_obj
- and (
- isinstance(completion_obj["content"], str)
- and len(completion_obj["content"]) > 0
- )
- or (
- "tool_calls" in completion_obj
- and completion_obj["tool_calls"] is not None
- and len(completion_obj["tool_calls"]) > 0
- )
- or (
- "function_call" in completion_obj
- and completion_obj["function_call"] is not None
- )
- ): # cannot set content of an OpenAI Object to be an empty string
- self.safety_checker()
- hold, model_response_str = self.check_special_tokens(
- chunk=completion_obj["content"],
- finish_reason=model_response.choices[0].finish_reason,
- ) # filter out bos/eos tokens from openai-compatible hf endpoints
- print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
- if hold is False:
- ## check if openai/azure chunk
- original_chunk = response_obj.get("original_chunk", None)
- if original_chunk:
- model_response.id = original_chunk.id
- self.response_id = original_chunk.id
- if len(original_chunk.choices) > 0:
- choices = []
- for choice in original_chunk.choices:
- try:
- if isinstance(choice, BaseModel):
- choice_json = choice.model_dump()
- choice_json.pop(
- "finish_reason", None
- ) # for mistral etc. which return a value in their last chunk (not-openai compatible).
- print_verbose(f"choice_json: {choice_json}")
- choices.append(StreamingChoices(**choice_json))
- except Exception:
- choices.append(StreamingChoices())
- print_verbose(f"choices in streaming: {choices}")
- setattr(model_response, "choices", choices)
- else:
- return
- model_response.system_fingerprint = (
- original_chunk.system_fingerprint
- )
- setattr(
- model_response,
- "citations",
- getattr(original_chunk, "citations", None),
- )
- print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
- if self.sent_first_chunk is False:
- model_response.choices[0].delta["role"] = "assistant"
- self.sent_first_chunk = True
- elif self.sent_first_chunk is True and hasattr(
- model_response.choices[0].delta, "role"
- ):
- _initial_delta = model_response.choices[0].delta.model_dump()
- _initial_delta.pop("role", None)
- model_response.choices[0].delta = Delta(**_initial_delta)
- print_verbose(
- f"model_response.choices[0].delta: {model_response.choices[0].delta}"
- )
- else:
- ## else
- completion_obj["content"] = model_response_str
- if self.sent_first_chunk is False:
- completion_obj["role"] = "assistant"
- self.sent_first_chunk = True
-
- model_response.choices[0].delta = Delta(**completion_obj)
- _index: Optional[int] = completion_obj.get("index")
- if _index is not None:
- model_response.choices[0].index = _index
- print_verbose(f"returning model_response: {model_response}")
- return model_response
- else:
- return
- elif self.received_finish_reason is not None:
- if self.sent_last_chunk is True:
- # Bedrock returns the guardrail trace in the last chunk - we want to return this here
- if self.custom_llm_provider == "bedrock" and "trace" in model_response:
- return model_response
-
- # Default - return StopIteration
- raise StopIteration
- # flush any remaining holding chunk
- if len(self.holding_chunk) > 0:
- if model_response.choices[0].delta.content is None:
- model_response.choices[0].delta.content = self.holding_chunk
- else:
- model_response.choices[0].delta.content = (
- self.holding_chunk + model_response.choices[0].delta.content
- )
- self.holding_chunk = ""
- # if delta is None
- _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta)
-
- if _is_delta_empty:
- # get any function call arguments
- model_response.choices[0].finish_reason = map_finish_reason(
- finish_reason=self.received_finish_reason
- ) # ensure consistent output to openai
-
- self.sent_last_chunk = True
-
- return model_response
- elif (
- model_response.choices[0].delta.tool_calls is not None
- or model_response.choices[0].delta.function_call is not None
- ):
- if self.sent_first_chunk is False:
- model_response.choices[0].delta["role"] = "assistant"
- self.sent_first_chunk = True
- return model_response
- elif (
- len(model_response.choices) > 0
- and hasattr(model_response.choices[0].delta, "audio")
- and model_response.choices[0].delta.audio is not None
- ):
- return model_response
- else:
- if hasattr(model_response, "usage"):
- self.chunks.append(model_response)
- return
-
- def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
- model_response = self.model_response_creator()
- response_obj: dict = {}
- try:
- # return this for all models
- completion_obj = {"content": ""}
- from litellm.litellm_core_utils.streaming_utils import (
- generic_chunk_has_all_required_fields,
- )
- from litellm.types.utils import GenericStreamingChunk as GChunk
-
- if (
- isinstance(chunk, dict)
- and generic_chunk_has_all_required_fields(
- chunk=chunk
- ) # check if chunk is a generic streaming chunk
- ) or (
- self.custom_llm_provider
- and (
- self.custom_llm_provider == "anthropic"
- or self.custom_llm_provider in litellm._custom_providers
- )
- ):
-
- if self.received_finish_reason is not None:
- if "provider_specific_fields" not in chunk:
- raise StopIteration
- anthropic_response_obj: GChunk = chunk
- completion_obj["content"] = anthropic_response_obj["text"]
- if anthropic_response_obj["is_finished"]:
- self.received_finish_reason = anthropic_response_obj[
- "finish_reason"
- ]
-
- if anthropic_response_obj["usage"] is not None:
- model_response.usage = litellm.Usage(
- **anthropic_response_obj["usage"]
- )
-
- if (
- "tool_use" in anthropic_response_obj
- and anthropic_response_obj["tool_use"] is not None
- ):
- completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]]
-
- if (
- "provider_specific_fields" in anthropic_response_obj
- and anthropic_response_obj["provider_specific_fields"] is not None
- ):
- for key, value in anthropic_response_obj[
- "provider_specific_fields"
- ].items():
- setattr(model_response, key, value)
-
- response_obj = anthropic_response_obj
- elif (
- self.custom_llm_provider
- and self.custom_llm_provider == "anthropic_text"
- ):
- response_obj = self.handle_anthropic_text_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider and self.custom_llm_provider == "clarifai":
- response_obj = self.handle_clarifai_completion_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.model == "replicate" or self.custom_llm_provider == "replicate":
- response_obj = self.handle_replicate_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
- response_obj = self.handle_huggingface_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider and self.custom_llm_provider == "predibase":
- response_obj = self.handle_predibase_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif (
- self.custom_llm_provider and self.custom_llm_provider == "baseten"
- ): # baseten doesn't provide streaming
- completion_obj["content"] = self.handle_baseten_chunk(chunk)
- elif (
- self.custom_llm_provider and self.custom_llm_provider == "ai21"
- ): # ai21 doesn't provide streaming
- response_obj = self.handle_ai21_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider and self.custom_llm_provider == "maritalk":
- response_obj = self.handle_maritalk_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
- completion_obj["content"] = chunk[0].outputs[0].text
- elif (
- self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha"
- ): # aleph alpha doesn't provide streaming
- response_obj = self.handle_aleph_alpha_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "nlp_cloud":
- try:
- response_obj = self.handle_nlp_cloud_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- except Exception as e:
- if self.received_finish_reason:
- raise e
- else:
- if self.sent_first_chunk is False:
- raise Exception("An unknown error occurred with the stream")
- self.received_finish_reason = "stop"
- elif self.custom_llm_provider == "vertex_ai":
- import proto # type: ignore
-
- if self.model.startswith("claude-3"):
- response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk)
- if response_obj is None:
- return
- completion_obj["content"] = response_obj["text"]
- setattr(model_response, "usage", Usage())
- if response_obj.get("prompt_tokens", None) is not None:
- model_response.usage.prompt_tokens = response_obj[
- "prompt_tokens"
- ]
- if response_obj.get("completion_tokens", None) is not None:
- model_response.usage.completion_tokens = response_obj[
- "completion_tokens"
- ]
- if hasattr(model_response.usage, "prompt_tokens"):
- model_response.usage.total_tokens = (
- getattr(model_response.usage, "total_tokens", 0)
- + model_response.usage.prompt_tokens
- )
- if hasattr(model_response.usage, "completion_tokens"):
- model_response.usage.total_tokens = (
- getattr(model_response.usage, "total_tokens", 0)
- + model_response.usage.completion_tokens
- )
-
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif hasattr(chunk, "candidates") is True:
- try:
- try:
- completion_obj["content"] = chunk.text
- except Exception as e:
- if "Part has no text." in str(e):
- ## check for function calling
- function_call = (
- chunk.candidates[0].content.parts[0].function_call
- )
-
- args_dict = {}
-
- # Check if it's a RepeatedComposite instance
- for key, val in function_call.args.items():
- if isinstance(
- val,
- proto.marshal.collections.repeated.RepeatedComposite,
- ):
- # If so, convert to list
- args_dict[key] = [v for v in val]
- else:
- args_dict[key] = val
-
- try:
- args_str = json.dumps(args_dict)
- except Exception as e:
- raise e
- _delta_obj = litellm.utils.Delta(
- content=None,
- tool_calls=[
- {
- "id": f"call_{str(uuid.uuid4())}",
- "function": {
- "arguments": args_str,
- "name": function_call.name,
- },
- "type": "function",
- }
- ],
- )
- _streaming_response = StreamingChoices(delta=_delta_obj)
- _model_response = ModelResponse(stream=True)
- _model_response.choices = [_streaming_response]
- response_obj = {"original_chunk": _model_response}
- else:
- raise e
- if (
- hasattr(chunk.candidates[0], "finish_reason")
- and chunk.candidates[0].finish_reason.name
- != "FINISH_REASON_UNSPECIFIED"
-                        ): # non-final vertex ai chunks report finish_reason FINISH_REASON_UNSPECIFIED
- self.received_finish_reason = chunk.candidates[
- 0
- ].finish_reason.name
- except Exception:
- if chunk.candidates[0].finish_reason.name == "SAFETY":
- raise Exception(
- f"The response was blocked by VertexAI. {str(chunk)}"
- )
- else:
- completion_obj["content"] = str(chunk)
- elif self.custom_llm_provider == "cohere":
- response_obj = self.handle_cohere_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "cohere_chat":
- response_obj = self.handle_cohere_chat_chunk(chunk)
- if response_obj is None:
- return
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
-
- elif self.custom_llm_provider == "petals":
- if len(self.completion_stream) == 0:
- if self.received_finish_reason is not None:
- raise StopIteration
- else:
- self.received_finish_reason = "stop"
- chunk_size = 30
- new_chunk = self.completion_stream[:chunk_size]
- completion_obj["content"] = new_chunk
- self.completion_stream = self.completion_stream[chunk_size:]
- elif self.custom_llm_provider == "palm":
- # fake streaming
- response_obj = {}
- if len(self.completion_stream) == 0:
- if self.received_finish_reason is not None:
- raise StopIteration
- else:
- self.received_finish_reason = "stop"
- chunk_size = 30
- new_chunk = self.completion_stream[:chunk_size]
- completion_obj["content"] = new_chunk
- self.completion_stream = self.completion_stream[chunk_size:]
- elif self.custom_llm_provider == "ollama":
- response_obj = self.handle_ollama_stream(chunk)
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "ollama_chat":
- response_obj = self.handle_ollama_chat_stream(chunk)
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "cloudflare":
- response_obj = self.handle_cloudlfare_stream(chunk)
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "watsonx":
- response_obj = self.handle_watsonx_stream(chunk)
- completion_obj["content"] = response_obj["text"]
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "triton":
- response_obj = self.handle_triton_stream(chunk)
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "text-completion-openai":
- response_obj = self.handle_openai_text_completion_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- if response_obj["usage"] is not None:
- model_response.usage = litellm.Usage(
- prompt_tokens=response_obj["usage"].prompt_tokens,
- completion_tokens=response_obj["usage"].completion_tokens,
- total_tokens=response_obj["usage"].total_tokens,
- )
- elif self.custom_llm_provider == "text-completion-codestral":
- response_obj = litellm.MistralTextCompletionConfig()._chunk_parser(
- chunk
- )
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
-                    if response_obj.get("usage") is not None:
- model_response.usage = litellm.Usage(
- prompt_tokens=response_obj["usage"].prompt_tokens,
- completion_tokens=response_obj["usage"].completion_tokens,
- total_tokens=response_obj["usage"].total_tokens,
- )
- elif self.custom_llm_provider == "azure_text":
- response_obj = self.handle_azure_text_completion_chunk(chunk)
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- elif self.custom_llm_provider == "cached_response":
- response_obj = {
- "text": chunk.choices[0].delta.content,
- "is_finished": True,
- "finish_reason": chunk.choices[0].finish_reason,
- "original_chunk": chunk,
- "tool_calls": (
- chunk.choices[0].delta.tool_calls
- if hasattr(chunk.choices[0].delta, "tool_calls")
- else None
- ),
- }
-
- completion_obj["content"] = response_obj["text"]
- if response_obj["tool_calls"] is not None:
- completion_obj["tool_calls"] = response_obj["tool_calls"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if hasattr(chunk, "id"):
- model_response.id = chunk.id
- self.response_id = chunk.id
- if hasattr(chunk, "system_fingerprint"):
- self.system_fingerprint = chunk.system_fingerprint
- if response_obj["is_finished"]:
- self.received_finish_reason = response_obj["finish_reason"]
- else: # openai / azure chat model
- if self.custom_llm_provider == "azure":
- if hasattr(chunk, "model"):
-                        # for azure, we need to pass the model from the original chunk
- self.model = chunk.model
- response_obj = self.handle_openai_chat_completion_chunk(chunk)
- if response_obj is None:
- return
- completion_obj["content"] = response_obj["text"]
- print_verbose(f"completion obj content: {completion_obj['content']}")
- if response_obj["is_finished"]:
- if response_obj["finish_reason"] == "error":
- raise Exception(
- "{} raised a streaming error - finish_reason: error, no content string given. Received Chunk={}".format(
- self.custom_llm_provider, response_obj
- )
- )
- self.received_finish_reason = response_obj["finish_reason"]
- if response_obj.get("original_chunk", None) is not None:
- if hasattr(response_obj["original_chunk"], "id"):
- model_response.id = response_obj["original_chunk"].id
- self.response_id = model_response.id
- if hasattr(response_obj["original_chunk"], "system_fingerprint"):
- model_response.system_fingerprint = response_obj[
- "original_chunk"
- ].system_fingerprint
- self.system_fingerprint = response_obj[
- "original_chunk"
- ].system_fingerprint
- if response_obj["logprobs"] is not None:
- model_response.choices[0].logprobs = response_obj["logprobs"]
-
- if response_obj["usage"] is not None:
- if isinstance(response_obj["usage"], dict):
- model_response.usage = litellm.Usage(
- prompt_tokens=response_obj["usage"].get(
- "prompt_tokens", None
- )
- or None,
- completion_tokens=response_obj["usage"].get(
- "completion_tokens", None
- )
- or None,
- total_tokens=response_obj["usage"].get("total_tokens", None)
- or None,
- )
- elif isinstance(response_obj["usage"], BaseModel):
- model_response.usage = litellm.Usage(
- **response_obj["usage"].model_dump()
- )
-
- model_response.model = self.model
- print_verbose(
- f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
- )
- ## FUNCTION CALL PARSING
- if (
- response_obj is not None
- and response_obj.get("original_chunk", None) is not None
- ): # function / tool calling branch - only set for openai/azure compatible endpoints
- # enter this branch when no content has been passed in response
- original_chunk = response_obj.get("original_chunk", None)
- model_response.id = original_chunk.id
- self.response_id = original_chunk.id
- if original_chunk.choices and len(original_chunk.choices) > 0:
- delta = original_chunk.choices[0].delta
- if delta is not None and (
- delta.function_call is not None or delta.tool_calls is not None
- ):
- try:
- model_response.system_fingerprint = (
- original_chunk.system_fingerprint
- )
- ## AZURE - check if arguments is not None
- if (
- original_chunk.choices[0].delta.function_call
- is not None
- ):
- if (
- getattr(
- original_chunk.choices[0].delta.function_call,
- "arguments",
- )
- is None
- ):
- original_chunk.choices[
- 0
- ].delta.function_call.arguments = ""
- elif original_chunk.choices[0].delta.tool_calls is not None:
- if isinstance(
- original_chunk.choices[0].delta.tool_calls, list
- ):
- for t in original_chunk.choices[0].delta.tool_calls:
-                                    if hasattr(t, "function") and hasattr(
-                                        t.function, "arguments"
-                                    ):
- if (
- getattr(
- t.function,
- "arguments",
- )
- is None
- ):
- t.function.arguments = ""
- _json_delta = delta.model_dump()
- print_verbose(f"_json_delta: {_json_delta}")
- if "role" not in _json_delta or _json_delta["role"] is None:
- _json_delta["role"] = (
- "assistant" # mistral's api returns role as None
- )
- if "tool_calls" in _json_delta and isinstance(
- _json_delta["tool_calls"], list
- ):
- for tool in _json_delta["tool_calls"]:
- if (
- isinstance(tool, dict)
- and "function" in tool
- and isinstance(tool["function"], dict)
- and ("type" not in tool or tool["type"] is None)
- ):
- # if function returned but type set to None - mistral's api returns type: None
- tool["type"] = "function"
- model_response.choices[0].delta = Delta(**_json_delta)
- except Exception as e:
- verbose_logger.exception(
-                                "litellm.CustomStreamWrapper.chunk_creator(): Exception occurred - {}".format(
- str(e)
- )
- )
- model_response.choices[0].delta = Delta()
- elif (
- delta is not None and getattr(delta, "audio", None) is not None
- ):
- model_response.choices[0].delta.audio = delta.audio
- else:
- try:
- delta = (
- dict()
- if original_chunk.choices[0].delta is None
- else dict(original_chunk.choices[0].delta)
- )
- print_verbose(f"original delta: {delta}")
- model_response.choices[0].delta = Delta(**delta)
- print_verbose(
- f"new delta: {model_response.choices[0].delta}"
- )
- except Exception:
- model_response.choices[0].delta = Delta()
- else:
- if (
- self.stream_options is not None
- and self.stream_options["include_usage"] is True
- ):
- return model_response
- return
- print_verbose(
- f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"
- )
- print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-
- ## CHECK FOR TOOL USE
- if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0:
- if self.is_function_call is True: # user passed in 'functions' param
- completion_obj["function_call"] = completion_obj["tool_calls"][0][
- "function"
- ]
- completion_obj["tool_calls"] = None
-
- self.tool_call = True
-
- ## RETURN ARG
- return self.return_processed_chunk_logic(
- completion_obj=completion_obj,
- model_response=model_response, # type: ignore
- response_obj=response_obj,
- )
-
- except StopIteration:
- raise StopIteration
- except Exception as e:
- traceback.format_exc()
- e.message = str(e)
- raise exception_type(
- model=self.model,
- custom_llm_provider=self.custom_llm_provider,
- original_exception=e,
- )
-
- def set_logging_event_loop(self, loop):
- """
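-        Sets the event loop used to run async success callbacks while iterating the stream synchronously.
-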
- import litellm, asyncio
-
- loop = asyncio.get_event_loop() # 👈 gets the current event loop
-
- response = litellm.completion(.., stream=True)
-
- response.set_logging_event_loop(loop=loop) # 👈 enables async_success callbacks for sync logging
-
- for chunk in response:
- ...
- """
- self.logging_loop = loop
-
- def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
- """
- Runs success logging in a thread and adds the response to the cache
- """
- if litellm.disable_streaming_logging is True:
- """
- [NOT RECOMMENDED]
- Set this via `litellm.disable_streaming_logging = True`.
-
- Disables streaming logging.
- """
- return
- ## ASYNC LOGGING
- # Create an event loop for the new thread
- if self.logging_loop is not None:
- future = asyncio.run_coroutine_threadsafe(
- self.logging_obj.async_success_handler(
- processed_chunk, None, None, cache_hit
- ),
- loop=self.logging_loop,
- )
- future.result()
- else:
- asyncio.run(
- self.logging_obj.async_success_handler(
- processed_chunk, None, None, cache_hit
- )
- )
- ## SYNC LOGGING
- self.logging_obj.success_handler(processed_chunk, None, None, cache_hit)
-
- ## Sync store in cache
- if self.logging_obj._llm_caching_handler is not None:
- self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache(
- processed_chunk
- )
-
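-    # Builds the terminal chunk: maps the received finish_reason to an OpenAI-compatible value (defaulting to "stop", or "tool_calls" when a tool call was streamed).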
- def finish_reason_handler(self):
- model_response = self.model_response_creator()
- if self.received_finish_reason is not None:
- model_response.choices[0].finish_reason = map_finish_reason(
- finish_reason=self.received_finish_reason
- )
- else:
- model_response.choices[0].finish_reason = "stop"
-
- ## if tool use
- if (
- model_response.choices[0].finish_reason == "stop" and self.tool_call
- ): # don't overwrite for other - potential error finish reasons
- model_response.choices[0].finish_reason = "tool_calls"
- return model_response
-
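-    # Sync iteration: pull a raw chunk, normalize it via chunk_creator(), log success in a background thread, and enforce post_call_rules on the accumulated text.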
- def __next__(self): # noqa: PLR0915
- cache_hit = False
- if (
- self.custom_llm_provider is not None
- and self.custom_llm_provider == "cached_response"
- ):
- cache_hit = True
- try:
- if self.completion_stream is None:
- self.fetch_sync_stream()
- while True:
- if (
- isinstance(self.completion_stream, str)
- or isinstance(self.completion_stream, bytes)
- or isinstance(self.completion_stream, ModelResponse)
- ):
- chunk = self.completion_stream
- else:
- chunk = next(self.completion_stream)
- if chunk is not None and chunk != b"":
- print_verbose(
- f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}"
- )
- response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
- print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")
-
- if response is None:
- continue
- ## LOGGING
- threading.Thread(
- target=self.run_success_logging_and_cache_storage,
- args=(response, cache_hit),
- ).start() # log response
- choice = response.choices[0]
- if isinstance(choice, StreamingChoices):
- self.response_uptil_now += choice.delta.get("content", "") or ""
- else:
- self.response_uptil_now += ""
- self.rules.post_call_rules(
- input=self.response_uptil_now, model=self.model
- )
- # HANDLE STREAM OPTIONS
- self.chunks.append(response)
- if hasattr(
- response, "usage"
- ): # remove usage from chunk, only send on final chunk
- # Convert the object to a dictionary
- obj_dict = response.dict()
-
- # Remove an attribute (e.g., 'attr2')
- if "usage" in obj_dict:
- del obj_dict["usage"]
-
- # Create a new object without the removed attribute
- response = self.model_response_creator(
- chunk=obj_dict, hidden_params=response._hidden_params
- )
- # add usage as hidden param
- if self.sent_last_chunk is True and self.stream_options is None:
- usage = calculate_total_usage(chunks=self.chunks)
- response._hidden_params["usage"] = usage
- # RETURN RESULT
- return response
-
- except StopIteration:
- if self.sent_last_chunk is True:
- complete_streaming_response = litellm.stream_chunk_builder(
- chunks=self.chunks, messages=self.messages
- )
- response = self.model_response_creator()
- if complete_streaming_response is not None:
- setattr(
- response,
- "usage",
- getattr(complete_streaming_response, "usage"),
- )
-
- ## LOGGING
- threading.Thread(
- target=self.logging_obj.success_handler,
- args=(response, None, None, cache_hit),
- ).start() # log response
-
- if self.sent_stream_usage is False and self.send_stream_usage is True:
- self.sent_stream_usage = True
- return response
- raise # Re-raise StopIteration
- else:
- self.sent_last_chunk = True
- processed_chunk = self.finish_reason_handler()
- if self.stream_options is None: # add usage as hidden param
- usage = calculate_total_usage(chunks=self.chunks)
- processed_chunk._hidden_params["usage"] = usage
- ## LOGGING
- threading.Thread(
- target=self.run_success_logging_and_cache_storage,
- args=(processed_chunk, cache_hit),
- ).start() # log response
- return processed_chunk
- except Exception as e:
- traceback_exception = traceback.format_exc()
- # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
- threading.Thread(
- target=self.logging_obj.failure_handler, args=(e, traceback_exception)
- ).start()
- if isinstance(e, OpenAIError):
- raise e
- else:
- raise exception_type(
- model=self.model,
- original_exception=e,
- custom_llm_provider=self.custom_llm_provider,
- )
-
- def fetch_sync_stream(self):
- if self.completion_stream is None and self.make_call is not None:
- # Call make_call to get the completion stream
- self.completion_stream = self.make_call(client=litellm.module_level_client)
- self._stream_iter = self.completion_stream.__iter__()
-
- return self.completion_stream
-
- async def fetch_stream(self):
- if self.completion_stream is None and self.make_call is not None:
- # Call make_call to get the completion stream
- self.completion_stream = await self.make_call(
- client=litellm.module_level_aclient
- )
- self._stream_iter = self.completion_stream.__aiter__()
-
- return self.completion_stream
-
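-    # Async iteration: mirrors __next__, but awaits the provider stream and schedules async success/caching handlers on the running event loop.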
- async def __anext__(self): # noqa: PLR0915
- cache_hit = False
- if (
- self.custom_llm_provider is not None
- and self.custom_llm_provider == "cached_response"
- ):
- cache_hit = True
- try:
- if self.completion_stream is None:
- await self.fetch_stream()
-
- if (
- self.custom_llm_provider == "openai"
- or self.custom_llm_provider == "azure"
- or self.custom_llm_provider == "custom_openai"
- or self.custom_llm_provider == "text-completion-openai"
- or self.custom_llm_provider == "text-completion-codestral"
- or self.custom_llm_provider == "azure_text"
- or self.custom_llm_provider == "anthropic"
- or self.custom_llm_provider == "anthropic_text"
- or self.custom_llm_provider == "huggingface"
- or self.custom_llm_provider == "ollama"
- or self.custom_llm_provider == "ollama_chat"
- or self.custom_llm_provider == "vertex_ai"
- or self.custom_llm_provider == "vertex_ai_beta"
- or self.custom_llm_provider == "sagemaker"
- or self.custom_llm_provider == "sagemaker_chat"
- or self.custom_llm_provider == "gemini"
- or self.custom_llm_provider == "replicate"
- or self.custom_llm_provider == "cached_response"
- or self.custom_llm_provider == "predibase"
- or self.custom_llm_provider == "databricks"
- or self.custom_llm_provider == "bedrock"
- or self.custom_llm_provider == "triton"
- or self.custom_llm_provider == "watsonx"
- or self.custom_llm_provider in litellm.openai_compatible_endpoints
- or self.custom_llm_provider in litellm._custom_providers
- ):
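-                # these providers expose an async-iterable stream - consume it natively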
- async for chunk in self.completion_stream:
- if chunk == "None" or chunk is None:
- raise Exception
- elif (
- self.custom_llm_provider == "gemini"
- and hasattr(chunk, "parts")
- and len(chunk.parts) == 0
- ):
- continue
-                    # chunk_creator() does logging/stream chunk building. We need to let it know it's being called in an async function, so we don't double-add chunks.
- # __anext__ also calls async_success_handler, which does logging
- print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")
-
- processed_chunk: Optional[ModelResponse] = self.chunk_creator(
- chunk=chunk
- )
- print_verbose(
- f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}"
- )
- if processed_chunk is None:
- continue
-                    ## LOGGING
- executor.submit(
- self.logging_obj.success_handler,
- result=processed_chunk,
- start_time=None,
- end_time=None,
- cache_hit=cache_hit,
- )
-
- asyncio.create_task(
- self.logging_obj.async_success_handler(
- processed_chunk, cache_hit=cache_hit
- )
- )
-
- if self.logging_obj._llm_caching_handler is not None:
- asyncio.create_task(
- self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
- processed_chunk=processed_chunk,
- )
- )
-
- choice = processed_chunk.choices[0]
- if isinstance(choice, StreamingChoices):
- self.response_uptil_now += choice.delta.get("content", "") or ""
- else:
- self.response_uptil_now += ""
- self.rules.post_call_rules(
- input=self.response_uptil_now, model=self.model
- )
- self.chunks.append(processed_chunk)
- if hasattr(
- processed_chunk, "usage"
- ): # remove usage from chunk, only send on final chunk
- # Convert the object to a dictionary
- obj_dict = processed_chunk.dict()
-
- # Remove an attribute (e.g., 'attr2')
- if "usage" in obj_dict:
- del obj_dict["usage"]
-
- # Create a new object without the removed attribute
- processed_chunk = self.model_response_creator(chunk=obj_dict)
- print_verbose(f"final returned processed chunk: {processed_chunk}")
- return processed_chunk
- raise StopAsyncIteration
- else: # temporary patch for non-aiohttp async calls
- # example - boto3 bedrock llms
- while True:
- if isinstance(self.completion_stream, str) or isinstance(
- self.completion_stream, bytes
- ):
- chunk = self.completion_stream
- else:
- chunk = next(self.completion_stream)
- if chunk is not None and chunk != b"":
- print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
- processed_chunk: Optional[ModelResponse] = self.chunk_creator(
- chunk=chunk
- )
- print_verbose(
- f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
- )
- if processed_chunk is None:
- continue
- ## LOGGING
- threading.Thread(
- target=self.logging_obj.success_handler,
- args=(processed_chunk, None, None, cache_hit),
- ).start() # log processed_chunk
- asyncio.create_task(
- self.logging_obj.async_success_handler(
- processed_chunk, cache_hit=cache_hit
- )
- )
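+# Legacy provider-specific streaming chunk handlers and response helpers, kept below as commented-out code.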
+# def handle_predibase_chunk(self, chunk):
+# try:
+# if not isinstance(chunk, str):
+# chunk = chunk.decode(
+# "utf-8"
+# ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
+# text = ""
+# is_finished = False
+# finish_reason = ""
+# print_verbose(f"chunk: {chunk}")
+# if chunk.startswith("data:"):
+# data_json = json.loads(chunk[5:])
+# print_verbose(f"data json: {data_json}")
+# if "token" in data_json and "text" in data_json["token"]:
+# text = data_json["token"]["text"]
+# if data_json.get("details", False) and data_json["details"].get(
+# "finish_reason", False
+# ):
+# is_finished = True
+# finish_reason = data_json["details"]["finish_reason"]
+# elif data_json.get(
+# "generated_text", False
+# ): # if full generated text exists, then stream is complete
+# text = "" # don't return the final bos token
+# is_finished = True
+# finish_reason = "stop"
+# elif data_json.get("error", False):
+# raise Exception(data_json.get("error"))
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# elif "error" in chunk:
+# raise ValueError(chunk)
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception as e:
+# raise e
+
+# def handle_huggingface_chunk(self, chunk):
+# try:
+# if not isinstance(chunk, str):
+# chunk = chunk.decode(
+# "utf-8"
+# ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
+# text = ""
+# is_finished = False
+# finish_reason = ""
+# print_verbose(f"chunk: {chunk}")
+# if chunk.startswith("data:"):
+# data_json = json.loads(chunk[5:])
+# print_verbose(f"data json: {data_json}")
+# if "token" in data_json and "text" in data_json["token"]:
+# text = data_json["token"]["text"]
+# if data_json.get("details", False) and data_json["details"].get(
+# "finish_reason", False
+# ):
+# is_finished = True
+# finish_reason = data_json["details"]["finish_reason"]
+# elif data_json.get(
+# "generated_text", False
+# ): # if full generated text exists, then stream is complete
+# text = "" # don't return the final bos token
+# is_finished = True
+# finish_reason = "stop"
+# elif data_json.get("error", False):
+# raise Exception(data_json.get("error"))
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# elif "error" in chunk:
+# raise ValueError(chunk)
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception as e:
+# raise e
+
+# def handle_ai21_chunk(self, chunk): # fake streaming
+# chunk = chunk.decode("utf-8")
+# data_json = json.loads(chunk)
+# try:
+# text = data_json["completions"][0]["data"]["text"]
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_maritalk_chunk(self, chunk): # fake streaming
+# chunk = chunk.decode("utf-8")
+# data_json = json.loads(chunk)
+# try:
+# text = data_json["answer"]
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_nlp_cloud_chunk(self, chunk):
+# text = ""
+# is_finished = False
+# finish_reason = ""
+# try:
+# if "dolphin" in self.model:
+# chunk = self.process_chunk(chunk=chunk)
+# else:
+# data_json = json.loads(chunk)
+# chunk = data_json["generated_text"]
+# text = chunk
+# if "[DONE]" in text:
+# text = text.replace("[DONE]", "")
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_aleph_alpha_chunk(self, chunk):
+# chunk = chunk.decode("utf-8")
+# data_json = json.loads(chunk)
+# try:
+# text = data_json["completions"][0]["completion"]
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_cohere_chunk(self, chunk):
+# chunk = chunk.decode("utf-8")
+# data_json = json.loads(chunk)
+# try:
+# text = ""
+# is_finished = False
+# finish_reason = ""
+# index: Optional[int] = None
+# if "index" in data_json:
+# index = data_json.get("index")
+# if "text" in data_json:
+# text = data_json["text"]
+# elif "is_finished" in data_json:
+# is_finished = data_json["is_finished"]
+# finish_reason = data_json["finish_reason"]
+# else:
+# raise Exception(data_json)
+# return {
+# "index": index,
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_cohere_chat_chunk(self, chunk):
+# chunk = chunk.decode("utf-8")
+# data_json = json.loads(chunk)
+# print_verbose(f"chunk: {chunk}")
+# try:
+# text = ""
+# is_finished = False
+# finish_reason = ""
+# if "text" in data_json:
+# text = data_json["text"]
+# elif "is_finished" in data_json and data_json["is_finished"] is True:
+# is_finished = data_json["is_finished"]
+# finish_reason = data_json["finish_reason"]
+# else:
+# return
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_azure_chunk(self, chunk):
+# is_finished = False
+# finish_reason = ""
+# text = ""
+# print_verbose(f"chunk: {chunk}")
+# if "data: [DONE]" in chunk:
+# text = ""
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# elif chunk.startswith("data:"):
+# data_json = json.loads(chunk[5:]) # chunk.startswith("data:"):
+# try:
+# if len(data_json["choices"]) > 0:
+# delta = data_json["choices"][0]["delta"]
+# text = "" if delta is None else delta.get("content", "")
+# if data_json["choices"][0].get("finish_reason", None):
+# is_finished = True
+# finish_reason = data_json["choices"][0]["finish_reason"]
+# print_verbose(
+# f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}"
+# )
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(
+# f"Unable to parse response. Original response: {chunk}"
+# )
+# elif "error" in chunk:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+# else:
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+
+# def handle_replicate_chunk(self, chunk):
+# try:
+# text = ""
+# is_finished = False
+# finish_reason = ""
+# if "output" in chunk:
+# text = chunk["output"]
+# if "status" in chunk:
+# if chunk["status"] == "succeeded":
+# is_finished = True
+# finish_reason = "stop"
+# elif chunk.get("error", None):
+# raise Exception(chunk["error"])
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# except Exception:
+# raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
+# def handle_openai_chat_completion_chunk(self, chunk):
+# try:
+# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+# str_line = chunk
+# text = ""
+# is_finished = False
+# finish_reason = None
+# logprobs = None
+# usage = None
+# if str_line and str_line.choices and len(str_line.choices) > 0:
+# if (
+# str_line.choices[0].delta is not None
+# and str_line.choices[0].delta.content is not None
+# ):
+# text = str_line.choices[0].delta.content
+# else: # function/tool calling chunk - when content is None. in this case we just return the original chunk from openai
+# pass
+# if str_line.choices[0].finish_reason:
+# is_finished = True
+# finish_reason = str_line.choices[0].finish_reason
+
+# # checking for logprobs
+# if (
+# hasattr(str_line.choices[0], "logprobs")
+# and str_line.choices[0].logprobs is not None
+# ):
+# logprobs = str_line.choices[0].logprobs
+# else:
+# logprobs = None
+
+# usage = getattr(str_line, "usage", None)
+
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# "logprobs": logprobs,
+# "original_chunk": str_line,
+# "usage": usage,
+# }
+# except Exception as e:
+# raise e
+
+# def handle_azure_text_completion_chunk(self, chunk):
+# try:
+# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+# text = ""
+# is_finished = False
+# finish_reason = None
+# choices = getattr(chunk, "choices", [])
+# if len(choices) > 0:
+# text = choices[0].text
+# if choices[0].finish_reason is not None:
+# is_finished = True
+# finish_reason = choices[0].finish_reason
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+
+# except Exception as e:
+# raise e
+
+# def handle_openai_text_completion_chunk(self, chunk):
+# try:
+# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+# text = ""
+# is_finished = False
+# finish_reason = None
+# usage = None
+# choices = getattr(chunk, "choices", [])
+# if len(choices) > 0:
+# text = choices[0].text
+# if choices[0].finish_reason is not None:
+# is_finished = True
+# finish_reason = choices[0].finish_reason
+# usage = getattr(chunk, "usage", None)
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# "usage": usage,
+# }
+
+# except Exception as e:
+# raise e
+
+# def handle_baseten_chunk(self, chunk):
+# try:
+# chunk = chunk.decode("utf-8")
+# if len(chunk) > 0:
+# if chunk.startswith("data:"):
+# data_json = json.loads(chunk[5:])
+# if "token" in data_json and "text" in data_json["token"]:
+# return data_json["token"]["text"]
+# else:
+# return ""
+# data_json = json.loads(chunk)
+# if "model_output" in data_json:
+# if (
+# isinstance(data_json["model_output"], dict)
+# and "data" in data_json["model_output"]
+# and isinstance(data_json["model_output"]["data"], list)
+# ):
+# return data_json["model_output"]["data"][0]
+# elif isinstance(data_json["model_output"], str):
+# return data_json["model_output"]
+# elif "completion" in data_json and isinstance(
+# data_json["completion"], str
+# ):
+# return data_json["completion"]
+# else:
+# raise ValueError(
+# f"Unable to parse response. Original response: {chunk}"
+# )
+# else:
+# return ""
+# else:
+# return ""
+# except Exception as e:
+# verbose_logger.exception(
+#                 "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occurred - {}".format(
+# str(e)
+# )
+# )
+# return ""
+
+# def handle_cloudlfare_stream(self, chunk):
+# try:
+# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+# chunk = chunk.decode("utf-8")
+# str_line = chunk
+# text = ""
+# is_finished = False
+# finish_reason = None
+
+# if "[DONE]" in chunk:
+# return {"text": text, "is_finished": True, "finish_reason": "stop"}
+# elif str_line.startswith("data:"):
+# data_json = json.loads(str_line[5:])
+# print_verbose(f"delta content: {data_json}")
+# text = data_json["response"]
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# else:
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+
+# except Exception as e:
+# raise e
+
+# def handle_ollama_stream(self, chunk):
+# try:
+# if isinstance(chunk, dict):
+# json_chunk = chunk
+# else:
+# json_chunk = json.loads(chunk)
+# if "error" in json_chunk:
+# raise Exception(f"Ollama Error - {json_chunk}")
+
+# text = ""
+# is_finished = False
+# finish_reason = None
+# if json_chunk["done"] is True:
+# text = ""
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# elif json_chunk["response"]:
+# print_verbose(f"delta content: {json_chunk}")
+# text = json_chunk["response"]
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# else:
+# raise Exception(f"Ollama Error - {json_chunk}")
+# except Exception as e:
+# raise e
+
+# def handle_ollama_chat_stream(self, chunk):
+# # for ollama_chat/ provider
+# try:
+# if isinstance(chunk, dict):
+# json_chunk = chunk
+# else:
+# json_chunk = json.loads(chunk)
+# if "error" in json_chunk:
+# raise Exception(f"Ollama Error - {json_chunk}")
+
+# text = ""
+# is_finished = False
+# finish_reason = None
+# if json_chunk["done"] is True:
+# text = ""
+# is_finished = True
+# finish_reason = "stop"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# elif "message" in json_chunk:
+# print_verbose(f"delta content: {json_chunk}")
+# text = json_chunk["message"]["content"]
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# }
+# else:
+# raise Exception(f"Ollama Error - {json_chunk}")
+# except Exception as e:
+# raise e
+
+# def handle_watsonx_stream(self, chunk):
+# try:
+# if isinstance(chunk, dict):
+# parsed_response = chunk
+# elif isinstance(chunk, (str, bytes)):
+# if isinstance(chunk, bytes):
+# chunk = chunk.decode("utf-8")
+# if "generated_text" in chunk:
+# response = chunk.replace("data: ", "").strip()
+# parsed_response = json.loads(response)
+# else:
+# return {
+# "text": "",
+# "is_finished": False,
+# "prompt_tokens": 0,
+# "completion_tokens": 0,
+# }
+# else:
+# print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+# raise ValueError(
+# f"Unable to parse response. Original response: {chunk}"
+# )
+# results = parsed_response.get("results", [])
+# if len(results) > 0:
+# text = results[0].get("generated_text", "")
+# finish_reason = results[0].get("stop_reason")
+# is_finished = finish_reason != "not_finished"
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# "prompt_tokens": results[0].get("input_token_count", 0),
+# "completion_tokens": results[0].get("generated_token_count", 0),
+# }
+# return {"text": "", "is_finished": False}
+# except Exception as e:
+# raise e
+
+# def handle_triton_stream(self, chunk):
+# try:
+# if isinstance(chunk, dict):
+# parsed_response = chunk
+# elif isinstance(chunk, (str, bytes)):
+# if isinstance(chunk, bytes):
+# chunk = chunk.decode("utf-8")
+# if "text_output" in chunk:
+# response = chunk.replace("data: ", "").strip()
+# parsed_response = json.loads(response)
+# else:
+# return {
+# "text": "",
+# "is_finished": False,
+# "prompt_tokens": 0,
+# "completion_tokens": 0,
+# }
+# else:
+# print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+# raise ValueError(
+# f"Unable to parse response. Original response: {chunk}"
+# )
+# text = parsed_response.get("text_output", "")
+# finish_reason = parsed_response.get("stop_reason")
+# is_finished = parsed_response.get("is_finished", False)
+# return {
+# "text": text,
+# "is_finished": is_finished,
+# "finish_reason": finish_reason,
+# "prompt_tokens": parsed_response.get("input_token_count", 0),
+# "completion_tokens": parsed_response.get("generated_token_count", 0),
+# }
+# return {"text": "", "is_finished": False}
+# except Exception as e:
+# raise e
+
+# def handle_clarifai_completion_chunk(self, chunk):
+# try:
+# if isinstance(chunk, dict):
+# parsed_response = chunk
+# elif isinstance(chunk, (str, bytes)):
+# if isinstance(chunk, bytes):
+# parsed_response = chunk.decode("utf-8")
+# else:
+# parsed_response = chunk
+# else:
+# raise ValueError("Unable to parse streaming chunk")
+# if isinstance(parsed_response, dict):
+# data_json = parsed_response
+# else:
+# data_json = json.loads(parsed_response)
+# text = (
+# data_json.get("outputs", "")[0]
+# .get("data", "")
+# .get("text", "")
+# .get("raw", "")
+# )
+# len(
+# encoding.encode(
+# data_json.get("outputs", "")[0]
+# .get("input", "")
+# .get("data", "")
+# .get("text", "")
+# .get("raw", "")
+# )
+# )
+# len(encoding.encode(text))
+# return {
+# "text": text,
+# "is_finished": True,
+# }
+# except Exception as e:
+# verbose_logger.exception(
+#                 "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occurred - {}".format(
+# str(e)
+# )
+# )
+# return ""
+
+# def model_response_creator(
+# self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None
+# ):
+# _model = self.model
+# _received_llm_provider = self.custom_llm_provider
+# _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore
+# if (
+# _received_llm_provider == "openai"
+# and _received_llm_provider != _logging_obj_llm_provider
+# ):
+# _model = "{}/{}".format(_logging_obj_llm_provider, _model)
+# if chunk is None:
+# chunk = {}
+# else:
+# # pop model keyword
+# chunk.pop("model", None)
+
+# model_response = ModelResponse(
+# stream=True, model=_model, stream_options=self.stream_options, **chunk
+# )
+# if self.response_id is not None:
+# model_response.id = self.response_id
+# else:
+# self.response_id = model_response.id # type: ignore
+# if self.system_fingerprint is not None:
+# model_response.system_fingerprint = self.system_fingerprint
+# if hidden_params is not None:
+# model_response._hidden_params = hidden_params
+# model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
+# model_response._hidden_params["created_at"] = time.time()
+# model_response._hidden_params = {
+# **model_response._hidden_params,
+# **self._hidden_params,
+# }
- choice = processed_chunk.choices[0]
- if isinstance(choice, StreamingChoices):
- self.response_uptil_now += (
- choice.delta.get("content", "") or ""
- )
- else:
- self.response_uptil_now += ""
- self.rules.post_call_rules(
- input=self.response_uptil_now, model=self.model
- )
- # RETURN RESULT
- self.chunks.append(processed_chunk)
- return processed_chunk
- except (StopAsyncIteration, StopIteration):
- if self.sent_last_chunk is True:
- # log the final chunk with accurate streaming values
- complete_streaming_response = litellm.stream_chunk_builder(
- chunks=self.chunks, messages=self.messages
- )
- response = self.model_response_creator()
- if complete_streaming_response is not None:
- setattr(
- response,
- "usage",
- getattr(complete_streaming_response, "usage"),
- )
- ## LOGGING
- threading.Thread(
- target=self.logging_obj.success_handler,
- args=(response, None, None, cache_hit),
- ).start() # log response
- asyncio.create_task(
- self.logging_obj.async_success_handler(
- response, cache_hit=cache_hit
- )
- )
- if self.sent_stream_usage is False and self.send_stream_usage is True:
- self.sent_stream_usage = True
- return response
-                    raise StopAsyncIteration # Re-raise as StopAsyncIteration
- else:
- self.sent_last_chunk = True
- processed_chunk = self.finish_reason_handler()
- ## LOGGING
- threading.Thread(
- target=self.logging_obj.success_handler,
- args=(processed_chunk, None, None, cache_hit),
- ).start() # log response
- asyncio.create_task(
- self.logging_obj.async_success_handler(
- processed_chunk, cache_hit=cache_hit
- )
- )
- return processed_chunk
-        except httpx.TimeoutException as e: # if httpx read timeout error occurs
- traceback_exception = traceback.format_exc()
- ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT
- traceback_exception += "\nLiteLLM Default Request Timeout - {}".format(
- litellm.request_timeout
- )
- if self.logging_obj is not None:
- ## LOGGING
- threading.Thread(
- target=self.logging_obj.failure_handler,
- args=(e, traceback_exception),
- ).start() # log response
- # Handle any exceptions that might occur during streaming
- asyncio.create_task(
- self.logging_obj.async_failure_handler(e, traceback_exception)
- )
- raise e
- except Exception as e:
- traceback_exception = traceback.format_exc()
- if self.logging_obj is not None:
- ## LOGGING
- threading.Thread(
- target=self.logging_obj.failure_handler,
- args=(e, traceback_exception),
- ).start() # log response
- # Handle any exceptions that might occur during streaming
- asyncio.create_task(
- self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore
- )
- ## Map to OpenAI Exception
- raise exception_type(
- model=self.model,
- custom_llm_provider=self.custom_llm_provider,
- original_exception=e,
- completion_kwargs={},
- extra_kwargs={},
- )
+# if (
+# len(model_response.choices) > 0
+# and getattr(model_response.choices[0], "delta") is not None
+# ):
+# # do nothing, if object instantiated
+# pass
+# else:
+# model_response.choices = [StreamingChoices(finish_reason=None)]
+# return model_response
+
+# def is_delta_empty(self, delta: Delta) -> bool:
+# is_empty = True
+# if delta.content is not None:
+# is_empty = False
+# elif delta.tool_calls is not None:
+# is_empty = False
+# elif delta.function_call is not None:
+# is_empty = False
+# return is_empty
+
+# def return_processed_chunk_logic( # noqa
+# self,
+# completion_obj: dict,
+# model_response: ModelResponseStream,
+# response_obj: dict,
+# ):
+
+# print_verbose(
+# f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
+# )
+# if (
+# "content" in completion_obj
+# and (
+# isinstance(completion_obj["content"], str)
+# and len(completion_obj["content"]) > 0
+# )
+# or (
+# "tool_calls" in completion_obj
+# and completion_obj["tool_calls"] is not None
+# and len(completion_obj["tool_calls"]) > 0
+# )
+# or (
+# "function_call" in completion_obj
+# and completion_obj["function_call"] is not None
+# )
+# ): # cannot set content of an OpenAI Object to be an empty string
+# self.safety_checker()
+# hold, model_response_str = self.check_special_tokens(
+# chunk=completion_obj["content"],
+# finish_reason=model_response.choices[0].finish_reason,
+# ) # filter out bos/eos tokens from openai-compatible hf endpoints
+# print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
+# if hold is False:
+# ## check if openai/azure chunk
+# original_chunk = response_obj.get("original_chunk", None)
+# if original_chunk:
+# model_response.id = original_chunk.id
+# self.response_id = original_chunk.id
+# if len(original_chunk.choices) > 0:
+# choices = []
+# for choice in original_chunk.choices:
+# try:
+# if isinstance(choice, BaseModel):
+# choice_json = choice.model_dump()
+# choice_json.pop(
+# "finish_reason", None
+# ) # for mistral etc. which return a value in their last chunk (not-openai compatible).
+# print_verbose(f"choice_json: {choice_json}")
+# choices.append(StreamingChoices(**choice_json))
+# except Exception:
+# choices.append(StreamingChoices())
+# print_verbose(f"choices in streaming: {choices}")
+# setattr(model_response, "choices", choices)
+# else:
+# return
+# model_response.system_fingerprint = (
+# original_chunk.system_fingerprint
+# )
+# setattr(
+# model_response,
+# "citations",
+# getattr(original_chunk, "citations", None),
+# )
+# print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
+# if self.sent_first_chunk is False:
+# model_response.choices[0].delta["role"] = "assistant"
+# self.sent_first_chunk = True
+# elif self.sent_first_chunk is True and hasattr(
+# model_response.choices[0].delta, "role"
+# ):
+# _initial_delta = model_response.choices[0].delta.model_dump()
+# _initial_delta.pop("role", None)
+# model_response.choices[0].delta = Delta(**_initial_delta)
+# print_verbose(
+# f"model_response.choices[0].delta: {model_response.choices[0].delta}"
+# )
+# else:
+# ## else
+# completion_obj["content"] = model_response_str
+# if self.sent_first_chunk is False:
+# completion_obj["role"] = "assistant"
+# self.sent_first_chunk = True
+
+# model_response.choices[0].delta = Delta(**completion_obj)
+# _index: Optional[int] = completion_obj.get("index")
+# if _index is not None:
+# model_response.choices[0].index = _index
+# print_verbose(f"returning model_response: {model_response}")
+# return model_response
+# else:
+# return
+# elif self.received_finish_reason is not None:
+# if self.sent_last_chunk is True:
+# # Bedrock returns the guardrail trace in the last chunk - we want to return this here
+# if self.custom_llm_provider == "bedrock" and "trace" in model_response:
+# return model_response
+
+# # Default - return StopIteration
+# raise StopIteration
+# # flush any remaining holding chunk
+# if len(self.holding_chunk) > 0:
+# if model_response.choices[0].delta.content is None:
+# model_response.choices[0].delta.content = self.holding_chunk
+# else:
+# model_response.choices[0].delta.content = (
+# self.holding_chunk + model_response.choices[0].delta.content
+# )
+# self.holding_chunk = ""
+#             # if delta is empty
+# _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta)
+
+# if _is_delta_empty:
+# # get any function call arguments
+# model_response.choices[0].finish_reason = map_finish_reason(
+# finish_reason=self.received_finish_reason
+# ) # ensure consistent output to openai
+
+# self.sent_last_chunk = True
+
+# return model_response
+# elif (
+# model_response.choices[0].delta.tool_calls is not None
+# or model_response.choices[0].delta.function_call is not None
+# ):
+# if self.sent_first_chunk is False:
+# model_response.choices[0].delta["role"] = "assistant"
+# self.sent_first_chunk = True
+# return model_response
+# elif (
+# len(model_response.choices) > 0
+# and hasattr(model_response.choices[0].delta, "audio")
+# and model_response.choices[0].delta.audio is not None
+# ):
+# return model_response
+# else:
+# if hasattr(model_response, "usage"):
+# self.chunks.append(model_response)
+# return
+
+# def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
+# model_response = self.model_response_creator()
+# response_obj: dict = {}
+# try:
+# # return this for all models
+# completion_obj = {"content": ""}
+# from litellm.litellm_core_utils.streaming_utils import (
+# generic_chunk_has_all_required_fields,
+# )
+# from litellm.types.utils import GenericStreamingChunk as GChunk
+
+# if (
+# isinstance(chunk, dict)
+# and generic_chunk_has_all_required_fields(
+# chunk=chunk
+# ) # check if chunk is a generic streaming chunk
+# ) or (
+# self.custom_llm_provider
+# and (
+# self.custom_llm_provider == "anthropic"
+# or self.custom_llm_provider in litellm._custom_providers
+# )
+# ):
+
+# if self.received_finish_reason is not None:
+# if "provider_specific_fields" not in chunk:
+# raise StopIteration
+# anthropic_response_obj: GChunk = chunk
+# completion_obj["content"] = anthropic_response_obj["text"]
+# if anthropic_response_obj["is_finished"]:
+# self.received_finish_reason = anthropic_response_obj[
+# "finish_reason"
+# ]
+
+# if anthropic_response_obj["usage"] is not None:
+# model_response.usage = litellm.Usage(
+# **anthropic_response_obj["usage"]
+# )
+
+# if (
+# "tool_use" in anthropic_response_obj
+# and anthropic_response_obj["tool_use"] is not None
+# ):
+# completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]]
+
+# if (
+# "provider_specific_fields" in anthropic_response_obj
+# and anthropic_response_obj["provider_specific_fields"] is not None
+# ):
+# for key, value in anthropic_response_obj[
+# "provider_specific_fields"
+# ].items():
+# setattr(model_response, key, value)
+
+# response_obj = anthropic_response_obj
+# elif (
+# self.custom_llm_provider
+# and self.custom_llm_provider == "anthropic_text"
+# ):
+# response_obj = self.handle_anthropic_text_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider and self.custom_llm_provider == "clarifai":
+# response_obj = self.handle_clarifai_completion_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.model == "replicate" or self.custom_llm_provider == "replicate":
+# response_obj = self.handle_replicate_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
+# response_obj = self.handle_huggingface_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider and self.custom_llm_provider == "predibase":
+# response_obj = self.handle_predibase_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif (
+# self.custom_llm_provider and self.custom_llm_provider == "baseten"
+# ): # baseten doesn't provide streaming
+# completion_obj["content"] = self.handle_baseten_chunk(chunk)
+# elif (
+# self.custom_llm_provider and self.custom_llm_provider == "ai21"
+# ): # ai21 doesn't provide streaming
+# response_obj = self.handle_ai21_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider and self.custom_llm_provider == "maritalk":
+# response_obj = self.handle_maritalk_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
+# completion_obj["content"] = chunk[0].outputs[0].text
+# elif (
+# self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha"
+# ): # aleph alpha doesn't provide streaming
+# response_obj = self.handle_aleph_alpha_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "nlp_cloud":
+# try:
+# response_obj = self.handle_nlp_cloud_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# except Exception as e:
+# if self.received_finish_reason:
+# raise e
+# else:
+# if self.sent_first_chunk is False:
+# raise Exception("An unknown error occurred with the stream")
+# self.received_finish_reason = "stop"
+# elif self.custom_llm_provider == "vertex_ai":
+# import proto # type: ignore
+
+# if self.model.startswith("claude-3"):
+# response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk)
+# if response_obj is None:
+# return
+# completion_obj["content"] = response_obj["text"]
+# setattr(model_response, "usage", Usage())
+# if response_obj.get("prompt_tokens", None) is not None:
+# model_response.usage.prompt_tokens = response_obj[
+# "prompt_tokens"
+# ]
+# if response_obj.get("completion_tokens", None) is not None:
+# model_response.usage.completion_tokens = response_obj[
+# "completion_tokens"
+# ]
+# if hasattr(model_response.usage, "prompt_tokens"):
+# model_response.usage.total_tokens = (
+# getattr(model_response.usage, "total_tokens", 0)
+# + model_response.usage.prompt_tokens
+# )
+# if hasattr(model_response.usage, "completion_tokens"):
+# model_response.usage.total_tokens = (
+# getattr(model_response.usage, "total_tokens", 0)
+# + model_response.usage.completion_tokens
+# )
+
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif hasattr(chunk, "candidates") is True:
+# try:
+# try:
+# completion_obj["content"] = chunk.text
+# except Exception as e:
+# if "Part has no text." in str(e):
+# ## check for function calling
+# function_call = (
+# chunk.candidates[0].content.parts[0].function_call
+# )
+
+# args_dict = {}
+
+# # Check if it's a RepeatedComposite instance
+# for key, val in function_call.args.items():
+# if isinstance(
+# val,
+# proto.marshal.collections.repeated.RepeatedComposite,
+# ):
+# # If so, convert to list
+# args_dict[key] = [v for v in val]
+# else:
+# args_dict[key] = val
+
+# try:
+# args_str = json.dumps(args_dict)
+# except Exception as e:
+# raise e
+# _delta_obj = litellm.utils.Delta(
+# content=None,
+# tool_calls=[
+# {
+# "id": f"call_{str(uuid.uuid4())}",
+# "function": {
+# "arguments": args_str,
+# "name": function_call.name,
+# },
+# "type": "function",
+# }
+# ],
+# )
+# _streaming_response = StreamingChoices(delta=_delta_obj)
+# _model_response = ModelResponse(stream=True)
+# _model_response.choices = [_streaming_response]
+# response_obj = {"original_chunk": _model_response}
+# else:
+# raise e
+# if (
+# hasattr(chunk.candidates[0], "finish_reason")
+# and chunk.candidates[0].finish_reason.name
+# != "FINISH_REASON_UNSPECIFIED"
+#                         ):  # non-final vertex ai chunks report FINISH_REASON_UNSPECIFIED, so this marks the final chunk
+# self.received_finish_reason = chunk.candidates[
+# 0
+# ].finish_reason.name
+# except Exception:
+# if chunk.candidates[0].finish_reason.name == "SAFETY":
+# raise Exception(
+# f"The response was blocked by VertexAI. {str(chunk)}"
+# )
+# else:
+# completion_obj["content"] = str(chunk)
+# elif self.custom_llm_provider == "cohere":
+# response_obj = self.handle_cohere_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "cohere_chat":
+# response_obj = self.handle_cohere_chat_chunk(chunk)
+# if response_obj is None:
+# return
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+
+# elif self.custom_llm_provider == "petals":
+# if len(self.completion_stream) == 0:
+# if self.received_finish_reason is not None:
+# raise StopIteration
+# else:
+# self.received_finish_reason = "stop"
+# chunk_size = 30
+# new_chunk = self.completion_stream[:chunk_size]
+# completion_obj["content"] = new_chunk
+# self.completion_stream = self.completion_stream[chunk_size:]
+# elif self.custom_llm_provider == "palm":
+# # fake streaming
+# response_obj = {}
+# if len(self.completion_stream) == 0:
+# if self.received_finish_reason is not None:
+# raise StopIteration
+# else:
+# self.received_finish_reason = "stop"
+# chunk_size = 30
+# new_chunk = self.completion_stream[:chunk_size]
+# completion_obj["content"] = new_chunk
+# self.completion_stream = self.completion_stream[chunk_size:]
+# elif self.custom_llm_provider == "ollama":
+# response_obj = self.handle_ollama_stream(chunk)
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "ollama_chat":
+# response_obj = self.handle_ollama_chat_stream(chunk)
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "cloudflare":
+# response_obj = self.handle_cloudlfare_stream(chunk)
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "watsonx":
+# response_obj = self.handle_watsonx_stream(chunk)
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "triton":
+# response_obj = self.handle_triton_stream(chunk)
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "text-completion-openai":
+# response_obj = self.handle_openai_text_completion_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# if response_obj["usage"] is not None:
+# model_response.usage = litellm.Usage(
+# prompt_tokens=response_obj["usage"].prompt_tokens,
+# completion_tokens=response_obj["usage"].completion_tokens,
+# total_tokens=response_obj["usage"].total_tokens,
+# )
+# elif self.custom_llm_provider == "text-completion-codestral":
+# response_obj = litellm.MistralTextCompletionConfig()._chunk_parser(
+# chunk
+# )
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+#                 if response_obj.get("usage", None) is not None:
+# model_response.usage = litellm.Usage(
+# prompt_tokens=response_obj["usage"].prompt_tokens,
+# completion_tokens=response_obj["usage"].completion_tokens,
+# total_tokens=response_obj["usage"].total_tokens,
+# )
+# elif self.custom_llm_provider == "azure_text":
+# response_obj = self.handle_azure_text_completion_chunk(chunk)
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# elif self.custom_llm_provider == "cached_response":
+# response_obj = {
+# "text": chunk.choices[0].delta.content,
+# "is_finished": True,
+# "finish_reason": chunk.choices[0].finish_reason,
+# "original_chunk": chunk,
+# "tool_calls": (
+# chunk.choices[0].delta.tool_calls
+# if hasattr(chunk.choices[0].delta, "tool_calls")
+# else None
+# ),
+# }
+
+# completion_obj["content"] = response_obj["text"]
+# if response_obj["tool_calls"] is not None:
+# completion_obj["tool_calls"] = response_obj["tool_calls"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if hasattr(chunk, "id"):
+# model_response.id = chunk.id
+# self.response_id = chunk.id
+# if hasattr(chunk, "system_fingerprint"):
+# self.system_fingerprint = chunk.system_fingerprint
+# if response_obj["is_finished"]:
+# self.received_finish_reason = response_obj["finish_reason"]
+# else: # openai / azure chat model
+# if self.custom_llm_provider == "azure":
+# if hasattr(chunk, "model"):
+#                         # for azure, we need to pass the model from the original chunk
+# self.model = chunk.model
+# response_obj = self.handle_openai_chat_completion_chunk(chunk)
+# if response_obj is None:
+# return
+# completion_obj["content"] = response_obj["text"]
+# print_verbose(f"completion obj content: {completion_obj['content']}")
+# if response_obj["is_finished"]:
+# if response_obj["finish_reason"] == "error":
+# raise Exception(
+# "{} raised a streaming error - finish_reason: error, no content string given. Received Chunk={}".format(
+# self.custom_llm_provider, response_obj
+# )
+# )
+# self.received_finish_reason = response_obj["finish_reason"]
+# if response_obj.get("original_chunk", None) is not None:
+# if hasattr(response_obj["original_chunk"], "id"):
+# model_response.id = response_obj["original_chunk"].id
+# self.response_id = model_response.id
+# if hasattr(response_obj["original_chunk"], "system_fingerprint"):
+# model_response.system_fingerprint = response_obj[
+# "original_chunk"
+# ].system_fingerprint
+# self.system_fingerprint = response_obj[
+# "original_chunk"
+# ].system_fingerprint
+# if response_obj["logprobs"] is not None:
+# model_response.choices[0].logprobs = response_obj["logprobs"]
+
+# if response_obj["usage"] is not None:
+# if isinstance(response_obj["usage"], dict):
+# model_response.usage = litellm.Usage(
+# prompt_tokens=response_obj["usage"].get(
+# "prompt_tokens", None
+# )
+# or None,
+# completion_tokens=response_obj["usage"].get(
+# "completion_tokens", None
+# )
+# or None,
+# total_tokens=response_obj["usage"].get("total_tokens", None)
+# or None,
+# )
+# elif isinstance(response_obj["usage"], BaseModel):
+# model_response.usage = litellm.Usage(
+# **response_obj["usage"].model_dump()
+# )
+
+# model_response.model = self.model
+# print_verbose(
+# f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
+# )
+# ## FUNCTION CALL PARSING
+# if (
+# response_obj is not None
+# and response_obj.get("original_chunk", None) is not None
+# ): # function / tool calling branch - only set for openai/azure compatible endpoints
+# # enter this branch when no content has been passed in response
+# original_chunk = response_obj.get("original_chunk", None)
+# model_response.id = original_chunk.id
+# self.response_id = original_chunk.id
+# if original_chunk.choices and len(original_chunk.choices) > 0:
+# delta = original_chunk.choices[0].delta
+# if delta is not None and (
+# delta.function_call is not None or delta.tool_calls is not None
+# ):
+# try:
+# model_response.system_fingerprint = (
+# original_chunk.system_fingerprint
+# )
+# ## AZURE - check if arguments is not None
+# if (
+# original_chunk.choices[0].delta.function_call
+# is not None
+# ):
+# if (
+# getattr(
+# original_chunk.choices[0].delta.function_call,
+# "arguments",
+# )
+# is None
+# ):
+# original_chunk.choices[
+# 0
+# ].delta.function_call.arguments = ""
+# elif original_chunk.choices[0].delta.tool_calls is not None:
+# if isinstance(
+# original_chunk.choices[0].delta.tool_calls, list
+# ):
+# for t in original_chunk.choices[0].delta.tool_calls:
+#                                         if hasattr(t, "function") and hasattr(
+#                                             t.function, "arguments"
+# ):
+# if (
+# getattr(
+# t.function,
+# "arguments",
+# )
+# is None
+# ):
+# t.function.arguments = ""
+# _json_delta = delta.model_dump()
+# print_verbose(f"_json_delta: {_json_delta}")
+# if "role" not in _json_delta or _json_delta["role"] is None:
+# _json_delta["role"] = (
+# "assistant" # mistral's api returns role as None
+# )
+# if "tool_calls" in _json_delta and isinstance(
+# _json_delta["tool_calls"], list
+# ):
+# for tool in _json_delta["tool_calls"]:
+# if (
+# isinstance(tool, dict)
+# and "function" in tool
+# and isinstance(tool["function"], dict)
+# and ("type" not in tool or tool["type"] is None)
+# ):
+# # if function returned but type set to None - mistral's api returns type: None
+# tool["type"] = "function"
+# model_response.choices[0].delta = Delta(**_json_delta)
+# except Exception as e:
+# verbose_logger.exception(
+#                                 "litellm.CustomStreamWrapper.chunk_creator(): Exception occurred - {}".format(
+# str(e)
+# )
+# )
+# model_response.choices[0].delta = Delta()
+# elif (
+# delta is not None and getattr(delta, "audio", None) is not None
+# ):
+# model_response.choices[0].delta.audio = delta.audio
+# else:
+# try:
+# delta = (
+# dict()
+# if original_chunk.choices[0].delta is None
+# else dict(original_chunk.choices[0].delta)
+# )
+# print_verbose(f"original delta: {delta}")
+# model_response.choices[0].delta = Delta(**delta)
+# print_verbose(
+# f"new delta: {model_response.choices[0].delta}"
+# )
+# except Exception:
+# model_response.choices[0].delta = Delta()
+# else:
+# if (
+# self.stream_options is not None
+# and self.stream_options["include_usage"] is True
+# ):
+# return model_response
+# return
+# print_verbose(
+# f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"
+# )
+# print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
+
+# ## CHECK FOR TOOL USE
+# if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0:
+# if self.is_function_call is True: # user passed in 'functions' param
+# completion_obj["function_call"] = completion_obj["tool_calls"][0][
+# "function"
+# ]
+# completion_obj["tool_calls"] = None
+
+# self.tool_call = True
+
+# ## RETURN ARG
+# return self.return_processed_chunk_logic(
+# completion_obj=completion_obj,
+# model_response=model_response, # type: ignore
+# response_obj=response_obj,
+# )
+
+# except StopIteration:
+# raise StopIteration
+# except Exception as e:
+# traceback.format_exc()
+# e.message = str(e)
+# raise exception_type(
+# model=self.model,
+# custom_llm_provider=self.custom_llm_provider,
+# original_exception=e,
+# )
+
+# def set_logging_event_loop(self, loop):
+# """
+# import litellm, asyncio
+
+# loop = asyncio.get_event_loop() # 👈 gets the current event loop
+
+# response = litellm.completion(.., stream=True)
+
+# response.set_logging_event_loop(loop=loop) # 👈 enables async_success callbacks for sync logging
+
+# for chunk in response:
+# ...
+# """
+# self.logging_loop = loop
+
+# def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
+# """
+# Runs success logging in a thread and adds the response to the cache
+# """
+# if litellm.disable_streaming_logging is True:
+# """
+# [NOT RECOMMENDED]
+# Set this via `litellm.disable_streaming_logging = True`.
+
+# Disables streaming logging.
+# """
+# return
+# ## ASYNC LOGGING
+# # Create an event loop for the new thread
+# if self.logging_loop is not None:
+# future = asyncio.run_coroutine_threadsafe(
+# self.logging_obj.async_success_handler(
+# processed_chunk, None, None, cache_hit
+# ),
+# loop=self.logging_loop,
+# )
+# future.result()
+# else:
+# asyncio.run(
+# self.logging_obj.async_success_handler(
+# processed_chunk, None, None, cache_hit
+# )
+# )
+# ## SYNC LOGGING
+# self.logging_obj.success_handler(processed_chunk, None, None, cache_hit)
+
+# ## Sync store in cache
+# if self.logging_obj._llm_caching_handler is not None:
+# self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache(
+# processed_chunk
+# )
+
+# def finish_reason_handler(self):
+# model_response = self.model_response_creator()
+# complete_streaming_response = litellm.stream_chunk_builder(
+# chunks=self.chunks
+# )
+# _finish_reason = complete_streaming_response.choices[0].finish_reason
+
+#         print_verbose(f"_finish_reason: {_finish_reason}")
+# if _finish_reason is not None:
+# model_response.choices[0].finish_reason = _finish_reason
+# else:
+# model_response.choices[0].finish_reason = "stop"
+
+# ## if tool use
+# if (
+# model_response.choices[0].finish_reason == "stop" and self.tool_call
+# ): # don't overwrite for other - potential error finish reasons
+# model_response.choices[0].finish_reason = "tool_calls"
+# return model_response
+
+# def __next__(self): # noqa: PLR0915
+# cache_hit = False
+# if (
+# self.custom_llm_provider is not None
+# and self.custom_llm_provider == "cached_response"
+# ):
+# cache_hit = True
+# try:
+# if self.completion_stream is None:
+# self.fetch_sync_stream()
+# while True:
+# if (
+# isinstance(self.completion_stream, str)
+# or isinstance(self.completion_stream, bytes)
+# or isinstance(self.completion_stream, ModelResponse)
+# ):
+# chunk = self.completion_stream
+# else:
+# chunk = next(self.completion_stream)
+# if chunk is not None and chunk != b"":
+# print_verbose(
+# f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}"
+# )
+# response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
+# print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")
+
+# if response is None:
+# continue
+# ## LOGGING
+# threading.Thread(
+# target=self.run_success_logging_and_cache_storage,
+# args=(response, cache_hit),
+# ).start() # log response
+# choice = response.choices[0]
+# if isinstance(choice, StreamingChoices):
+# self.response_uptil_now += choice.delta.get("content", "") or ""
+# else:
+# self.response_uptil_now += ""
+# self.rules.post_call_rules(
+# input=self.response_uptil_now, model=self.model
+# )
+# # HANDLE STREAM OPTIONS
+# self.chunks.append(response)
+# if hasattr(
+# response, "usage"
+# ): # remove usage from chunk, only send on final chunk
+# # Convert the object to a dictionary
+# obj_dict = response.dict()
+
+# # Remove an attribute (e.g., 'attr2')
+# if "usage" in obj_dict:
+# del obj_dict["usage"]
+
+# # Create a new object without the removed attribute
+# response = self.model_response_creator(
+# chunk=obj_dict, hidden_params=response._hidden_params
+# )
+# # add usage as hidden param
+# if self.sent_last_chunk is True and self.stream_options is None:
+# usage = calculate_total_usage(chunks=self.chunks)
+# response._hidden_params["usage"] = usage
+# # RETURN RESULT
+# return response
+
+# except StopIteration:
+# if self.sent_last_chunk is True:
+# complete_streaming_response = litellm.stream_chunk_builder(
+# chunks=self.chunks, messages=self.messages
+# )
+# response = self.model_response_creator()
+# if complete_streaming_response is not None:
+# setattr(
+# response,
+# "usage",
+# getattr(complete_streaming_response, "usage"),
+# )
+
+# ## LOGGING
+# threading.Thread(
+# target=self.logging_obj.success_handler,
+# args=(response, None, None, cache_hit),
+# ).start() # log response
+
+# if self.sent_stream_usage is False and self.send_stream_usage is True:
+# self.sent_stream_usage = True
+# return response
+# raise # Re-raise StopIteration
+# else:
+# self.sent_last_chunk = True
+# processed_chunk = self.finish_reason_handler()
+# if self.stream_options is None: # add usage as hidden param
+# usage = calculate_total_usage(chunks=self.chunks)
+# processed_chunk._hidden_params["usage"] = usage
+# ## LOGGING
+# threading.Thread(
+# target=self.run_success_logging_and_cache_storage,
+# args=(processed_chunk, cache_hit),
+# ).start() # log response
+# return processed_chunk
+# except Exception as e:
+# traceback_exception = traceback.format_exc()
+# # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
+# threading.Thread(
+# target=self.logging_obj.failure_handler, args=(e, traceback_exception)
+# ).start()
+# if isinstance(e, OpenAIError):
+# raise e
+# else:
+# raise exception_type(
+# model=self.model,
+# original_exception=e,
+# custom_llm_provider=self.custom_llm_provider,
+# )
+
+# def fetch_sync_stream(self):
+# if self.completion_stream is None and self.make_call is not None:
+# # Call make_call to get the completion stream
+# self.completion_stream = self.make_call(client=litellm.module_level_client)
+# self._stream_iter = self.completion_stream.__iter__()
+
+# return self.completion_stream
+
+# async def fetch_stream(self):
+# if self.completion_stream is None and self.make_call is not None:
+# # Call make_call to get the completion stream
+# self.completion_stream = await self.make_call(
+# client=litellm.module_level_aclient
+# )
+# self._stream_iter = self.completion_stream.__aiter__()
+
+# return self.completion_stream
+
+# async def __anext__(self): # noqa: PLR0915
+# cache_hit = False
+# if (
+# self.custom_llm_provider is not None
+# and self.custom_llm_provider == "cached_response"
+# ):
+# cache_hit = True
+# try:
+# if self.completion_stream is None:
+# await self.fetch_stream()
+
+# if (
+# self.custom_llm_provider == "openai"
+# or self.custom_llm_provider == "azure"
+# or self.custom_llm_provider == "custom_openai"
+# or self.custom_llm_provider == "text-completion-openai"
+# or self.custom_llm_provider == "text-completion-codestral"
+# or self.custom_llm_provider == "azure_text"
+# or self.custom_llm_provider == "anthropic"
+# or self.custom_llm_provider == "anthropic_text"
+# or self.custom_llm_provider == "huggingface"
+# or self.custom_llm_provider == "ollama"
+# or self.custom_llm_provider == "ollama_chat"
+# or self.custom_llm_provider == "vertex_ai"
+# or self.custom_llm_provider == "vertex_ai_beta"
+# or self.custom_llm_provider == "sagemaker"
+# or self.custom_llm_provider == "sagemaker_chat"
+# or self.custom_llm_provider == "gemini"
+# or self.custom_llm_provider == "replicate"
+# or self.custom_llm_provider == "cached_response"
+# or self.custom_llm_provider == "predibase"
+# or self.custom_llm_provider == "databricks"
+# or self.custom_llm_provider == "bedrock"
+# or self.custom_llm_provider == "triton"
+# or self.custom_llm_provider == "watsonx"
+# or self.custom_llm_provider in litellm.openai_compatible_endpoints
+# or self.custom_llm_provider in litellm._custom_providers
+# ):
+# async for chunk in self.completion_stream:
+# if chunk == "None" or chunk is None:
+# raise Exception
+# elif (
+# self.custom_llm_provider == "gemini"
+# and hasattr(chunk, "parts")
+# and len(chunk.parts) == 0
+# ):
+# continue
+#                     # chunk_creator() does logging/stream chunk building. We need to let it know it's being called in an async function, so we don't double-add chunks.
+# # __anext__ also calls async_success_handler, which does logging
+# print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")
+
+# processed_chunk: Optional[ModelResponse] = self.chunk_creator(
+# chunk=chunk
+# )
+# print_verbose(
+# f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}"
+# )
+# if processed_chunk is None:
+# continue
+#                     ## LOGGING
+# executor.submit(
+# self.logging_obj.success_handler,
+# result=processed_chunk,
+# start_time=None,
+# end_time=None,
+# cache_hit=cache_hit,
+# )
+
+# asyncio.create_task(
+# self.logging_obj.async_success_handler(
+# processed_chunk, cache_hit=cache_hit
+# )
+# )
+
+# if self.logging_obj._llm_caching_handler is not None:
+# asyncio.create_task(
+# self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
+# processed_chunk=processed_chunk,
+# )
+# )
+
+# choice = processed_chunk.choices[0]
+# if isinstance(choice, StreamingChoices):
+# self.response_uptil_now += choice.delta.get("content", "") or ""
+# else:
+# self.response_uptil_now += ""
+# self.rules.post_call_rules(
+# input=self.response_uptil_now, model=self.model
+# )
+# self.chunks.append(processed_chunk)
+# if hasattr(
+# processed_chunk, "usage"
+# ): # remove usage from chunk, only send on final chunk
+# # Convert the object to a dictionary
+# obj_dict = processed_chunk.dict()
+
+# # Remove an attribute (e.g., 'attr2')
+# if "usage" in obj_dict:
+# del obj_dict["usage"]
+
+# # Create a new object without the removed attribute
+# processed_chunk = self.model_response_creator(chunk=obj_dict)
+# print_verbose(f"final returned processed chunk: {processed_chunk}")
+# return processed_chunk
+# raise StopAsyncIteration
+# else: # temporary patch for non-aiohttp async calls
+# # example - boto3 bedrock llms
+# while True:
+# if isinstance(self.completion_stream, str) or isinstance(
+# self.completion_stream, bytes
+# ):
+# chunk = self.completion_stream
+# else:
+# chunk = next(self.completion_stream)
+# if chunk is not None and chunk != b"":
+# print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
+# processed_chunk: Optional[ModelResponse] = self.chunk_creator(
+# chunk=chunk
+# )
+# print_verbose(
+# f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
+# )
+# if processed_chunk is None:
+# continue
+# ## LOGGING
+# threading.Thread(
+# target=self.logging_obj.success_handler,
+# args=(processed_chunk, None, None, cache_hit),
+# ).start() # log processed_chunk
+# asyncio.create_task(
+# self.logging_obj.async_success_handler(
+# processed_chunk, cache_hit=cache_hit
+# )
+# )
+
+# choice = processed_chunk.choices[0]
+# if isinstance(choice, StreamingChoices):
+# self.response_uptil_now += (
+# choice.delta.get("content", "") or ""
+# )
+# else:
+# self.response_uptil_now += ""
+# self.rules.post_call_rules(
+# input=self.response_uptil_now, model=self.model
+# )
+# # RETURN RESULT
+# self.chunks.append(processed_chunk)
+# return processed_chunk
+# except (StopAsyncIteration, StopIteration):
+# if self.sent_last_chunk is True:
+# # log the final chunk with accurate streaming values
+# complete_streaming_response = litellm.stream_chunk_builder(
+# chunks=self.chunks, messages=self.messages
+# )
+# response = self.model_response_creator()
+# if complete_streaming_response is not None:
+# setattr(
+# response,
+# "usage",
+# getattr(complete_streaming_response, "usage"),
+# )
+# ## LOGGING
+# threading.Thread(
+# target=self.logging_obj.success_handler,
+# args=(response, None, None, cache_hit),
+# ).start() # log response
+# asyncio.create_task(
+# self.logging_obj.async_success_handler(
+# response, cache_hit=cache_hit
+# )
+# )
+# if self.sent_stream_usage is False and self.send_stream_usage is True:
+# self.sent_stream_usage = True
+# return response
+#                 raise StopAsyncIteration  # Re-raise StopAsyncIteration
+# else:
+# self.sent_last_chunk = True
+# processed_chunk = self.finish_reason_handler()
+# ## LOGGING
+# threading.Thread(
+# target=self.logging_obj.success_handler,
+# args=(processed_chunk, None, None, cache_hit),
+# ).start() # log response
+# asyncio.create_task(
+# self.logging_obj.async_success_handler(
+# processed_chunk, cache_hit=cache_hit
+# )
+# )
+# return processed_chunk
+#         except httpx.TimeoutException as e:  # if an httpx read timeout error occurs
+# traceback_exception = traceback.format_exc()
+# ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT
+# traceback_exception += "\nLiteLLM Default Request Timeout - {}".format(
+# litellm.request_timeout
+# )
+# if self.logging_obj is not None:
+# ## LOGGING
+# threading.Thread(
+# target=self.logging_obj.failure_handler,
+# args=(e, traceback_exception),
+# ).start() # log response
+# # Handle any exceptions that might occur during streaming
+# asyncio.create_task(
+# self.logging_obj.async_failure_handler(e, traceback_exception)
+# )
+# raise e
+# except Exception as e:
+# traceback_exception = traceback.format_exc()
+# if self.logging_obj is not None:
+# ## LOGGING
+# threading.Thread(
+# target=self.logging_obj.failure_handler,
+# args=(e, traceback_exception),
+# ).start() # log response
+# # Handle any exceptions that might occur during streaming
+# asyncio.create_task(
+# self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore
+# )
+# ## Map to OpenAI Exception
+# raise exception_type(
+# model=self.model,
+# custom_llm_provider=self.custom_llm_provider,
+# original_exception=e,
+# completion_kwargs={},
+# extra_kwargs={},
+# )
class TextCompletionStreamWrapper:
@@ -8267,29 +8189,6 @@ def has_tool_call_blocks(messages: List[AllMessageValues]) -> bool:
return False
-def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict:
- openai_headers = {}
- processed_headers = {}
- additional_headers = {}
-
- for k, v in response_headers.items():
- if k in OPENAI_RESPONSE_HEADERS: # return openai-compatible headers
- openai_headers[k] = v
- if k.startswith(
- "llm_provider-"
- ): # return raw provider headers (incl. openai-compatible ones)
- processed_headers[k] = v
- else:
- additional_headers["{}-{}".format("llm_provider", k)] = v
-
- additional_headers = {
- **openai_headers,
- **processed_headers,
- **additional_headers,
- }
- return additional_headers
-
-
def add_dummy_tool(custom_llm_provider: str) -> List[ChatCompletionToolParam]:
"""
Prevent Anthropic from raising error when tool_use block exists but no tools are provided.
diff --git a/tests/local_testing/test_streaming.py b/tests/local_testing/test_streaming.py
index 827a2495b0b3..fcdc6b60d4f9 100644
--- a/tests/local_testing/test_streaming.py
+++ b/tests/local_testing/test_streaming.py
@@ -3470,6 +3470,86 @@ def test_unit_test_custom_stream_wrapper_repeating_chunk(
continue
+def test_unit_test_gemini_streaming_content_filter():
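+    """
+    Simulate a Gemini stream that reports finish_reason='content_filter' mid-stream
+    and assert the wrapper surfaces 'content_filter' as the final finish_reason.
+    """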
+ chunks = [
+ {
+ "text": "##",
+ "tool_use": None,
+ "is_finished": False,
+ "finish_reason": "stop",
+ "usage": {"prompt_tokens": 37, "completion_tokens": 1, "total_tokens": 38},
+ "index": 0,
+ },
+ {
+ "text": "",
+ "is_finished": False,
+ "finish_reason": "",
+ "usage": None,
+ "index": 0,
+ "tool_use": None,
+ },
+ {
+ "text": " Downsides of Prompt Hacking in a Customer Portal\n\nWhile prompt engineering can be incredibly",
+ "tool_use": None,
+ "is_finished": False,
+ "finish_reason": "stop",
+ "usage": {"prompt_tokens": 37, "completion_tokens": 17, "total_tokens": 54},
+ "index": 0,
+ },
+ {
+ "text": "",
+ "is_finished": False,
+ "finish_reason": "",
+ "usage": None,
+ "index": 0,
+ "tool_use": None,
+ },
+ {
+ "text": "",
+ "tool_use": None,
+ "is_finished": False,
+ "finish_reason": "content_filter",
+ "usage": {"prompt_tokens": 37, "completion_tokens": 17, "total_tokens": 54},
+ "index": 0,
+ },
+ {
+ "text": "",
+ "is_finished": False,
+ "finish_reason": "",
+ "usage": None,
+ "index": 0,
+ "tool_use": None,
+ },
+ ]
+
+ completion_stream = ModelResponseListIterator(model_responses=chunks)
+
+ response = litellm.CustomStreamWrapper(
+ completion_stream=completion_stream,
+ model="gemini/gemini-1.5-pro",
+ custom_llm_provider="gemini",
+ logging_obj=litellm.Logging(
+ model="gemini/gemini-1.5-pro",
+ messages=[{"role": "user", "content": "Hey"}],
+ stream=True,
+ call_type="completion",
+ start_time=time.time(),
+ litellm_call_id="12345",
+ function_id="1245",
+ ),
+ )
+
+ stream_finish_reason: Optional[str] = None
+ idx = 0
+ for chunk in response:
+ print(f"chunk: {chunk}")
+ if chunk.choices[0].finish_reason is not None:
+ stream_finish_reason = chunk.choices[0].finish_reason
+ idx += 1
+ print(f"num chunks: {idx}")
+ assert stream_finish_reason == "content_filter"
+
+
def test_unit_test_custom_stream_wrapper_openai():
"""
Test if last streaming chunk ends with '?', if the message repeats itself.