diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 14a2243..a646924 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,16 +454,23 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, gen["text"]
+                )
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, current_generations
+                )
 
             gen_tasks.append(
                 asyncio.create_task(
                     model.container.generate(
-                        pre_tool_prompt, request.state.id, **gen_params
+                        pre_tool_prompt,
+                        request.state.id,
+                        embeddings=mm_embeddings,
+                        **gen_params,
                     )
                 )
             )
diff --git a/endpoints/core/router.py b/endpoints/core/router.py
index ccb26d9..6b48182 100644
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -4,6 +4,7 @@
 from typing import Optional
 from common.multimodal import MultimodalEmbeddingWrapper
 from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 
 from common import model, sampling
@@ -65,6 +66,34 @@ async def healthcheck(response: Response) -> HealthCheckResponse:
     )
 
 
+@router.get("/.well-known/serviceinfo")
+async def service_info():
+    return JSONResponse(
+        content={
+            "version": 0.1,
+            "software": {
+                "name": "TabbyAPI",
+                "repository": "https://github.com/theroyallab/tabbyAPI",
+                "homepage": "https://github.com/theroyallab/tabbyAPI",
+            },
+            "api": {
+                "openai": {
+                    "name": "OpenAI API",
+                    "relative_url": "/v1",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+                "koboldai": {
+                    "name": "KoboldAI API",
+                    "relative_url": "/api",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+            },
+        }
+    )
+
+
 # Model list endpoint
 @router.get("/v1/models", dependencies=[Depends(check_api_key)])
 @router.get("/v1/model/list", dependencies=[Depends(check_api_key)])
diff --git a/pyproject.toml b/pyproject.toml
index b27f453..6f23bd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,52 +62,30 @@ dev = [
 ]
 cu121 = [
     # Torch (Extra index URLs not support in pyproject.toml)
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
 
     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-]
-cu118 = [
-    # Torch
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
 amd = [
     # Torch triton for ROCm
@@ -121,9 +99,9 @@ amd = [
     "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]
 
 # MARK: Ruff options