From 1c9bc2d1af2aee83056196b8b0953ebcff82f82f Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Mon, 4 Nov 2024 12:35:08 +0000
Subject: [PATCH 1/7] feat: add serviceinfo URI

---
 endpoints/OAI/router.py | 34 +++++++++++++++++++++++++++++++++-
 endpoints/server.py     |  2 +-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py
index b6a44c98..fe10a9d3 100644
--- a/endpoints/OAI/router.py
+++ b/endpoints/OAI/router.py
@@ -1,5 +1,6 @@
 import asyncio
 from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 from sys import maxsize
 
@@ -29,13 +30,19 @@
 api_name = "OAI"
 
 router = APIRouter()
+host = None
+port = None
+
 urls = {
     "Completions": "http://{host}:{port}/v1/completions",
     "Chat completions": "http://{host}:{port}/v1/chat/completions",
 }
 
 
-def setup():
+def setup(server_host: str = None, server_port: int = None):
+    global host, port
+    host = server_host
+    port = server_port
     return router
 
@@ -166,3 +173,28 @@ async def embeddings(request: Request, data: EmbeddingsRequest) -> EmbeddingsRes
     )
 
     return response
+
+@router.get("/.well-known/serviceinfo")
+async def service_info():
+    return JSONResponse(content={
+        "version": 0.1,
+        "software": {
+            "name": "TabbyAPI",
+            "repository": "https://github.com/theroyallab/tabbyAPI",
+            "homepage": "https://github.com/theroyallab/tabbyAPI",
+        },
+        "api": {
+            "openai": {
+                "name": "OpenAI API",
+                "base_url": f"http://{host}:{port}/v1",
+                "documentation": "https://theroyallab.github.io/tabbyAPI",
+                "version": 1
+            },
+            "koboldai": {
+                "name": "KoboldAI API",
+                "base_url": f"http://{host}:{port}/api",
+                "documentation": "https://theroyallab.github.io/tabbyAPI",
+                "version": 1
+            }
+        }
+    })
diff --git a/endpoints/server.py b/endpoints/server.py
index 3555a5b4..90bf6c48 100644
--- a/endpoints/server.py
+++ b/endpoints/server.py
@@ -52,7 +52,7 @@ def setup_app(host: Optional[str] = None, port: Optional[int] = None):
         selected_server = router_mapping.get(server.lower())
 
         if selected_server:
-            app.include_router(selected_server.setup())
+            app.include_router(selected_server.setup(host, port))
 
             logger.info(f"Starting {selected_server.api_name} API")
             for path, url in selected_server.urls.items():

From c9ff8ef2c2a6b9bf47698ba935f543ce336d1662 Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Mon, 4 Nov 2024 13:28:04 +0000
Subject: [PATCH 2/7] upgrade to v0.2

---
 endpoints/OAI/router.py | 13 ++++---------
 endpoints/server.py     |  2 +-
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py
index fe10a9d3..5bdd5b07 100644
--- a/endpoints/OAI/router.py
+++ b/endpoints/OAI/router.py
@@ -30,8 +30,6 @@
 api_name = "OAI"
 
 router = APIRouter()
-host = None
-port = None
 
 urls = {
     "Completions": "http://{host}:{port}/v1/completions",
@@ -39,10 +37,7 @@
 }
 
 
-def setup(server_host: str = None, server_port: int = None):
-    global host, port
-    host = server_host
-    port = server_port
+def setup():
     return router
 
@@ -177,22 +172,22 @@ async def embeddings(request: Request, data: EmbeddingsRequest) -> EmbeddingsRes
 @router.get("/.well-known/serviceinfo")
 async def service_info():
     return JSONResponse(content={
-        "version": 0.1,
+        "version": 0.2,
         "software": {
             "name": "TabbyAPI",
             "repository": "https://github.com/theroyallab/tabbyAPI",
             "homepage": "https://github.com/theroyallab/tabbyAPI",
         },
         "api": {
             "openai": {
                 "name": "OpenAI API",
-                "base_url": f"http://{host}:{port}/v1",
+                "relative_url": "/v1",
                 "documentation": "https://theroyallab.github.io/tabbyAPI",
                 "version": 1
             },
             "koboldai": {
                 "name": "KoboldAI API",
-                "base_url": f"http://{host}:{port}/api",
+                "relative_url": "/api",
                 "documentation": "https://theroyallab.github.io/tabbyAPI",
                 "version": 1
             }
diff --git a/endpoints/server.py b/endpoints/server.py
index 90bf6c48..3555a5b4 100644
--- a/endpoints/server.py
+++ b/endpoints/server.py
@@ -52,7 +52,7 @@ def setup_app(host: Optional[str] = None, port: Optional[int] = None):
         selected_server = router_mapping.get(server.lower())
 
         if selected_server:
-            app.include_router(selected_server.setup(host, port))
+            app.include_router(selected_server.setup())
 
             logger.info(f"Starting {selected_server.api_name} API")
             for path, url in selected_server.urls.items():

From a52610fb19ca32c39fa431559e674ea50c803568 Mon Sep 17 00:00:00 2001
From: randoentity
Date: Sun, 24 Nov 2024 13:40:33 +0100
Subject: [PATCH 3/7] workaround for tool calling

---
 endpoints/OAI/utils/chat_completion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 14a2243d..9fe9aa87 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,11 +454,11 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, _ = await apply_chat_template(data, gen["text"])
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, _ = await apply_chat_template(data, current_generations)
 
             gen_tasks.append(
                 asyncio.create_task(

From 5fadaa728a181c163733c4d37fa74068347e9958 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Thu, 28 Nov 2024 23:07:58 -0500
Subject: [PATCH 4/7] API: Move serviceinfo to core

Best to expose this endpoint to all APIs as it's an information
endpoint.
Signed-off-by: kingbri
---
 endpoints/OAI/router.py  | 27 ---------------------------
 endpoints/core/router.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py
index 5bdd5b07..b6a44c98 100644
--- a/endpoints/OAI/router.py
+++ b/endpoints/OAI/router.py
@@ -1,6 +1,5 @@
 import asyncio
 from fastapi import APIRouter, Depends, HTTPException, Request
-from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 from sys import maxsize
 
@@ -30,7 +29,6 @@
 api_name = "OAI"
 
 router = APIRouter()
-
 urls = {
     "Completions": "http://{host}:{port}/v1/completions",
     "Chat completions": "http://{host}:{port}/v1/chat/completions",
@@ -168,28 +166,3 @@ async def embeddings(request: Request, data: EmbeddingsRequest) -> EmbeddingsRes
     )
 
     return response
-
-@router.get("/.well-known/serviceinfo")
-async def service_info():
-    return JSONResponse(content={
-        "version": 0.2,
-        "software": {
-            "name": "TabbyAPI",
-            "repository": "https://github.com/theroyallab/tabbyAPI",
-            "homepage": "https://github.com/theroyallab/tabbyAPI",
-        },
-        "api": {
-            "openai": {
-                "name": "OpenAI API",
-                "relative_url": "/v1",
-                "documentation": "https://theroyallab.github.io/tabbyAPI",
-                "version": 1
-            },
-            "koboldai": {
-                "name": "KoboldAI API",
-                "relative_url": "/api",
-                "documentation": "https://theroyallab.github.io/tabbyAPI",
-                "version": 1
-            }
-        }
-    })
diff --git a/endpoints/core/router.py b/endpoints/core/router.py
index f2b42473..017738b7 100644
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -2,6 +2,7 @@
 import pathlib
 from sys import maxsize
 from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 
 from common import model, sampling
@@ -61,6 +62,34 @@ async def healthcheck(response: Response) -> HealthCheckResponse:
     )
 
 
+@router.get("/.well-known/serviceinfo")
+async def service_info():
+    return JSONResponse(
+        content={
+            "version": 0.2,
+            "software": {
+                "name": "TabbyAPI",
+                "repository": "https://github.com/theroyallab/tabbyAPI",
+                "homepage": "https://github.com/theroyallab/tabbyAPI",
+            },
+            "api": {
+                "openai": {
+                    "name": "OpenAI API",
+                    "relative_url": "/v1",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+                "koboldai": {
+                    "name": "KoboldAI API",
+                    "relative_url": "/api",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+            },
+        }
+    )
+
+
 # Model list endpoint
 @router.get("/v1/models", dependencies=[Depends(check_api_key)])
 @router.get("/v1/model/list", dependencies=[Depends(check_api_key)])

From 2e06fb01d3703e71075ade2a5c202309f68d52ad Mon Sep 17 00:00:00 2001
From: kingbri
Date: Thu, 28 Nov 2024 23:27:59 -0500
Subject: [PATCH 5/7] OAI: Pass mm_embeddings to tool call generation

Don't exclude the vision embeddings when regenerating for a tool
call.
Signed-off-by: kingbri
---
 endpoints/OAI/utils/chat_completion.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 9fe9aa87..a646924c 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,16 +454,23 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt, _ = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, gen["text"]
+                )
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt, _ = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, current_generations
+                )
 
             gen_tasks.append(
                 asyncio.create_task(
                     model.container.generate(
-                        pre_tool_prompt, request.state.id, **gen_params
+                        pre_tool_prompt,
+                        request.state.id,
+                        embeddings=mm_embeddings,
+                        **gen_params,
                     )
                 )
             )

From ca86ab54776ed4648d3a482b2722770f16fc95ed Mon Sep 17 00:00:00 2001
From: kingbri <8082010+bdashore3@users.noreply.github.com>
Date: Tue, 3 Dec 2024 22:37:03 -0500
Subject: [PATCH 6/7] Dependencies: Remove CUDA 11.8

Most software has moved to CUDA 12 and cards that aren't supported by
11.8 don't use tabby anyways.

Signed-off-by: kingbri <8082010+bdashore3@users.noreply.github.com>
---
 pyproject.toml | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index de782b73..0694ddfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,28 +87,6 @@ cu121 = [
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
-cu118 = [
-    # Torch
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-]
 amd = [
     # Torch triton for ROCm
     "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",

From ac85e34356079af017ce4e85a7d431ba4a37bef8 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+bdashore3@users.noreply.github.com>
Date: Tue, 3 Dec 2024 22:57:00 -0500
Subject: [PATCH 7/7] Dependencies: Update Torch, FA2, and Exl2

Torch: 2.5, FA2 2.7.0.post2, Exl2 v0.2.5

Don't update torch for rocm as exl2 isn't built for rocm 6.2

Signed-off-by: kingbri <8082010+bdashore3@users.noreply.github.com>
---
 pyproject.toml | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0694ddfa..89835990 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,30 +62,30 @@ dev = [
 ]
 cu121 = [
     # Torch (Extra index URLs not support in pyproject.toml)
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
 
     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
 amd = [
     # Torch triton for ROCm
     "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
@@ -99,9 +99,9 @@ amd = [
     "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]
 
 # MARK: Ruff options
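
---

A minimal client-side sketch of how the serviceinfo document added in this series is meant to be consumed. The host, port, and `requests` dependency below are illustrative assumptions, not part of the patches; the JSON keys come from the v0.2 schema used in patch 4:

    import requests

    # Assumed local TabbyAPI instance; adjust host/port for your deployment.
    BASE_URL = "http://127.0.0.1:5000"

    # Fetch the discovery document from the well-known URI added in this series.
    info = requests.get(f"{BASE_URL}/.well-known/serviceinfo").json()

    # As of v0.2 each API advertises a relative_url, which the client resolves
    # against whatever base URL it already uses to reach the server.
    for api in info["api"].values():
        print(f"{api['name']}: {BASE_URL}{api['relative_url']} (version {api['version']})")

Resolving `relative_url` on the client side is what let patch 2 drop the `host`/`port` globals: the server no longer has to know its own externally visible address.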