From 1c9bc2d1af2aee83056196b8b0953ebcff82f82f Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Mon, 4 Nov 2024 12:35:08 +0000
Subject: [PATCH 1/7] feat: add serviceinfo URI

---
 endpoints/OAI/router.py | 34 +++++++++++++++++++++++++++++++++-
 endpoints/server.py     |  2 +-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py
index b6a44c98..fe10a9d3 100644
--- a/endpoints/OAI/router.py
+++ b/endpoints/OAI/router.py
@@ -1,5 +1,6 @@
 import asyncio
 from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 from sys import maxsize
 
@@ -29,13 +30,19 @@
 api_name = "OAI"
 
 router = APIRouter()
+host = None
+port = None
+
 urls = {
     "Completions": "http://{host}:{port}/v1/completions",
     "Chat completions": "http://{host}:{port}/v1/chat/completions",
 }
 
 
-def setup():
+def setup(server_host: str = None, server_port: int = None):
+    global host, port
+    host = server_host
+    port = server_port
     return router
 
@@ -166,3 +173,28 @@ async def embeddings(request: Request, data: EmbeddingsRequest) -> EmbeddingsRes
     )
 
     return response
+
+@router.get("/.well-known/serviceinfo")
+async def service_info():
+    return JSONResponse(content={
+        "version": 0.1,
+        "software": {
+            "name": "TabbyAPI",
+            "repository": "https://github.com/theroyallab/tabbyAPI",
+            "homepage": "https://github.com/theroyallab/tabbyAPI",
+        },
+        "api": {
+            "openai": {
+                "name": "OpenAI API",
+                "base_url": f"http://{host}:{port}/v1",
+                "documentation": "https://theroyallab.github.io/tabbyAPI",
+                "version": 1
+            },
+            "koboldai": {
+                "name": "KoboldAI API",
+                "base_url": f"http://{host}:{port}/api",
+                "documentation": "https://theroyallab.github.io/tabbyAPI",
+                "version": 1
+            }
+        }
+    })
diff --git a/endpoints/server.py b/endpoints/server.py
index 3555a5b4..90bf6c48 100644
--- a/endpoints/server.py
+++ b/endpoints/server.py
@@ -52,7 +52,7 @@ def setup_app(host: Optional[str] = None, port: Optional[int] = None):
         selected_server = router_mapping.get(server.lower())
 
         if selected_server:
-            app.include_router(selected_server.setup())
+            app.include_router(selected_server.setup(host, port))
 
             logger.info(f"Starting {selected_server.api_name} API")
             for path, url in selected_server.urls.items():

From c9ff8ef2c2a6b9bf47698ba935f543ce336d1662 Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Mon, 4 Nov 2024 13:28:04 +0000
Subject: [PATCH 2/7] upgrade to v0.2

---
 endpoints/OAI/router.py | 13 ++++---------
 endpoints/server.py     |  2 +-
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py
index fe10a9d3..5bdd5b07 100644
--- a/endpoints/OAI/router.py
+++ b/endpoints/OAI/router.py
@@ -30,8 +30,6 @@
 api_name = "OAI"
 
 router = APIRouter()
-host = None
-port = None
 
 urls = {
     "Completions": "http://{host}:{port}/v1/completions",
@@ -39,10 +37,7 @@
 }
 
 
-def setup(server_host: str = None, server_port: int = None):
-    global host, port
-    host = server_host
-    port = server_port
+def setup():
     return router
 
@@ -177,22 +172,22 @@ async def embeddings(request: Request, data: EmbeddingsRequest) -> EmbeddingsRes
 @router.get("/.well-known/serviceinfo")
 async def service_info():
     return JSONResponse(content={
-        "version": 0.1,
+        "version": 0.2,
         "software": {
             "name": "TabbyAPI",
             "repository": "https://github.com/theroyallab/tabbyAPI",
             "homepage": "https://github.com/theroyallab/tabbyAPI",
         },
         "api": {
             "openai": {
                 "name": "OpenAI API",
-                "base_url": f"http://{host}:{port}/v1",
+                "relative_url": "/v1",
                 "documentation": "https://theroyallab.github.io/tabbyAPI",
                 "version": 1
             },
             "koboldai": {
                 "name": "KoboldAI API",
-                "base_url": f"http://{host}:{port}/api",
+                "relative_url": "/api",
                 "documentation": "https://theroyallab.github.io/tabbyAPI",
                 "version": 1
             }
diff --git a/endpoints/server.py b/endpoints/server.py
index 90bf6c48..3555a5b4 100644
--- a/endpoints/server.py
+++ b/endpoints/server.py
@@ -52,7 +52,7 @@ def setup_app(host: Optional[str] = None, port: Optional[int] = None):
         selected_server = router_mapping.get(server.lower())
 
         if selected_server:
-            app.include_router(selected_server.setup(host, port))
+            app.include_router(selected_server.setup())
 
             logger.info(f"Starting {selected_server.api_name} API")
             for path, url in selected_server.urls.items():

From a52610fb19ca32c39fa431559e674ea50c803568 Mon Sep 17 00:00:00 2001
From: randoentity
Date: Sun, 24 Nov 2024 13:40:33 +0100
Subject: [PATCH 3/7] workaround for tool calling

---
 endpoints/OAI/utils/chat_completion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 14a2243d..9fe9aa87 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,11 +454,11 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, _ = await apply_chat_template(data, gen["text"])
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, _ = await apply_chat_template(data, current_generations)
 
             gen_tasks.append(
                 asyncio.create_task(

From 5fadaa728a181c163733c4d37fa74068347e9958 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Thu, 28 Nov 2024 23:07:58 -0500
Subject: [PATCH 4/7] API: Move serviceinfo to core

Best to expose this endpoint to all APIs as it's an information
endpoint.
Signed-off-by: kingbri
---
 endpoints/OAI/router.py  | 27 ---------------------------
 endpoints/core/router.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py
index 5bdd5b07..b6a44c98 100644
--- a/endpoints/OAI/router.py
+++ b/endpoints/OAI/router.py
@@ -1,6 +1,5 @@
 import asyncio
 from fastapi import APIRouter, Depends, HTTPException, Request
-from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 from sys import maxsize
 
@@ -30,7 +29,6 @@
 api_name = "OAI"
 
 router = APIRouter()
-
 urls = {
     "Completions": "http://{host}:{port}/v1/completions",
     "Chat completions": "http://{host}:{port}/v1/chat/completions",
@@ -168,28 +166,3 @@ async def embeddings(request: Request, data: EmbeddingsRequest) -> EmbeddingsRes
     )
 
     return response
-
-@router.get("/.well-known/serviceinfo")
-async def service_info():
-    return JSONResponse(content={
-        "version": 0.2,
-        "software": {
-            "name": "TabbyAPI",
-            "repository": "https://github.com/theroyallab/tabbyAPI",
-            "homepage": "https://github.com/theroyallab/tabbyAPI",
-        },
-        "api": {
-            "openai": {
-                "name": "OpenAI API",
-                "relative_url": "/v1",
-                "documentation": "https://theroyallab.github.io/tabbyAPI",
-                "version": 1
-            },
-            "koboldai": {
-                "name": "KoboldAI API",
-                "relative_url": "/api",
-                "documentation": "https://theroyallab.github.io/tabbyAPI",
-                "version": 1
-            }
-        }
-    })
diff --git a/endpoints/core/router.py b/endpoints/core/router.py
index f2b42473..017738b7 100644
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -2,6 +2,7 @@
 import pathlib
 from sys import maxsize
 from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse
 
 from common import model, sampling
@@ -61,6 +62,34 @@ async def healthcheck(response: Response) -> HealthCheckResponse:
     )
 
 
+@router.get("/.well-known/serviceinfo")
+async def service_info():
+    return JSONResponse(
+        content={
+            "version": 0.2,
+            "software": {
+                "name": "TabbyAPI",
+                "repository": "https://github.com/theroyallab/tabbyAPI",
+                "homepage": "https://github.com/theroyallab/tabbyAPI",
+            },
+            "api": {
+                "openai": {
+                    "name": "OpenAI API",
+                    "relative_url": "/v1",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+                "koboldai": {
+                    "name": "KoboldAI API",
+                    "relative_url": "/api",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+            },
+        }
+    )
+
+
 # Model list endpoint
 @router.get("/v1/models", dependencies=[Depends(check_api_key)])
 @router.get("/v1/model/list", dependencies=[Depends(check_api_key)])

From 2e06fb01d3703e71075ade2a5c202309f68d52ad Mon Sep 17 00:00:00 2001
From: kingbri
Date: Thu, 28 Nov 2024 23:27:59 -0500
Subject: [PATCH 5/7] OAI: Pass mm_embeddings to tool call generation

Don't exclude the vision embeddings when regenerating for a tool
call.
Signed-off-by: kingbri
---
 endpoints/OAI/utils/chat_completion.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py
index 9fe9aa87..a646924c 100644
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,16 +454,23 @@ async def generate_tool_calls(
         if gen["stop_str"] in tool_data.tool_call_start:
             if "text" in gen:
                 # non streaming, all generations will have the text they generated
-                pre_tool_prompt, _ = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, gen["text"]
+                )
             elif current_generations is not None:
                 # streaming, we wont have text in the generation,
                 # we'll have to use the current_generations
-                pre_tool_prompt, _ = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, current_generations
+                )
 
             gen_tasks.append(
                 asyncio.create_task(
                     model.container.generate(
-                        pre_tool_prompt, request.state.id, **gen_params
+                        pre_tool_prompt,
+                        request.state.id,
+                        embeddings=mm_embeddings,
+                        **gen_params,
                     )
                 )
             )

From ca86ab54776ed4648d3a482b2722770f16fc95ed Mon Sep 17 00:00:00 2001
From: kingbri <8082010+bdashore3@users.noreply.github.com>
Date: Tue, 3 Dec 2024 22:37:03 -0500
Subject: [PATCH 6/7] Dependencies: Remove CUDA 11.8

Most software has moved to CUDA 12 and cards that aren't supported by
11.8 don't use tabby anyways.

Signed-off-by: kingbri <8082010+bdashore3@users.noreply.github.com>
---
 pyproject.toml | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index de782b73..0694ddfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,28 +87,6 @@ cu121 = [
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
-cu118 = [
-    # Torch
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-]
 amd = [
     # Torch triton for ROCm
     "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",

From ac85e34356079af017ce4e85a7d431ba4a37bef8 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+bdashore3@users.noreply.github.com>
Date: Tue, 3 Dec 2024 22:57:00 -0500
Subject: [PATCH 7/7] Dependencies: Update Torch, FA2, and Exl2

Torch: 2.5, FA2 2.7.0.post2, Exl2 v0.2.5

Don't update torch for rocm as exl2 isn't built for rocm 6.2

Signed-off-by: kingbri <8082010+bdashore3@users.noreply.github.com>
---
 pyproject.toml | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0694ddfa..89835990 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,30 +62,30 @@ dev = [
 ]
 cu121 = [
     # Torch (Extra index URLs not support in pyproject.toml)
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
 
     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
 amd = [
     # Torch triton for ROCm
     "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
@@ -99,9 +99,9 @@ amd = [
     "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]
 
 # MARK: Ruff options
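
---

A minimal client-side sketch of how the serviceinfo document added in this series is meant to be consumed. The host, port, and `requests` dependency below are illustrative assumptions, not part of the patches; the JSON keys come from the v0.2 schema used in patch 4:

    import requests

    # Assumed local TabbyAPI instance; adjust host/port for your deployment.
    BASE_URL = "http://127.0.0.1:5000"

    # Fetch the discovery document from the well-known URI added in this series.
    info = requests.get(f"{BASE_URL}/.well-known/serviceinfo").json()

    # As of v0.2 each API advertises a relative_url, which the client resolves
    # against whatever base URL it already uses to reach the server.
    for api in info["api"].values():
        print(f"{api['name']}: {BASE_URL}{api['relative_url']} (version {api['version']})")

Resolving `relative_url` on the client side is what let patch 2 drop the `host`/`port` globals: the server no longer has to know its own externally visible address.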