diff --git a/models/all-minilm-L6-v2-q5_k_m.gguf b/models/all-minilm-L6-v2-q5_k_m.gguf
deleted file mode 100644
index 99b1ef37..00000000
--- a/models/all-minilm-L6-v2-q5_k_m.gguf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:60c7e141495321c7d303ec5ccc79296cfeb044263af840c583fed695d423aee8
-size 21717952
diff --git a/models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf b/models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf
deleted file mode 100644
index b1c22cee..00000000
--- a/models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f15d02e8e51a5c6f9448b972819acfa66aee4d8bb7d881a8b8ba3d90da08ef09
-size 1285494336
diff --git a/tests/conftest.py b/tests/conftest.py
index 23def424..afbb5956 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,7 +11,6 @@ import yaml
 
 from codegate.config import Config
-from codegate.inference import LlamaCppInferenceEngine
 
 
 @pytest.fixture
@@ -95,8 +94,3 @@ def parse_json_log(log_line: str) -> dict[str, Any]:
         return json.loads(log_line)
     except json.JSONDecodeError as e:
         pytest.fail(f"Invalid JSON log line: {e}")
-
-
-@pytest.fixture
-def inference_engine() -> LlamaCppInferenceEngine:
-    return LlamaCppInferenceEngine()
diff --git a/tests/test_inference.py b/tests/test_inference.py
deleted file mode 100644
index 9cfe5d14..00000000
--- a/tests/test_inference.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import pytest
-
-
-@pytest.mark.asyncio
-async def test_generate(inference_engine) -> None:
-    """Test code generation."""
-
-    completion_request = {
-        "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
-        "max_tokens": 4096,
-        "temperature": 0,
-        "stream": True,
-        "stop": [
-            "<|endoftext|>",
-            "<|fim_prefix|>",
-            "<|fim_middle|>",
-            "<|fim_suffix|>",
-            "<|fim_pad|>",
-            "<|repo_name|>",
-            "<|file_sep|>",
-            "<|im_start|>",
-            "<|im_end|>",
-            "/src/",
-            "#- coding: utf-8",
-            "```",
-        ],
-        "prompt": "<|fim_prefix|>\\n# codegate/test.py\\nimport requests\\n\\ndef call_api(url):\\n"
-        " <|fim_suffix|>\\n\\n\\n\\nresponse = call_api('http://localhost/test')"
-        "\\nprint(response)<|fim_middle|>",
-    }
-    model_path = f"./models/{completion_request['model']}.gguf"
-    response = await inference_engine.complete(model_path, **completion_request)
-
-    for chunk in response:
-        assert chunk["choices"][0]["text"] is not None
-
-
-@pytest.mark.asyncio
-async def test_chat(inference_engine) -> None:
-    """Test chat completion."""
-
-    chat_request = {
-        "messages": [{"role": "user", "content": "hello"}],
-        "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
-        "max_tokens": 4096,
-        "temperature": 0,
-        "stream": True,
-    }
-
-    model_path = f"./models/{chat_request['model']}.gguf"
-    response = await inference_engine.chat(model_path, **chat_request)
-
-    for chunk in response:
-        assert "delta" in chunk["choices"][0]
-
-
-@pytest.mark.asyncio
-async def test_embed(inference_engine) -> None:
-    """Test content embedding."""
-
-    content = "Can I use invokehttp package in my project?"
-    model_path = "./models/all-minilm-L6-v2-q5_k_m.gguf"
-    vector = await inference_engine.embed(model_path, content=content)
-    assert len(vector) == 384