From 1d900b365b1304b004ccf7c0d942b950dd788aa4 Mon Sep 17 00:00:00 2001
From: vansangpfiev <sang@jan.ai>
Date: Fri, 27 Dec 2024 06:39:47 +0700
Subject: [PATCH] fix: CI

---
 .../scripts/e2e-test-server-linux-and-mac.sh  |   0
 .github/scripts/e2e-test-server-windows.bat   |   4 +
 .../workflows/template-quality-gate-pr.yml    | 410 +++++++++---------
 Makefile                                      |   8 +-
 src/llama_engine.cc                           |  84 +++-
 src/llama_utils.h                             |   6 +-
 6 files changed, 288 insertions(+), 224 deletions(-)
 mode change 100644 => 100755 .github/scripts/e2e-test-server-linux-and-mac.sh

diff --git a/.github/scripts/e2e-test-server-linux-and-mac.sh b/.github/scripts/e2e-test-server-linux-and-mac.sh
old mode 100644
new mode 100755
diff --git a/.github/scripts/e2e-test-server-windows.bat b/.github/scripts/e2e-test-server-windows.bat
index d1bf785..7f10492 100644
--- a/.github/scripts/e2e-test-server-windows.bat
+++ b/.github/scripts/e2e-test-server-windows.bat
@@ -69,6 +69,7 @@ set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
 set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
 set "curl_data5={}"
 set "curl_data6={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
+set "curl_data7={\"model\": \"test-embedding\"}"
 
 rem Print the values of curl_data for debugging
 echo curl_data1=%curl_data1%
@@ -77,6 +78,7 @@ echo curl_data3=%curl_data3%
 echo curl_data4=%curl_data4%
 echo curl_data5=%curl_data5%
 echo curl_data6=%curl_data6%
+echo curl_data7=%curl_data7%
 
 rem Run the curl commands and capture the status code
 curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
@@ -95,6 +97,8 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response6.log" -s -w "%%{http_code}" --
 --header "Content-Type: application/json" ^
 --data "%curl_data6%" > %TEMP%\response6.log 2>&1
 
+curl.exe --connect-timeout 60 -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data7%" 2>&1
+
 set "error_occurred=0"
 
 rem Read the status codes from the log files
diff --git a/.github/workflows/template-quality-gate-pr.yml b/.github/workflows/template-quality-gate-pr.yml
index 93e055d..d3910d7 100644
--- a/.github/workflows/template-quality-gate-pr.yml
+++ b/.github/workflows/template-quality-gate-pr.yml
@@ -28,122 +28,122 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - os: "linux"
-            name: "arm64"
-            runs-on: "ubuntu-2004-arm64"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: true
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "arm64"
+          #   runs-on: "ubuntu-2004-arm64"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: true
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
           - os: "linux"
             name: "amd64-avx2"
             runs-on: "ubuntu-20-04"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
             run-e2e: true
             vulkan: false
             ccache: true
             ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-noavx"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx512"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-vulkan"
-            runs-on: "ubuntu-22-04"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: true
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-noavx-cuda-11-7"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx2-cuda-11-7"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx-cuda-11-7"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx512-cuda-11-7"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-noavx-cuda-12-0"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx2-cuda-12-0"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx-cuda-12-0"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "amd64-avx512-cuda-12-0"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-noavx"
+          #   runs-on: "ubuntu-20-04"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx"
+          #   runs-on: "ubuntu-20-04"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx512"
+          #   runs-on: "ubuntu-20-04"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-vulkan"
+          #   runs-on: "ubuntu-22-04"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-noavx-cuda-11-7"
+          #   runs-on: "ubuntu-20-04-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx2-cuda-11-7"
+          #   runs-on: "ubuntu-20-04-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx-cuda-11-7"
+          #   runs-on: "ubuntu-20-04-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx512-cuda-11-7"
+          #   runs-on: "ubuntu-20-04-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-noavx-cuda-12-0"
+          #   runs-on: "ubuntu-20-04-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx2-cuda-12-0"
+          #   runs-on: "ubuntu-20-04-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx-cuda-12-0"
+          #   runs-on: "ubuntu-20-04-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "amd64-avx512-cuda-12-0"
+          #   runs-on: "ubuntu-20-04-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
           - os: "mac"
             name: "amd64"
             runs-on: "macos-selfhosted-12"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF"
             run-e2e: true
             vulkan: false
             ccache: false
@@ -151,7 +151,7 @@ jobs:
           - os: "mac"
             name: "arm64"
             runs-on: "macos-selfhosted-12-arm64"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON"
             run-e2e: true
             vulkan: false
             ccache: false
@@ -159,107 +159,107 @@ jobs:
           - os: "windows"
             name: "amd64-avx2"
             runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: true
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-noavx"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
             run-e2e: true
             vulkan: false
             ccache: false
             ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx512"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-vulkan"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: true
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-noavx-cuda-12-0"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx2-cuda-12-0"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx-cuda-12-0"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx512-cuda-12-0"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-noavx-cuda-11-7"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx2-cuda-11-7"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx-cuda-11-7"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "windows"
-            name: "amd64-avx512-cuda-11-7"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-noavx"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: true
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx512"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-vulkan"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-noavx-cuda-12-0"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx2-cuda-12-0"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx-cuda-12-0"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx512-cuda-12-0"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-noavx-cuda-11-7"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx2-cuda-11-7"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx-cuda-11-7"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "windows"
+          #   name: "amd64-avx512-cuda-11-7"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: true
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
 
     steps:
       - name: Clone
@@ -271,7 +271,7 @@ jobs:
       - name: Apply patch file
         run: |
           cd llama.cpp
-          git apply ../patches/0001-Add-API-query-buffer-size.patch
+          git apply ../patches/0002-Build-llama-cpp-examples.patch
 
       - name: use python for linux
         continue-on-error: true
diff --git a/Makefile b/Makefile
index a91b73d..7db679e 100644
--- a/Makefile
+++ b/Makefile
@@ -97,16 +97,18 @@ ifeq ($(RUN_TESTS),false)
 	@exit 0
 endif
 ifeq ($(OS),Windows_NT)
-	@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
+	@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
 else ifeq ($(shell uname -s),Linux)
 	@mkdir -p examples/server/build/engines/cortex.llamacpp; \
 	cd examples/server/build/; \
 	cp ../../../build/libengine.so engines/cortex.llamacpp/; \
+	cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
 	chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
 else
 	@mkdir -p examples/server/build/engines/cortex.llamacpp; \
 	cd examples/server/build/; \
 	cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \
+	cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
 	chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
 endif
 
@@ -118,13 +120,14 @@ endif
 ifeq ($(OS),Windows_NT)
 	@powershell -Command "python -m pip install --upgrade pip"
 	@powershell -Command "python -m pip install requests;"
-	@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
+	@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
 else ifeq ($(shell uname -s),Linux)
 	python -m pip install --upgrade pip;
 	python -m pip install requests;
 	@mkdir -p examples/server/build/engines/cortex.llamacpp; \
 	cd examples/server/build/; \
 	cp ../../../build/libengine.so engines/cortex.llamacpp/; \
+	cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
 	python  ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
 else
 	python -m pip install --upgrade pip;
@@ -132,6 +135,7 @@ else
 	@mkdir -p examples/server/build/engines/cortex.llamacpp; \
 	cd examples/server/build/; \
 	cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \
+	cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
 	python  ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
 endif
 
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index c3af7c3..e2571a2 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -634,8 +634,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
 
   // Spawn llama.cpp server only if it is chat model
   if (!json_body->isMember("mmproj")) {
-    SpawnLlamaServer(*json_body);
-    return true;
+    return SpawnLlamaServer(*json_body);
   }
   common_params params;
   std::string model_type;
@@ -1388,11 +1387,24 @@ bool LlamaEngine::HasForceStopInferenceModel(const std::string& id) const {
 }
 
 bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) {
+  // Poll GET /health on host:port: 10 tries, 1s sleep after each failed one.
+  auto wait_for_server_up = [](const std::string& host, int port) {
+    for (size_t attempt = 0; attempt < 10; ++attempt) {
+      httplib::Client client(host + ":" + std::to_string(port));
+      auto health = client.Get("/health");
+      if (health && health->status == httplib::StatusCode::OK_200)
+        return true;
+      // Not healthy yet (no response or non-200): log the attempt and retry.
+      LOG_INFO << "Wait for server up: " << attempt;
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+    }
+    return false;
+  };
+
   // TODO(sang) clean up resources if any errors
   LOG_DEBUG << "Start to spawn llama-server";
-  std::string model;
-  if (auto o = json_params["model"]; !o.isNull()) {
-    model = o.asString();
+  auto model = llama_utils::GetModelId(json_params);
+  if (!model.empty()) {
     llama_server_map_[model].host = "127.0.0.1";
     llama_server_map_[model].port =
         llama_utils::GenerateRandomInteger(39400, 39999);
@@ -1440,6 +1452,8 @@ bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) {
     std::cout << "Could not start server: " << GetLastError() << std::endl;
     return false;
   } else {
+    if (!wait_for_server_up(s.host, s.port))
+      return false;
     std::cout << "Server started" << std::endl;
   }
 #else
@@ -1483,6 +1497,8 @@ bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) {
     execv(p.c_str(), exec_args.data());
   } else {
     // Parent process
+    if (!wait_for_server_up(s.host, s.port))
+      return false;
     std::cout << "Server started" << std::endl;
   }
 #endif
@@ -1494,13 +1510,28 @@ std::string LlamaEngine::ConvertJsonToParams(const Json::Value& root) {
   std::string errors;
 
   for (const auto& member : root.getMemberNames()) {
-    if (member == "model_path") {
+    if (member == "model_path" || member == "llama_model_path") {
       ss << "--model" << " ";
       ss << "\"" << root[member].asString() << "\" ";
       continue;
-    } else if (member == "model") {
+    } else if (member == "model" || member == "model_alias" ||
+               member == "embedding") {
+      continue;
+    } else if (member == "ctx_len") {
+      ss << "--ctx-size" << " ";
+      ss << "\"" << std::to_string(root[member].asInt()) << "\" ";
+      continue;
+    } else if (member == "ngl") {
+      ss << "-ngl" << " ";
+      ss << "\"" << std::to_string(root[member].asInt()) << "\" ";
+      continue;
+    } else if (member == "model_type") {
+      if (root[member].asString() == "embedding") {
+        ss << "--embedding" << " ";
+      }
       continue;
     }
+
     ss << "--" << member << " ";
     if (root[member].isString()) {
       ss << "\"" << root[member].asString() << "\" ";
@@ -1529,11 +1560,25 @@ std::vector<std::string> LlamaEngine::ConvertJsonToParamsVector(
   std::string errors;
 
   for (const auto& member : root.getMemberNames()) {
-    if (member == "model_path") {
+    if (member == "model_path" || member == "llama_model_path") {
       res.push_back("--model");
       res.push_back(root[member].asString());
       continue;
-    } else if (member == "model") {
+    } else if (member == "model" || member == "model_alias" ||
+               member == "embedding") {
+      continue;
+    } else if (member == "ctx_len") {
+      res.push_back("--ctx-size");
+      res.push_back(std::to_string(root[member].asInt()));
+      continue;
+    } else if (member == "ngl") {
+      res.push_back("-ngl");
+      res.push_back(std::to_string(root[member].asInt()));
+      continue;
+    } else if (member == "model_type") {
+      if (root[member].asString() == "embedding") {
+        res.push_back("--embedding");
+      }
       continue;
     }
 
@@ -1584,18 +1629,25 @@ bool LlamaEngine::HandleLlamaCppChatCompletion(
           req.content_receiver = [cb](const char* data, size_t data_length,
                                       uint64_t offset, uint64_t total_length) {
             std::string s(data, data_length);
-            Json::Value respData;
-            respData["data"] = s;
+            Json::Value resp_data;
             Json::Value status;
+
+            if (s.find("[DONE]") != std::string::npos) {
+              LOG_DEBUG << "[DONE]";
+              status["is_done"] = true;
+              status["has_error"] = false;
+              status["is_stream"] = true;
+              status["status_code"] = k200OK;
+              cb(std::move(status), std::move(resp_data));
+              return false;
+            }
+
+            resp_data["data"] = s;
             status["is_done"] = false;
             status["has_error"] = false;
             status["is_stream"] = true;
             status["status_code"] = k200OK;
-            cb(std::move(status), std::move(respData));
-            if (s == "[DONE]") {
-              LOG_DEBUG << "[DONE]";
-              return false;
-            }
+            cb(std::move(status), std::move(resp_data));
             LOG_DEBUG << s;
             return true;
           };
diff --git a/src/llama_utils.h b/src/llama_utils.h
index 910eba3..2465ec0 100644
--- a/src/llama_utils.h
+++ b/src/llama_utils.h
@@ -10,6 +10,10 @@
 #include <string>
 #include <vector>
 
+#if defined(__APPLE__)
+#include <mach-o/dyld.h>
+#endif
+
 namespace llama_utils {
 
 inline std::string models_folder = "./models";
@@ -195,7 +199,7 @@ std::filesystem::path GetExecutableFolderContainerPath() {
     // CTL_DBG("Executable path: " << buffer);
     return std::filesystem::path{buffer}.parent_path();
   } else {
-    CTL_ERR("Failed to get executable path");
+    LOG_ERROR << "Failed to get executable path";
     return std::filesystem::current_path();
   }
 #elif defined(__linux__)