From 1d900b365b1304b004ccf7c0d942b950dd788aa4 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 27 Dec 2024 06:39:47 +0700 Subject: [PATCH] fix: CI --- .../scripts/e2e-test-server-linux-and-mac.sh | 0 .github/scripts/e2e-test-server-windows.bat | 4 + .../workflows/template-quality-gate-pr.yml | 410 +++++++++--------- Makefile | 8 +- src/llama_engine.cc | 84 +++- src/llama_utils.h | 6 +- 6 files changed, 288 insertions(+), 224 deletions(-) mode change 100644 => 100755 .github/scripts/e2e-test-server-linux-and-mac.sh diff --git a/.github/scripts/e2e-test-server-linux-and-mac.sh b/.github/scripts/e2e-test-server-linux-and-mac.sh old mode 100644 new mode 100755 diff --git a/.github/scripts/e2e-test-server-windows.bat b/.github/scripts/e2e-test-server-windows.bat index d1bf785..7f10492 100644 --- a/.github/scripts/e2e-test-server-windows.bat +++ b/.github/scripts/e2e-test-server-windows.bat @@ -69,6 +69,7 @@ set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}" set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}" set "curl_data5={}" set "curl_data6={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}" +set "curl_data7={\"model\": \"test-embedding\"}" rem Print the values of curl_data for debugging echo curl_data1=%curl_data1% @@ -77,6 +78,7 @@ echo curl_data3=%curl_data3% echo curl_data4=%curl_data4% echo curl_data5=%curl_data5% echo curl_data6=%curl_data6% +echo curl_data7=%curl_data7% rem Run the curl commands and capture the status code curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1 @@ -95,6 +97,8 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response6.log" -s -w "%%{http_code}" -- --header "Content-Type: application/json" ^ --data "%curl_data6%" > %TEMP%\response6.log 2>&1 +curl.exe --connect-timeout 60 -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data7%" 2>&1 + set "error_occurred=0" rem Read the status codes from the log files diff --git a/.github/workflows/template-quality-gate-pr.yml b/.github/workflows/template-quality-gate-pr.yml index 93e055d..d3910d7 100644 --- a/.github/workflows/template-quality-gate-pr.yml +++ b/.github/workflows/template-quality-gate-pr.yml @@ -28,122 +28,122 @@ jobs: fail-fast: false matrix: include: - - os: "linux" - name: "arm64" - runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: true - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "arm64" + # runs-on: "ubuntu-2004-arm64" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: true + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" - os: "linux" name: "amd64-avx2" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: true vulkan: false ccache: true ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-noavx" - runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx" - runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx512" - runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-vulkan" - runs-on: "ubuntu-22-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: true - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-noavx-cuda-11-7" - runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx2-cuda-11-7" - runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx-cuda-11-7" - runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx512-cuda-11-7" - runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-noavx-cuda-12-0" - runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx2-cuda-12-0" - runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx-cuda-12-0" - runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" - - os: "linux" - name: "amd64-avx512-cuda-12-0" - runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-noavx" + # runs-on: "ubuntu-20-04" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx" + # runs-on: "ubuntu-20-04" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx512" + # runs-on: "ubuntu-20-04" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-vulkan" + # runs-on: "ubuntu-22-04" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: true + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-noavx-cuda-11-7" + # runs-on: "ubuntu-20-04-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx2-cuda-11-7" + # runs-on: "ubuntu-20-04-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx-cuda-11-7" + # runs-on: "ubuntu-20-04-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx512-cuda-11-7" + # runs-on: "ubuntu-20-04-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-noavx-cuda-12-0" + # runs-on: "ubuntu-20-04-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx2-cuda-12-0" + # runs-on: "ubuntu-20-04-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx-cuda-12-0" + # runs-on: "ubuntu-20-04-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" + # - os: "linux" + # name: "amd64-avx512-cuda-12-0" + # runs-on: "ubuntu-20-04-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: "/home/runner/.ccache" - os: "mac" name: "amd64" runs-on: "macos-selfhosted-12" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF" run-e2e: true vulkan: false ccache: false @@ -151,7 +151,7 @@ jobs: - os: "mac" name: "arm64" runs-on: "macos-selfhosted-12-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" run-e2e: true vulkan: false ccache: false @@ -159,107 +159,107 @@ jobs: - os: "windows" name: "amd64-avx2" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: true - vulkan: false - ccache: false - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-noavx" - runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: false - vulkan: false - ccache: false - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: true vulkan: false ccache: false ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx512" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: false - vulkan: false - ccache: false - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-vulkan" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: false - vulkan: true - ccache: false - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-noavx-cuda-12-0" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx2-cuda-12-0" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx-cuda-12-0" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx512-cuda-12-0" - runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-noavx-cuda-11-7" - runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx2-cuda-11-7" - runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx-cuda-11-7" - runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - - os: "windows" - name: "amd64-avx512-cuda-11-7" - runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" - run-e2e: false - vulkan: false - ccache: true - ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-noavx" + # runs-on: "windows-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + # run-e2e: false + # vulkan: false + # ccache: false + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + # run-e2e: true + # vulkan: false + # ccache: false + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx512" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + # run-e2e: false + # vulkan: false + # ccache: false + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-vulkan" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + # run-e2e: false + # vulkan: true + # ccache: false + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-noavx-cuda-12-0" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx2-cuda-12-0" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx-cuda-12-0" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx512-cuda-12-0" + # runs-on: "windows-cuda-12-0" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-noavx-cuda-11-7" + # runs-on: "windows-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx2-cuda-11-7" + # runs-on: "windows-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx-cuda-11-7" + # runs-on: "windows-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + # - os: "windows" + # name: "amd64-avx512-cuda-11-7" + # runs-on: "windows-cuda-11-7" + # cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + # run-e2e: false + # vulkan: false + # ccache: true + # ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' steps: - name: Clone @@ -271,7 +271,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: use python for linux continue-on-error: true diff --git a/Makefile b/Makefile index a91b73d..7db679e 100644 --- a/Makefile +++ b/Makefile @@ -97,16 +97,18 @@ ifeq ($(RUN_TESTS),false) @exit 0 endif ifeq ($(OS),Windows_NT) - @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" + @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" else ifeq ($(shell uname -s),Linux) @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.so engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); else @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); endif @@ -118,13 +120,14 @@ endif ifeq ($(OS),Windows_NT) @powershell -Command "python -m pip install --upgrade pip" @powershell -Command "python -m pip install requests;" - @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" + @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" else ifeq ($(shell uname -s),Linux) python -m pip install --upgrade pip; python -m pip install requests; @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.so engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ python ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); else python -m pip install --upgrade pip; @@ -132,6 +135,7 @@ else @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ python ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); endif diff --git a/src/llama_engine.cc b/src/llama_engine.cc index c3af7c3..e2571a2 100644 --- a/src/llama_engine.cc +++ b/src/llama_engine.cc @@ -634,8 +634,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr json_body) { // Spawn llama.cpp server only if it is chat model if (!json_body->isMember("mmproj")) { - SpawnLlamaServer(*json_body); - return true; + return SpawnLlamaServer(*json_body); } common_params params; std::string model_type; @@ -1388,11 +1387,24 @@ bool LlamaEngine::HasForceStopInferenceModel(const std::string& id) const { } bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) { + auto wait_for_server_up = [](const std::string& host, int port) { + for (size_t i = 0; i < 10; i++) { + httplib::Client cli(host + ":" + std::to_string(port)); + auto res = cli.Get("/health"); + if (res && res->status == httplib::StatusCode::OK_200) { + return true; + } else { + LOG_INFO << "Wait for server up: " << i; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + return false; + }; + // TODO(sang) clean up resources if any errors LOG_DEBUG << "Start to spawn llama-server"; - std::string model; - if (auto o = json_params["model"]; !o.isNull()) { - model = o.asString(); + auto model = llama_utils::GetModelId(json_params); + if (!model.empty()) { llama_server_map_[model].host = "127.0.0.1"; llama_server_map_[model].port = llama_utils::GenerateRandomInteger(39400, 39999); @@ -1440,6 +1452,8 @@ bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) { std::cout << "Could not start server: " << GetLastError() << std::endl; return false; } else { + if (!wait_for_server_up(s.host, s.port)) + return false; std::cout << "Server started" << std::endl; } #else @@ -1483,6 +1497,8 @@ bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) { execv(p.c_str(), exec_args.data()); } else { // Parent process + if (!wait_for_server_up(s.host, s.port)) + return false; std::cout << "Server started" << std::endl; } #endif @@ -1494,13 +1510,28 @@ std::string LlamaEngine::ConvertJsonToParams(const Json::Value& root) { std::string errors; for (const auto& member : root.getMemberNames()) { - if (member == "model_path") { + if (member == "model_path" || member == "llama_model_path") { ss << "--model" << " "; ss << "\"" << root[member].asString() << "\" "; continue; - } else if (member == "model") { + } else if (member == "model" || member == "model_alias" || + member == "embedding") { + continue; + } else if (member == "ctx_len") { + ss << "--ctx-size" << " "; + ss << "\"" << std::to_string(root[member].asInt()) << "\" "; + continue; + } else if (member == "ngl") { + ss << "-ngl" << " "; + ss << "\"" << std::to_string(root[member].asInt()) << "\" "; + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + ss << "--embedding" << " "; + } continue; } + ss << "--" << member << " "; if (root[member].isString()) { ss << "\"" << root[member].asString() << "\" "; @@ -1529,11 +1560,25 @@ std::vector LlamaEngine::ConvertJsonToParamsVector( std::string errors; for (const auto& member : root.getMemberNames()) { - if (member == "model_path") { + if (member == "model_path" || member == "llama_model_path") { res.push_back("--model"); res.push_back(root[member].asString()); continue; - } else if (member == "model") { + } else if (member == "model" || member == "model_alias" || + member == "embedding") { + continue; + } else if (member == "ctx_len") { + res.push_back("--ctx-size"); + res.push_back(std::to_string(root[member].asInt())); + continue; + } else if (member == "ngl") { + res.push_back("-ngl"); + res.push_back(std::to_string(root[member].asInt())); + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + res.push_back("--embedding"); + } continue; } @@ -1584,18 +1629,25 @@ bool LlamaEngine::HandleLlamaCppChatCompletion( req.content_receiver = [cb](const char* data, size_t data_length, uint64_t offset, uint64_t total_length) { std::string s(data, data_length); - Json::Value respData; - respData["data"] = s; + Json::Value resp_data; Json::Value status; + + if (s.find("[DONE]") != std::string::npos) { + LOG_DEBUG << "[DONE]"; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = k200OK; + cb(std::move(status), std::move(resp_data)); + return false; + } + + resp_data["data"] = s; status["is_done"] = false; status["has_error"] = false; status["is_stream"] = true; status["status_code"] = k200OK; - cb(std::move(status), std::move(respData)); - if (s == "[DONE]") { - LOG_DEBUG << "[DONE]"; - return false; - } + cb(std::move(status), std::move(resp_data)); LOG_DEBUG << s; return true; }; diff --git a/src/llama_utils.h b/src/llama_utils.h index 910eba3..2465ec0 100644 --- a/src/llama_utils.h +++ b/src/llama_utils.h @@ -10,6 +10,10 @@ #include #include +#if defined(__APPLE__) +#include +#endif + namespace llama_utils { inline std::string models_folder = "./models"; @@ -195,7 +199,7 @@ std::filesystem::path GetExecutableFolderContainerPath() { // CTL_DBG("Executable path: " << buffer); return std::filesystem::path{buffer}.parent_path(); } else { - CTL_ERR("Failed to get executable path"); + LOG_ERROR << "Failed to get executable path"; return std::filesystem::current_path(); } #elif defined(__linux__)