From 9c20f5620e5c707c0eead50522a68739806d2fff Mon Sep 17 00:00:00 2001 From: Gordon Smith Date: Mon, 26 Aug 2024 13:47:21 +0100 Subject: [PATCH] feat: Add llama.cpp web assembly support Primarily for calculating embeddings on the client Signed-off-by: Gordon Smith --- .gitignore | 5 +- .vscode/settings.json | 4 +- CMakeLists.txt | 3 +- docs/.vitepress/config.js | 1 + docs/index.md | 3 + package-lock.json | 300 ++++++- package.json | 20 +- packages/llama/.vscode/launch.json | 44 + packages/llama/.vscode/tasks.json | 73 ++ packages/llama/CHANGELOG.md | 2 + packages/llama/README.md | 25 + packages/llama/esbuild.mjs | 9 + packages/llama/karma.conf.cjs | 20 + packages/llama/package.json | 57 ++ packages/llama/src-cpp/CMakeLists.txt | 50 ++ packages/llama/src-cpp/embedding.cpp | 428 +++++++++ packages/llama/src-cpp/main.cpp | 1199 +++++++++++++++++++++++++ packages/llama/src-cpp/util.cpp | 67 ++ packages/llama/src-cpp/util.hpp | 38 + packages/llama/src/index.ts | 2 + packages/llama/src/llama.ts | 96 ++ packages/llama/src/web-blob.ts | 113 +++ packages/llama/test/index-browser.ts | 1 + packages/llama/test/index-node.ts | 1 + packages/llama/test/llama.ts | 67 ++ packages/llama/tsconfig.json | 11 + packages/tsconfig.json | 1 + src-cpp/CMakeLists.txt | 0 typedoc.json | 1 + vcpkg-overlays/llama/portfile.cmake | 28 + vcpkg-overlays/llama/vcpkg.json | 16 + vcpkg.json | 3 + 32 files changed, 2641 insertions(+), 47 deletions(-) create mode 100644 packages/llama/.vscode/launch.json create mode 100644 packages/llama/.vscode/tasks.json create mode 100644 packages/llama/CHANGELOG.md create mode 100644 packages/llama/README.md create mode 100644 packages/llama/esbuild.mjs create mode 100644 packages/llama/karma.conf.cjs create mode 100644 packages/llama/package.json create mode 100644 packages/llama/src-cpp/CMakeLists.txt create mode 100644 packages/llama/src-cpp/embedding.cpp create mode 100644 packages/llama/src-cpp/main.cpp create mode 100644 packages/llama/src-cpp/util.cpp create mode 100644 packages/llama/src-cpp/util.hpp create mode 100644 packages/llama/src/index.ts create mode 100644 packages/llama/src/llama.ts create mode 100644 packages/llama/src/web-blob.ts create mode 100644 packages/llama/test/index-browser.ts create mode 100644 packages/llama/test/index-node.ts create mode 100644 packages/llama/test/llama.ts create mode 100644 packages/llama/tsconfig.json delete mode 100644 src-cpp/CMakeLists.txt create mode 100644 vcpkg-overlays/llama/portfile.cmake create mode 100644 vcpkg-overlays/llama/vcpkg.json diff --git a/.gitignore b/.gitignore index 25310caa..12581d52 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .nyc_output/ -/.vscode/c_cpp_properties.json -/.vscode/ipch +.vscode/c_cpp_properties.json +.vscode/ipch +.vscode/settings.json .nx/ bin/ build/ diff --git a/.vscode/settings.json b/.vscode/settings.json index 3f575e5e..e09c010a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,3 @@ { - "cmake.cmakePath": "${workspaceFolder}/scripts/cmake.sh", - "ecl.launchConfiguration": "not found", - "ecl.targetCluster": {} + "cmake.cmakePath": "${workspaceFolder}/scripts/cmake.sh" } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 68aea66b..5799d624 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ set(EM_LINK_FLAGS "-sUSE_GLFW=0" "-sALLOW_UNIMPLEMENTED_SYSCALLS=1" "-sINCOMING_MODULE_JS_API=\"['wasmBinary']\"" + "--no-entry" "--pre-js ${CMAKE_CURRENT_SOURCE_DIR}/src-cpp/src/pre.js" "--post-js 
${CMAKE_CURRENT_SOURCE_DIR}/src-cpp/src/post.js" ) @@ -79,8 +80,8 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(PACK_MODE "-d") endif () -add_subdirectory(src-cpp) add_subdirectory(packages/base91/src-cpp) add_subdirectory(packages/expat/src-cpp) add_subdirectory(packages/graphviz/src-cpp) +add_subdirectory(packages/llama/src-cpp) add_subdirectory(packages/zstd/src-cpp) diff --git a/docs/.vitepress/config.js b/docs/.vitepress/config.js index ff555074..e5353879 100644 --- a/docs/.vitepress/config.js +++ b/docs/.vitepress/config.js @@ -38,6 +38,7 @@ export default { { text: 'DuckDB', link: '/duckdb/src/duckdb/classes/DuckDB' }, { text: 'Expat', link: '/expat/src/expat/classes/Expat' }, { text: 'Graphviz', link: '/graphviz/src/graphviz/classes/Graphviz' }, + { text: 'Llama', link: '/llama/src/llama/classes/Llama' }, { text: 'Zstd', link: '/zstd/src/zstd/classes/Zstd' }, ] } diff --git a/docs/index.md b/docs/index.md index 97757c49..79f53b90 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,6 +35,9 @@ features: - title: GraphViz details: The Graphviz layout algorithms take descriptions of graphs in a simple text language, and make diagrams in useful formats, such as images and SVG for web pages or display in an interactive graph browser. link: /graphviz/src/graphviz/classes/Graphviz + - title: Llama + details: Inference of Meta's LLaMA model (and others) in pure C/C++. + link: /llama/src/llama/classes/Llama - title: Zstd details: Zstandard is a fast compression algorithm, providing high compression ratios and is backed by an extremely fast decoder. link: /zstd/src/zstd/classes/Zstd diff --git a/package-lock.json b/package-lock.json index e355c8dc..63b95a99 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,7 @@ ], "devDependencies": { "@eslint/js": "9.9.0", - "@hpcc-js/esbuild-plugins": "1.0.10", + "@hpcc-js/esbuild-plugins": "file:../Visualization/packages/esbuild-plugins", "@istanbuljs/nyc-config-typescript": "1.0.2", "@types/chai": "4.3.17", "@types/emscripten": "1.39.13", @@ -25,8 +25,10 @@ "assemblyscript": "0.27.29", "c8": "10.1.2", "chai": "5.1.1", + "chokidar-cli": "^3.0.0", "coveralls": "3.1.1", "eslint": "9.9.0", + "fzstd": "^0.1.1", "globals": "15.9.0", "karma": "6.4.4", "karma-chai": "0.1.0", @@ -50,6 +52,18 @@ "vitepress": "1.3.2" } }, + "../Visualization/packages/esbuild-plugins": { + "name": "@hpcc-js/esbuild-plugins", + "version": "1.1.0", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@hpcc-js/wasm-base91": "1.0.1", + "@hpcc-js/wasm-zstd": "1.0.1", + "fzstd": "0.1.1", + "yargs": "17.7.2" + } + }, "node_modules/@75lb/deep-merge": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@75lb/deep-merge/-/deep-merge-1.1.2.tgz", @@ -1507,37 +1521,8 @@ "license": "BSD-3-Clause" }, "node_modules/@hpcc-js/esbuild-plugins": { - "version": "1.0.10", - "resolved": "https://registry.npmjs.org/@hpcc-js/esbuild-plugins/-/esbuild-plugins-1.0.10.tgz", - "integrity": "sha512-sXJEo5GKDu7uMbuQ72XV8jZBflyKKappUYNSjs6wITxcMzbi3o+WsWyyPeUYOPhj9bgguJ4Biy6AAPd1ma3Mxw==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@hpcc-js/wasm-base91": "1.0.1", - "@hpcc-js/wasm-zstd": "1.0.1", - "fzstd": "0.1.1", - "yargs": "17.7.2" - } - }, - "node_modules/@hpcc-js/esbuild-plugins/node_modules/@hpcc-js/wasm-base91": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@hpcc-js/wasm-base91/-/wasm-base91-1.0.1.tgz", - "integrity": "sha512-MinpvSv+yTf1PRJ/cuhZzgi+vkle1YAVPkdD6CFzN7YqkW5K75WYfSo2YlnniKLh3SqksCO3uFqEIg8vBHhp1g==", - "dev": 
true, - "license": "Apache-2.0", - "dependencies": { - "yargs": "17.7.2" - } - }, - "node_modules/@hpcc-js/esbuild-plugins/node_modules/@hpcc-js/wasm-zstd": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@hpcc-js/wasm-zstd/-/wasm-zstd-1.0.1.tgz", - "integrity": "sha512-at8AGwzQVdyg6vSHX85HW1FtEeqU+mo7kzH3nsiWRi9QPyl0s2uu7YXkR9zCq6cEP0uPKImo3ip3xRAkPzNizA==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "yargs": "17.7.2" - } + "resolved": "../Visualization/packages/esbuild-plugins", + "link": true }, "node_modules/@hpcc-js/wasm": { "resolved": "packages/wasm", @@ -1563,6 +1548,10 @@ "resolved": "packages/graphviz-cli", "link": true }, + "node_modules/@hpcc-js/wasm-llama": { + "resolved": "packages/llama", + "link": true + }, "node_modules/@hpcc-js/wasm-zstd": { "resolved": "packages/zstd", "link": true @@ -5455,6 +5444,240 @@ "fsevents": "~2.3.2" } }, + "node_modules/chokidar-cli": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/chokidar-cli/-/chokidar-cli-3.0.0.tgz", + "integrity": "sha512-xVW+Qeh7z15uZRxHOkP93Ux8A0xbPzwK4GaqD8dQOYc34TlkqUhVSS59fK36DOp5WdJlrRzlYSy02Ht99FjZqQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "chokidar": "^3.5.2", + "lodash.debounce": "^4.0.8", + "lodash.throttle": "^4.1.1", + "yargs": "^13.3.0" + }, + "bin": { + "chokidar": "index.js" + }, + "engines": { + "node": ">= 8.10.0" + } + }, + "node_modules/chokidar-cli/node_modules/ansi-regex": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.1.tgz", + "integrity": "sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "license": "MIT", + "dependencies": { + "color-convert": "^1.9.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar-cli/node_modules/cliui": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-5.0.0.tgz", + "integrity": "sha512-PYeGSEmmHM6zvoef2w8TPzlrnNpXIjTipYK780YswmIP9vjxmd6Y2a3CB2Ks6/AU8NHjZugXvo8w3oWM2qnwXA==", + "dev": true, + "license": "ISC", + "dependencies": { + "string-width": "^3.1.0", + "strip-ansi": "^5.2.0", + "wrap-ansi": "^5.1.0" + } + }, + "node_modules/chokidar-cli/node_modules/color-convert": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", + "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", + "dev": true, + "license": "MIT", + "dependencies": { + "color-name": "1.1.3" + } + }, + "node_modules/chokidar-cli/node_modules/color-name": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", + "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==", + "dev": true, + "license": "MIT" + }, + "node_modules/chokidar-cli/node_modules/emoji-regex": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", + "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==", + "dev": true, + "license": "MIT" + }, + 
"node_modules/chokidar-cli/node_modules/find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "license": "MIT", + "dependencies": { + "locate-path": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/is-fullwidth-code-point": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha512-VHskAKYM8RfSFXwee5t5cbN5PZeq1Wrh6qd5bkyiXIf6UQcN6w/A0eXM9r6t8d+GYOh+o6ZhiEnb88LN/Y8m2w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar-cli/node_modules/locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/chokidar-cli/node_modules/p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-limit": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar-cli/node_modules/string-width": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", + "integrity": "sha512-vafcv6KjVZKSgz06oM/H6GDBrAtz8vdhQakGjFIvNrHA6y3HCF1CInLy+QLq8dTJPQ1b+KDUqDFctkdRW44e1w==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^7.0.1", + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^5.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/strip-ansi": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", + "integrity": "sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^4.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/wrap-ansi": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-5.1.0.tgz", + "integrity": "sha512-QC1/iN/2/RPVJ5jYK8BGttj5z83LmSKmvbvrXPNCLZSEb32KKVDJDl/MOt2N01qU2H/FkzEa9PKto1BqDjtd7Q==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "ansi-styles": "^3.2.0", + "string-width": "^3.0.0", + "strip-ansi": "^5.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/y18n": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.3.tgz", + "integrity": "sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/chokidar-cli/node_modules/yargs": { + "version": "13.3.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-13.3.2.tgz", + "integrity": "sha512-AX3Zw5iPruN5ie6xGRIDgqkT+ZhnRlZMLMHAs8tg7nRruy2Nb+i5o9bwghAogtM08q1dpr2LVoS8KSTMYpWXUw==", + "dev": true, + "license": "MIT", + "dependencies": { + "cliui": "^5.0.0", + "find-up": "^3.0.0", + "get-caller-file": "^2.0.1", + "require-directory": "^2.1.1", + "require-main-filename": "^2.0.0", + "set-blocking": "^2.0.0", + "string-width": "^3.0.0", + "which-module": "^2.0.0", + "y18n": "^4.0.0", + "yargs-parser": "^13.1.2" + } + }, + "node_modules/chokidar-cli/node_modules/yargs-parser": { + "version": "13.1.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz", + "integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==", + "dev": true, + "license": "ISC", + "dependencies": { + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" + } + }, "node_modules/chokidar/node_modules/glob-parent": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", @@ -11236,6 +11459,13 @@ "dev": true, "license": "MIT" }, + "node_modules/lodash.debounce": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", + "integrity": "sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==", + "dev": true, + "license": "MIT" + }, "node_modules/lodash.flattendeep": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/lodash.flattendeep/-/lodash.flattendeep-4.4.0.tgz", @@ -18267,6 +18497,12 @@ }, "devDependencies": {} }, + "packages/llama": { + "name": "@hpcc-js/wasm-llama", + "version": "1.2.0", + "license": "Apache-2.0", + "devDependencies": {} + }, "packages/wasm": { "name": "@hpcc-js/wasm", "version": "2.20.0", diff --git a/package.json b/package.json index 8895e40c..0d7b59b8 100644 --- a/package.json +++ b/package.json @@ -36,13 +36,14 @@ "clean-root": "run-p clean-build clean-docs", "clean": "lerna run clean", "copy-res": "cp ./docs/*.png ./docs/.vitepress/dist", - "compile-asm": "asc ./src-asm/index.ts --target release", - "compile-cpp": "run-script-os", - "compile-cpp:linux": "./scripts/cpp-build.sh", - "compile-cpp:win32": "wsl -e ./scripts/cpp-build.sh", + "build-asm": "asc ./src-asm/index.ts --target release", "pack-duckdb": "lerna run pack-duckdb", "gen-docs": "typedoc", "gen-docs-watch": "typedoc --watch", + "build-cpp": "run-script-os", + "build-cpp:linux": "./scripts/cpp-build.sh", + "build-cpp:win32": "wsl -e ./scripts/cpp-build.sh", + "build-cpp-watch": "chokidar 'packages/*/src-cpp/**/*' -c 'npm run build-cpp'", "build-docs-vitepress": "vitepress build docs", "build-docs": "run-s pack-duckdb gen-docs build-docs-vitepress copy-res", "build-docs-watch": "vitepress dev docs", @@ -50,7 +51,7 @@ "build-docker-wasm": "npx -y mkdirp dist && docker run --rm -it --mount source=\"/hpcc-js-wasm/dist\",target=/usr/src/app/dist hpcc-js-wasm-build:latest", "build-docker": "run-s build-docker-image 
build-docker-wasm", "build-ws": "lerna run build", - "build": "run-s compile-cpp build-ws", + "build": "run-s build-cpp build-ws", "git-push": "git push --follow-tags upstream trunk", "serve-docs": "vitepress serve docs", "serve": "ws", @@ -68,9 +69,9 @@ "update-major-root": "npx -y npm-check-updates -u", "update-major": "npm run update-major-root && lerna run update-major" }, - "dependencies": {}, "devDependencies": { "@eslint/js": "9.9.0", + "@hpcc-js/esbuild-plugins": "file:../Visualization/packages/esbuild-plugins", "@istanbuljs/nyc-config-typescript": "1.0.2", "@types/chai": "4.3.17", "@types/emscripten": "1.39.13", @@ -80,11 +81,12 @@ "@typescript-eslint/eslint-plugin": "8.1.0", "@typescript-eslint/parser": "8.1.0", "assemblyscript": "0.27.29", + "c8": "10.1.2", "chai": "5.1.1", + "chokidar-cli": "^3.0.0", "coveralls": "3.1.1", - "c8": "10.1.2", - "@hpcc-js/esbuild-plugins": "1.0.10", "eslint": "9.9.0", + "fzstd": "^0.1.1", "globals": "15.9.0", "karma": "6.4.4", "karma-chai": "0.1.0", @@ -121,4 +123,4 @@ "url": "https://github.com/hpcc-systems/hpcc-js-wasm/issues" }, "homepage": "https://hpcc-systems.github.io/hpcc-js-wasm/" -} \ No newline at end of file +} diff --git a/packages/llama/.vscode/launch.json b/packages/llama/.vscode/launch.json new file mode 100644 index 00000000..a5e6e939 --- /dev/null +++ b/packages/llama/.vscode/launch.json @@ -0,0 +1,44 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "test-browser", + "type": "msedge", + "request": "launch", + "url": "file://${workspaceRoot}/test.html", + "webRoot": "${workspaceRoot}", + "sourceMapPathOverrides": { + "webpack:///./*": "${workspaceRoot}/*", + "webpack:///*": "/*" + }, + }, + { + "name": "test-node", + "type": "node", + "request": "launch", + "runtimeArgs": [ + "run-script", + "test-node" + ], + "runtimeExecutable": "npm", + "skipFiles": [ + "/**" + ], + "outFiles": [ + "${workspaceFolder}/**/*.js", + "${workspaceFolder}/**/*.c", + "!**/node_modules/**" + ], + }, + { + "name": "esbuild", + "type": "node", + "program": "${workspaceFolder}/esbuild.mjs", + "request": "launch", + "args": [], + "skipFiles": [ + "/**" + ] + } + ] +} \ No newline at end of file diff --git a/packages/llama/.vscode/tasks.json b/packages/llama/.vscode/tasks.json new file mode 100644 index 00000000..44163830 --- /dev/null +++ b/packages/llama/.vscode/tasks.json @@ -0,0 +1,73 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "npm", + "label": "build-types-watch", + "script": "build-types-watch", + "problemMatcher": [ + "$tsc-watch" + ], + "presentation": { + "group": "group-build" + } + }, + { + "type": "npm", + "label": "build-ts-watch", + "script": "build-ts-watch", + "problemMatcher": [ + "$tsc-watch" + ], + "presentation": { + "group": "group-build" + } + }, + { + "type": "npm", + "label": "build-cpp-watch", + "script": "build-cpp-watch", + "options": { + "cwd": "${workspaceFolder}/../.." 
+ }, + "problemMatcher": { + "owner": "cpp", + "fileLocation": [ + "relative", + "${workspaceFolder}" + ], + "pattern": { + "regexp": "^(.*):(\\d+):(\\d+):\\s+(warning|error):\\s+(.*)$", + "file": 1, + "line": 2, + "column": 3, + "severity": 4, + "message": 5 + } + }, + "presentation": { + "group": "group-build" + } + }, + { + "type": "npm", + "label": "Web Server", + "script": "serve", + "presentation": { + "group": "group-build" + } + }, + { + "label": "build", + "dependsOn": [ + "build-types-watch", + "build-ts-watch", + "build-cpp-watch" + ], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} \ No newline at end of file diff --git a/packages/llama/CHANGELOG.md b/packages/llama/CHANGELOG.md new file mode 100644 index 00000000..4dc68c6f --- /dev/null +++ b/packages/llama/CHANGELOG.md @@ -0,0 +1,2 @@ +# Changelog + diff --git a/packages/llama/README.md b/packages/llama/README.md new file mode 100644 index 00000000..7b98ee1a --- /dev/null +++ b/packages/llama/README.md @@ -0,0 +1,25 @@ +# @hpcc-js/wasm-llama + +## Installation + +```sh +npm install @hpcc-js/wasm-llama +``` + +## Quick Start + +```typescript +import { Llama, WebBlob } from "@hpcc-js/wasm-llama"; + +let llama = await Llama.load(); +const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf"; +const webBlob: Blob = await WebBlob.create(new URL(model)); + +const data: ArrayBuffer = await webBlob.arrayBuffer(); + +const embeddings = llama.embedding("Hello and Welcome!", new Uint8Array(data)); +``` + +## Reference + +* [API Documentation](https://hpcc-systems.github.io/hpcc-js-wasm/llama/src/llama/classes/llama.html) diff --git a/packages/llama/esbuild.mjs b/packages/llama/esbuild.mjs new file mode 100644 index 00000000..163a14d7 --- /dev/null +++ b/packages/llama/esbuild.mjs @@ -0,0 +1,9 @@ +import { browserTpl, neutralTpl, nodeTpl } from "@hpcc-js/esbuild-plugins"; + +// config --- +await neutralTpl("src/index.ts", "dist/index"); +await Promise.all([ + browserTpl("test/index-browser.ts", "dist-test/index.browser"), + nodeTpl("test/index-node.ts", "dist-test/index.node"), +]); + diff --git a/packages/llama/karma.conf.cjs b/packages/llama/karma.conf.cjs new file mode 100644 index 00000000..7910435e --- /dev/null +++ b/packages/llama/karma.conf.cjs @@ -0,0 +1,20 @@ +module.exports = function (config) { + config.set({ + frameworks: ['mocha'], + files: [ + { pattern: 'dist-test/index.browser.js', type: 'module' }, + { pattern: 'dist-test/*.js', watched: false, included: false, served: true, type: module } + ], + proxies: { + "/dist/": "/base/dist/", + "/dist-test/": "/base/dist-test/" + }, + reporters: ['spec'], + port: 9876, // karma web server port + colors: true, + logLevel: config.LOG_INFO, + browsers: ["Firefox", "ChromeHeadless"], + autoWatch: false, + concurrency: Infinity + }) +} \ No newline at end of file diff --git a/packages/llama/package.json b/packages/llama/package.json new file mode 100644 index 00000000..eb891382 --- /dev/null +++ b/packages/llama/package.json @@ -0,0 +1,57 @@ +{ + "name": "@hpcc-js/wasm-llama", + "version": "1.2.0", + "description": "hpcc-js - WASM llama", + "type": "module", + "exports": { + ".": { + "types": "./types/index.d.ts", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "types": "./types/index.d.ts", + "files": [ + "dist/**/*", + "src/**/*", + "types/**/*" + ], + "scripts": { + "clean": "rimraf ./dist ./dist-test ./types", + "build-cpp": "cmake --build ../../build --target llamalib", + 
"build-cpp-watch": "chokidar 'src-cpp/**.*' -c 'npm run build-cpp'", + "build-types": "tsc --project tsconfig.json --emitDeclarationOnly", + "build-types-watch": "npm run build-types -- --watch", + "build-ts": "node esbuild.mjs", + "build-ts-dev": "npm run build-ts -- --mode=development", + "build-ts-watch": "npm run build-ts-dev -- --watch", + "build-dev": "run-p build-types build-ts-dev", + "build": "run-p build-cpp build-types build-ts", + "lint-skypack": "npx -y @skypack/package-check", + "lint-eslint": "eslint src/**/*.ts", + "lint": "run-p lint-eslint", + "test-chrome": "karma start --single-run --browsers ChromiumHeadless karma.conf.cjs", + "test-firefox": "karma start --single-run --browsers Firefox karma.conf.cjs", + "test-node": "mocha ./dist-test/index.node.js --reporter spec", + "test": "run-s test-chrome test-node", + "update": "npx -y npm-check-updates -u -t minor", + "update-major": "npx -y npm-check-updates -u" + }, + "devDependencies": {}, + "keywords": [ + "graphviz", + "typescript", + "webassembly", + "wasm", + "dot", + "neato", + "twopi" + ], + "author": "hpcc-systems", + "repository": { + "type": "git", + "url": "git+https://github.com/hpcc-systems/hpcc-js-wasm.git" + }, + "homepage": "https://hpcc-systems.github.io/hpcc-js-wasm/", + "license": "Apache-2.0" +} \ No newline at end of file diff --git a/packages/llama/src-cpp/CMakeLists.txt b/packages/llama/src-cpp/CMakeLists.txt new file mode 100644 index 00000000..f4308c7e --- /dev/null +++ b/packages/llama/src-cpp/CMakeLists.txt @@ -0,0 +1,50 @@ +project(llamalib) + +set(CMAKE_CXX_STANDARD 11) + +find_package(Llama CONFIG REQUIRED) + +# See: https://github.com/emscripten-core/emscripten/blob/main/src/settings.js + +set(EM_CPP_FLAGS + ${EM_CPP_FLAGS} + "-fwasm-exceptions" +) +string(REPLACE ";" " " CPP_FLAGS "${EM_CPP_FLAGS}") + +set(EM_LINK_FLAGS + ${EM_LINK_FLAGS} + "-sEXPORT_NAME='${CMAKE_PROJECT_NAME}'" + "-sFILESYSTEM=1" + "-sFORCE_FILESYSTEM=1" + "-sWASMFS=1" + "-lembind" + "-fwasm-exceptions" + "--emit-tsd ${CMAKE_CURRENT_BINARY_DIR}/llamalib.d.ts" +) +string(REPLACE ";" " " LINK_FLAGS "${EM_LINK_FLAGS}") + +include_directories( + ${VCPKG_INCLUDE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ${Llama_DIR}/common +) + +add_executable(llamalib + main.cpp + embedding.cpp + util.cpp + ${Llama_DIR}/common/common.cpp + ${Llama_DIR}/common/sampling.cpp + ${Llama_DIR}/common/console.cpp + ${Llama_DIR}/common/grammar-parser.cpp + ${Llama_DIR}/common/json-schema-to-grammar.cpp + ${Llama_DIR}/common/build-info.cpp +) + +set_target_properties(llamalib PROPERTIES COMPILE_FLAGS "${CPP_FLAGS}") +set_target_properties(llamalib PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + +target_link_libraries(llamalib + PRIVATE llama +) diff --git a/packages/llama/src-cpp/embedding.cpp b/packages/llama/src-cpp/embedding.cpp new file mode 100644 index 00000000..33d9f806 --- /dev/null +++ b/packages/llama/src-cpp/embedding.cpp @@ -0,0 +1,428 @@ +// See: https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp --- + +#include "common.h" +#include "llama.h" + +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4244 4267) // possible loss of data +#endif + +namespace embedding +{ + + static std::vector split_lines(const std::string &s, const std::string &separator = "\n") + { + std::vector lines; + size_t start = 0; + size_t end = s.find(separator); + + while (end != std::string::npos) + { + lines.push_back(s.substr(start, end - start)); + start = end + separator.length(); + end = s.find(separator, start); + } + + 
lines.push_back(s.substr(start)); // Add the last part + + return lines; + } + + static void batch_add_seq(llama_batch &batch, const std::vector &tokens, llama_seq_id seq_id) + { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) + { + llama_batch_add(batch, tokens[i], i, {seq_id}, true); + } + } + + static void batch_decode(llama_context *ctx, llama_batch &batch, float *output, int n_seq, int n_embd, int embd_norm) + { + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + const struct llama_model *model = llama_get_model(ctx); + + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) + { + // encoder-only model + if (llama_encode(ctx, batch) < 0) + { + fprintf(stderr, "%s : failed to encode\n", __func__); + } + } + else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) + { + // decoder-only model + if (llama_decode(ctx, batch) < 0) + { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + } + + for (int i = 0; i < batch.n_tokens; i++) + { + if (!batch.logits[i]) + { + continue; + } + + const float *embd = nullptr; + int embd_pos = 0; + + if (pooling_type == LLAMA_POOLING_TYPE_NONE) + { + // try to get token embeddings + embd = llama_get_embeddings_ith(ctx, i); + embd_pos = i; + GGML_ASSERT(embd != NULL && "failed to get token embeddings"); + } + else + { + // try to get sequence embeddings - supported only when pooling_type is not NONE + embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + embd_pos = batch.seq_id[i][0]; + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); + } + + float *out = output + embd_pos * n_embd; + llama_embd_normalize(embd, out, n_embd, embd_norm); + } + } + + int main(int argc, char **argv) + { + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) + { + gpt_params_print_usage(argc, argv, params); + return 1; + } + + params.embedding = true; + // For non-causal models, batch size must be equal to ubatch size + params.n_ubatch = params.n_batch; + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) + { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model *model = llama_init.model; + llama_context *ctx = llama_init.context; + if (model == NULL) + { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + + if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) + { + fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); + return 1; + } + + if (n_ctx > n_ctx_train) + { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + } + + // split the prompt into lines + std::vector prompts = split_lines(params.prompt, params.embd_sep); + + // max 
batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch >= params.n_ctx); + + // tokenize the prompts and trim + std::vector> inputs; + for (const auto &prompt : prompts) + { + auto inp = ::llama_tokenize(ctx, prompt, true, false); + if (inp.size() > n_batch) + { + fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + __func__, (long long int)inp.size(), (long long int)n_batch); + return 1; + } + inputs.push_back(inp); + } + + // check if the last token is SEP + // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' + for (auto &inp : inputs) + { + if (inp.empty() || inp.back() != llama_token_sep(model)) + { + fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); + fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); + } + } + + // tokenization stats + if (params.verbose_prompt) + { + for (int i = 0; i < (int)inputs.size(); i++) + { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + for (int j = 0; j < (int)inputs[i].size(); j++) + { + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + } + fprintf(stderr, "\n\n"); + } + } + + // initialize batch + const int n_prompts = prompts.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + + // count number of embeddings + int n_embd_count = 0; + if (pooling_type == LLAMA_POOLING_TYPE_NONE) + { + for (int k = 0; k < n_prompts; k++) + { + n_embd_count += inputs[k].size(); + } + } + else + { + n_embd_count = n_prompts; + } + + // allocate output + const int n_embd = llama_n_embd(model); + std::vector embeddings(n_embd_count * n_embd, 0); + float *emb = embeddings.data(); + + // break into batches + int e = 0; // number of embeddings already stored + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_prompts; k++) + { + // clamp to n_batch tokens + auto &inp = inputs[k]; + + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) + { + float *out = emb + e * n_embd; + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); + e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; + s = 0; + llama_batch_clear(batch); + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; + } + + // final batch + float *out = emb + e * n_embd; + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); + + if (params.embd_out.empty()) + { + fprintf(stdout, "\n"); + + if (pooling_type == LLAMA_POOLING_TYPE_NONE) + { + for (int j = 0; j < n_embd_count; j++) + { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < std::min(3, n_embd); i++) + { + if (params.embd_normalize == 0) + { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } + else + { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, " ... 
"); + for (int i = n_embd - 3; i < n_embd; i++) + { + if (params.embd_normalize == 0) + { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } + else + { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); + } + } + else + { + // print the first part of the embeddings or for a single prompt, the full embedding + for (int j = 0; j < n_prompts; j++) + { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) + { + if (params.embd_normalize == 0) + { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } + else + { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + if (n_prompts > 1) + { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) + { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); + } + fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) + { + for (int j = 0; j < n_prompts; j++) + { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); + } + } + } + } + + if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") + { + const bool notArray = params.embd_out != "array"; + + fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + for (int j = 0;;) + { // at least one iteration (one prompt) + if (notArray) + fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ", j); + fprintf(stdout, "["); + for (int i = 0;;) + { // at least one iteration (n_embd > 0) + fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + i++; + if (i < n_embd) + fprintf(stdout, ","); + else + break; + } + fprintf(stdout, notArray ? "]\n }" : "]"); + j++; + if (j < n_embd_count) + fprintf(stdout, notArray ? ",\n" : ","); + else + break; + } + fprintf(stdout, notArray ? 
"\n ]" : "]\n"); + + if (params.embd_out == "json+" && n_prompts > 1) + { + fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + for (int i = 0;;) + { // at least two iteration (n_embd_count > 1) + fprintf(stdout, " ["); + for (int j = 0;;) + { // at least two iteration (n_embd_count > 1) + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f", sim); + j++; + if (j < n_embd_count) + fprintf(stdout, ", "); + else + break; + } + fprintf(stdout, " ]"); + i++; + if (i < n_embd_count) + fprintf(stdout, ",\n"); + else + break; + } + fprintf(stdout, "\n ]"); + } + + if (notArray) + fprintf(stdout, "\n}\n"); + } + + // clean up + llama_print_timings(ctx); + llama_batch_free(batch); + llama_free(ctx); + llama_free_model(model); + llama_backend_free(); + + return 0; + } +} +// --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- + +#include "util.hpp" +int embeddingMain(const std::vector &args, std::vector &retVal) +{ + ArgBuffer argBuffer(args); + int ret = 0; + { + OutErrRedirect outerr; + ret = embedding::main(argBuffer.argc, argBuffer.argv); + } + readOutFile(retVal); + readErrorFile(retVal); + + return ret; +} + +#include +EMSCRIPTEN_BINDINGS(llama_embedding) +{ + emscripten::function("embedding", &embeddingMain); +} diff --git a/packages/llama/src-cpp/main.cpp b/packages/llama/src-cpp/main.cpp new file mode 100644 index 00000000..71fbdb43 --- /dev/null +++ b/packages/llama/src-cpp/main.cpp @@ -0,0 +1,1199 @@ +// See: https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp --- + +#include "common.h" + +#include "console.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) +#include +#include +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable : 4244 4267) // possible loss of data +#endif + +namespace main +{ + static llama_context **g_ctx; + static llama_model **g_model; + static gpt_params *g_params; + static std::vector *g_input_tokens; + static std::ostringstream *g_output_ss; + static std::vector *g_output_tokens; + static bool is_interacting = false; + static bool need_insert_eot = false; + + static bool file_exists(const std::string &path) + { + std::ifstream f(path.c_str()); + return f.good(); + } + + static bool file_is_empty(const std::string &path) + { + std::ifstream f; + f.exceptions(std::ifstream::failbit | std::ifstream::badbit); + f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + return f.tellg() == 0; + } + + static void write_logfile( + const llama_context *ctx, const gpt_params ¶ms, const llama_model *model, + const std::vector &input_tokens, const std::string &output, + const std::vector &output_tokens) + { + if (params.logdir.empty()) + { + return; + } + + const std::string timestamp = string_get_sortable_timestamp(); + + const bool success = fs_create_directory_with_parents(params.logdir); + if (!success) + { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE *logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) + { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, 
logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + yaml_dump_string_multiline(logfile, "output", output.c_str()); + yaml_dump_vector_int(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); + } + +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) + static void sigint_handler(int signo) + { + if (signo == SIGINT) + { + if (!is_interacting && g_params->interactive) + { + is_interacting = true; + need_insert_eot = true; + } + else + { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } + } +#endif + + static void llama_log_callback_logTee(ggml_log_level level, const char *text, void *user_data) + { + (void)level; + (void)user_data; + LOG_TEE("%s", text); + } + + static std::string chat_add_and_format(struct llama_model *model, std::vector &chat_msgs, std::string role, std::string content) + { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single( + model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); + LOG("formatted: %s\n", formatted.c_str()); + return formatted; + } + + int main(int argc, char **argv) + { + gpt_params params; + g_params = ¶ms; + + if (!gpt_params_parse(argc, argv, params)) + { + gpt_params_print_usage(argc, argv, params); + return 1; + } + + llama_sampling_params &sparams = params.sparams; + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? 
+ // LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() + { console::cleanup(); }); + + if (params.logits_all) + { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) + { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) + { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) + { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) + { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + + if (params.seed == LLAMA_DEFAULT_SEED) + { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + LOG("%s: llama backend init\n", __func__); + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model *model; + llama_context *ctx; + llama_context *ctx_guidance = NULL; + std::vector chat_msgs; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; + if (sparams.cfg_scale > 1.f) + { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + + if (model == NULL) + { + LOG_TEE("%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) + { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print chat template example in conversation mode + if (params.conversation) + { + if (params.enable_chat_template) + { + LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + } + else + { + LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + + // print system information + { + LOG_TEE("\n"); + LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) + { + LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!file_exists(path_session)) + { + LOG_TEE("%s: session file does not exist, will create.\n", __func__); + } + else if (file_is_empty(path_session)) + { + LOG_TEE("%s: The session file is empty. 
A new session will be initialized.\n", __func__); + } + else + { + // The file exists and is not empty + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) + { + LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + } + } + + const bool add_bos = llama_add_bos_token(model); + if (!llama_model_has_encoder(model)) + { + GGML_ASSERT(!llama_add_eos_token(model)); + } + LOG("add_bos: %d\n", add_bos); + + std::vector embd_inp; + + { + auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) + ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + : params.prompt; + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) + { + LOG("tokenize the prompt\n"); + embd_inp = ::llama_tokenize(ctx, prompt, true, true); + } + else + { + LOG("use session tokens\n"); + embd_inp = session_tokens; + } + + LOG("prompt: \"%s\"\n", log_tostr(prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } + + // Should not run without any tokens + if (embd_inp.empty()) + { + if (add_bos) + { + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } + else + { + LOG_TEE("error: input is empty\n"); + return -1; + } + } + + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) + { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); + + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); + + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); + } + + if ((int)embd_inp.size() > n_ctx - 4) + { + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int)embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (!session_tokens.empty()) + { + for (llama_token id : session_tokens) + { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) + { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) + { + LOG_TEE("%s: using full prompt from session file\n", __func__); + } + else if (n_matching_session_tokens >= embd_inp.size()) + { + LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + } + else if (n_matching_session_tokens < (embd_inp.size() / 2)) + { + LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, 
embd_inp.size()); + } + else + { + LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + } + + LOGLN( + "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", + log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) + { + LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) + { + params.n_keep = (int)embd_inp.size(); + } + else + { + params.n_keep += add_bos; // always keep the BOS token + } + + if (params.conversation) + { + params.interactive_first = true; + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) + { + params.interactive = true; + } + + if (params.verbose_prompt) + { + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int)embd_inp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) + { + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int)guidance_inp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > add_bos) + { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) + { + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_TEE("'\n"); + } + LOG_TEE("\n"); + } + + // ctrl+C handling + { +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset(&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined(_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL + { + return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (params.interactive) + { + LOG_TEE("%s: interactive mode on.\n", __func__); + + if (!params.antiprompt.empty()) + { + for (const auto &antiprompt : params.antiprompt) + { + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) + { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int)tmp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + if (params.input_prefix_bos) + { + LOG_TEE("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) + { + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) + { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int)tmp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + + if (!params.input_suffix.empty()) + { + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) + { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int)tmp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + // group-attention state + // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) + int ga_i = 0; + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) + { + GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT + // GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT + // GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + } + LOG_TEE("\n\n"); + + if (params.interactive) + { + const char *control_message; + if (params.multiline_input) + { + control_message = " - To return control to the AI, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } + else + { + control_message = " - Press Return to return control to the AI.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_TEE("== Running in interactive mode. 
==\n"); +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) + LOG_TEE(" - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_TEE("%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool display = true; + bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; + g_input_tokens = &input_tokens; + std::vector output_tokens; + g_output_tokens = &output_tokens; + std::ostringstream output_ss; + g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + display = params.display_prompt; + + std::vector embd; + std::vector embd_guidance; + + // tokenized antiprompts + std::vector> antiprompt_ids; + + antiprompt_ids.reserve(params.antiprompt.size()); + for (const std::string &antiprompt : params.antiprompt) + { + antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); + } + + struct llama_sampling_context *ctx_sampling = llama_sampling_init(sparams); + if (!ctx_sampling) + { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + if (llama_model_has_encoder(model)) + { + int enc_input_size = embd_inp.size(); + llama_token *enc_input_buf = embd_inp.data(); + + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) + { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) + { + decoder_start_token_id = llama_token_bos(model); + } + + embd_inp.clear(); + embd_inp.push_back(decoder_start_token_id); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) + { + // predict + if (!embd.empty()) + { + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) + { + const int skipped_tokens = (int)embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); + console::set_display(console::reset); + fflush(stdout); + } + + if (ga_n == 1) + { + // infinite text generation via context shifting + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int)embd.size() + std::max(0, guidance_offset) >= n_ctx) + { + if (params.n_predict == -2) + { + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left / 2; + + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm(ctx, 0, params.n_keep, params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + if (ctx_guidance) + { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + LOG("clear session path\n"); + path_session.clear(); + } + } + else + { + // context extension via Self-Extend + while (n_past >= ga_i + ga_w) + { + const int ib = (ga_n * ga_i) / ga_w; + const int bd = (ga_w / ga_n) * (ga_n - 1); + const int dd = (ga_w / ga_n) - ib * bd - ga_w; + + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib * bd, ga_i + ib * bd, n_past + ib * bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n, (ga_i + ib * bd) / ga_n, (ga_i + ib * bd + ga_w) / ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib * bd + ga_w, n_past + ib * bd, dd, ga_i + ib * bd + ga_w + dd, n_past + ib * bd + dd); + + llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib * bd); + llama_kv_cache_seq_div(ctx, 0, ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n); + llama_kv_cache_seq_add(ctx, 0, ga_i + ib * bd + ga_w, n_past + ib * bd, dd); + + n_past -= bd; + + ga_i += ga_w / ga_n; + + LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + } + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int)session_tokens.size()) + { + size_t i = 0; + for (; i < embd.size(); i++) + { + if (embd[i] != session_tokens[n_session_consumed]) + { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int)session_tokens.size()) + { + ++i; + break; + } + } + if (i > 0) + { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + if (ctx_guidance) + { + int input_size = 0; + llama_token *input_buf = NULL; + + if (n_past_guidance < (int)guidance_inp.size()) + { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) + { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end()); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); + } + else + { + 
input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) + { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) + { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int)embd.size(); i += params.n_batch) + { + int n_eval = (int)embd.size() - i; + if (n_eval > params.n_batch) + { + n_eval = params.n_batch; + } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) + { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.n_print > 0 && n_past % params.n_print == 0) + { + LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } + } + + if (!embd.empty() && !path_session.empty()) + { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + embd_guidance.clear(); + + if ((int)embd_inp.size() <= n_consumed && !is_interacting) + { + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) + { + need_to_save_session = false; + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG("saved session to %s\n", path_session.c_str()); + } + + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + + llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); + + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); + + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG("n_remain: %d\n", n_remain); + } + else + { + // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int)embd_inp.size(), n_consumed); + while ((int)embd_inp.size() > n_consumed) + { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); + + ++n_consumed; + if ((int)embd.size() >= params.n_batch) + { + break; + } + } + } + + // display text + if (input_echo && display) + { + for (auto id : embd) + { + const std::string token_str = llama_token_to_piece(ctx, id, params.special); + + // Console/Stream Output + fprintf(stdout, "%s", token_str.c_str()); + + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check + if (embd.size() > 1) + { + // Incoming Requested Tokens + input_tokens.push_back(id); + } + else + { + // Outgoing Generated Tokens + output_tokens.push_back(id); + output_ss << token_str; + } + + fflush(stdout); + } + } + + // reset color to default if there is no pending user input + if (input_echo && (int)embd_inp.size() == n_consumed) + { + console::set_display(console::reset); + display = true; + } + + // if not currently processing queued inputs; + if ((int)embd_inp.size() <= n_consumed) + { + // check for reverse prompt in the last n_prev 
tokens + if (!params.antiprompt.empty()) + { + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string &antiprompt : params.antiprompt) + { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) + { + if (params.interactive) + { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + // check for reverse prompt using special tokens + llama_token last_token = llama_sampling_last(ctx_sampling); + for (std::vector ids : antiprompt_ids) + { + if (ids.size() == 1 && last_token == ids[0]) + { + if (params.interactive) + { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + if (is_antiprompt) + { + LOG("found antiprompt: %s\n", last_output.c_str()); + } + } + + // deal with end of generation tokens in interactive mode + if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) + { + LOG("found an EOG token\n"); + + if (params.interactive) + { + if (!params.antiprompt.empty()) + { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + if (params.enable_chat_template) + { + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + } + is_interacting = true; + printf("\n"); + } + } + + // if current token is not EOG, we add it to current assistant message + if (params.conversation) + { + auto id = llama_sampling_last(ctx_sampling); + assistant_ss << llama_token_to_piece(ctx, id, false); + } + + if (n_past > 0 && is_interacting) + { + LOG("waiting for user input\n"); + + if (params.conversation) + { + printf("\n> "); + } + + if (params.input_prefix_bos) + { + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(model)); + } + + std::string buffer; + if (!params.input_prefix.empty() && !params.conversation) + { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + printf("%s", params.input_prefix.c_str()); + } + + // color user input only + console::set_display(console::user_input); + display = params.display_prompt; + + std::string line; + bool another_line = true; + do + { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + display = true; + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) + { + // append input suffix if any + if (!params.input_suffix.empty() && !params.conversation) + { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + printf("%s", params.input_suffix.c_str()); + } + + LOG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + if (params.escape) + { + string_process_escapes(buffer); + } + 
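+                    // conversation mode: run the raw user input through the model's chat template
+                    // before tokenizing the prefix / input / suffix below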
+ bool format_chat = params.conversation && params.enable_chat_template; + std::string user_inp = format_chat + ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) + { + llama_token eot = llama_token_eot(model); + embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + need_insert_eot = false; + } + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + for (size_t i = original_size; i < embd_inp.size(); ++i) + { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + // reset assistant message + assistant_ss.str(""); + + n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } + else + { + LOG("empty line, passing control back\n"); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) + { + if (is_interacting) + { + llama_sampling_reset(ctx_sampling); + } + is_interacting = false; + } + } + + // end of generation + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) + { + LOG_TEE(" [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
+ if (params.interactive && n_remain <= 0 && params.n_predict >= 0) + { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) + { + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) + { + llama_free(ctx_guidance); + } + llama_free(ctx); + llama_free_model(model); + + llama_sampling_free(ctx_sampling); + llama_backend_free(); + +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n"); +#endif // LOG_DISABLE_LOGS + + return 0; + } +} + +// --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- +#include "util.hpp" +int mainMain(std::vector &args, std::vector &retVal) +{ + args.insert(args.begin(), "llamalib.wasm"); + + int argc = args.size(); + char **argv = new char *[argc]; + for (int i = 0; i < argc; i++) + { + argv[i] = new char[args[i].size() + 1]; + strcpy(argv[i], args[i].c_str()); + } + + int ret = 0; + { + OutErrRedirect outerr(); + ret = main::main(argc, argv); + } + readOutFile(retVal); + readErrorFile(retVal); + + return ret; +} + +#include +EMSCRIPTEN_BINDINGS(llama_module) +{ + emscripten::register_vector("VectorString"); + emscripten::function("main", &mainMain); +} diff --git a/packages/llama/src-cpp/util.cpp b/packages/llama/src-cpp/util.cpp new file mode 100644 index 00000000..bbb68f3b --- /dev/null +++ b/packages/llama/src-cpp/util.cpp @@ -0,0 +1,67 @@ +#include "util.hpp" +#include +#include +#include +#include + +const char *const LLAMALIB_WASM = "llamalib.wasm"; + +ArgBuffer::ArgBuffer(const std::vector &args) +{ + argc = args.size() + 1; + argv = new char *[argc]; + argv[0] = const_cast(LLAMALIB_WASM); + for (int i = 1; i < argc; i++) + { + argv[i] = const_cast(args.at(i - 1).c_str()); + } +} + +ArgBuffer::~ArgBuffer() +{ + delete[] argv; +} + +OutErrRedirect::OutErrRedirect() +{ + fflush(stdout); + outBackup = dup(fileno(stdout)); + freopen("output.txt", "w", stdout); + + fflush(stderr); + errBackup = dup(fileno(stderr)); + freopen("error.txt", "w", stderr); +} + +OutErrRedirect::~OutErrRedirect() +{ + if (errBackup != -1) + { + fflush(stderr); + dup2(errBackup, fileno(stderr)); + close(errBackup); + } + + if (outBackup != -1) + { + fflush(stdout); + dup2(outBackup, fileno(stdout)); + close(outBackup); + } +} + +void readOutFile(std::vector &retVal) +{ + std::ifstream file("output.txt"); + std::stringstream output; + output << file.rdbuf(); + retVal.push_back(output.str()); +} + +void readErrorFile(std::vector &retVal) +{ + std::ifstream file("error.txt"); + std::stringstream output; + output << file.rdbuf(); + retVal.push_back(output.str()); +} diff --git a/packages/llama/src-cpp/util.hpp b/packages/llama/src-cpp/util.hpp new file mode 100644 index 00000000..d57a2684 --- /dev/null +++ b/packages/llama/src-cpp/util.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +class ArgBuffer +{ +public: + int argc; + char **argv; + + ArgBuffer(const std::vector &args); + ~ArgBuffer(); + + ArgBuffer(const ArgBuffer &) = delete; + ArgBuffer(ArgBuffer &&) = delete; + ArgBuffer &operator=(const ArgBuffer &) = delete; + ArgBuffer &operator=(ArgBuffer &&) = delete; +}; + +class OutErrRedirect +{ +private: + int outBackup = -1; + int errBackup = -1; + +public: + 
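+    // RAII helper: redirects stdout/stderr to output.txt / error.txt for the lifetime of the
+    // object and restores the original descriptors on destruction (see util.cpp)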
OutErrRedirect(); + ~OutErrRedirect(); + + OutErrRedirect(const OutErrRedirect &) = delete; + OutErrRedirect(OutErrRedirect &&) = delete; + OutErrRedirect &operator=(const OutErrRedirect &) = delete; + OutErrRedirect &operator=(OutErrRedirect &&) = delete; +}; + +void readOutFile(std::vector &retVal); +void readErrorFile(std::vector &retVal); diff --git a/packages/llama/src/index.ts b/packages/llama/src/index.ts new file mode 100644 index 00000000..c162fac5 --- /dev/null +++ b/packages/llama/src/index.ts @@ -0,0 +1,2 @@ +export * from "./llama.ts"; +export * from "./web-blob.ts"; \ No newline at end of file diff --git a/packages/llama/src/llama.ts b/packages/llama/src/llama.ts new file mode 100644 index 00000000..0af4155d --- /dev/null +++ b/packages/llama/src/llama.ts @@ -0,0 +1,96 @@ +// @ts-expect-error importing from a wasm file is resolved via a custom esbuild plugin +import load, { reset } from "../../../build/packages/llama/src-cpp/llamalib.wasm"; +import type { MainModule } from "../../../build/packages/llama/src-cpp/llamalib.js"; +import llamaMeta from "../../../vcpkg-overlays/llama/vcpkg.json" with { type: "json" }; + +// Ref: https://github.com/ggerganov/llama.cpp +// Ref: http://facebook.github.io/llama/llama_manual.html +// Ref: https://github.com/facebook/llama + +/** + * The llama WASM library, provides a simplified wrapper around the llama.cpp library. + * + * See [llama.cpp](https://github.com/ggerganov/llama.cpp) for more details. + * + * ```ts + * import { Llama, WebBlob } from "@hpcc-js/wasm-llama"; + * + * let llama = await Llama.load(); + * const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf"; + * const webBlob: Blob = await WebBlob.create(new URL(model)); + * + * const data: ArrayBuffer = await webBlob.arrayBuffer(); + * + * const embeddings = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + * ``` + */ +export class Llama { + + private constructor(protected _module: MainModule) { + } + + /** + * Compiles and instantiates the raw wasm. + * + * ::: info + * In general WebAssembly compilation is disallowed on the main thread if the buffer size is larger than 4KB, hence forcing `load` to be asynchronous; + * ::: + * + * @returns A promise to an instance of the Llama class. + */ + static load(): Promise { + return load().then((module: any) => { + return new Llama(module); + }); + } + + /** + * Unloades the compiled wasm instance. + */ + static unload() { + reset(); + } + + /** + * @returns The Llama c++ version + */ + version(): string { + return llamaMeta["version-string"]; + } + + /** + * Calculates the vector representation of the input text. + * + * @param text The input text. + * @param model The model to use for the embedding. + * + * @returns The embedding of the text using the model. + */ + embedding(text: string, model: Uint8Array): [number[]?] { + try { + this._module.FS_createDataFile("/", "embeddingModel.gguf", model, true, false, false); + } catch (e) { + console.error(e); + } + const args = new this._module.VectorString(); + args.push_back("-m"); args.push_back("/embeddingModel.gguf"); + args.push_back("--pooling"); args.push_back("mean"); + args.push_back("--log-disable"); + args.push_back("-p"); args.push_back(text); + args.push_back("--embd-output-format"); args.push_back("array"); + const embeddingResult = new this._module.VectorString(); + let retVal: [number[]?] 
= []; + try { + this._module.embedding(args, embeddingResult); + const cout = embeddingResult.get(0); + retVal = JSON.parse(cout); + } catch (e) { + console.error(e); + } finally { + embeddingResult.delete(); + args.delete(); + this._module.FS_unlink("/embeddingModel.gguf"); + } + return retVal; + } +} diff --git a/packages/llama/src/web-blob.ts b/packages/llama/src/web-blob.ts new file mode 100644 index 00000000..c5d2f4f7 --- /dev/null +++ b/packages/llama/src/web-blob.ts @@ -0,0 +1,113 @@ +// See: https://github.com/huggingface/huggingface.js/blob/main/packages/hub/src/utils/WebBlob.ts + +/** + * WebBlob is a Blob implementation for web resources that supports range requests. + */ + +interface WebBlobCreateOptions { + /** + * @default 1_000_000 + * + * Objects below that size will immediately be fetched and put in RAM, rather + * than streamed ad-hoc + */ + cacheBelow?: number; + /** + * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. + */ + fetch?: typeof fetch; +} + +export class WebBlob extends Blob { + static async create(url: URL, opts?: WebBlobCreateOptions): Promise { + const customFetch = opts?.fetch ?? fetch; + const response = await customFetch(url, { method: "HEAD" }); + + const size = Number(response.headers.get("content-length")); + const contentType = response.headers.get("content-type") || ""; + const supportRange = response.headers.get("accept-ranges") === "bytes"; + + if (!supportRange || size < (opts?.cacheBelow ?? 1_000_000)) { + return await (await customFetch(url)).blob(); + } + + return new WebBlob(url, 0, size, contentType, true, customFetch); + } + + private url: URL; + private start: number; + private end: number; + private contentType: string; + private full: boolean; + private fetch: typeof fetch; + + constructor(url: URL, start: number, end: number, contentType: string, full: boolean, customFetch: typeof fetch) { + super([]); + + this.url = url; + this.start = start; + this.end = end; + this.contentType = contentType; + this.full = full; + this.fetch = customFetch; + } + + override get size(): number { + return this.end - this.start; + } + + override get type(): string { + return this.contentType; + } + + override slice(start = 0, end = this.size): WebBlob { + if (start < 0 || end < 0) { + new TypeError("Unsupported negative start/end on FileBlob.slice"); + } + + const slice = new WebBlob( + this.url, + this.start + start, + Math.min(this.start + end, this.end), + this.contentType, + start === 0 && end === this.size ? 
this.full : false, + this.fetch + ); + + return slice; + } + + override async arrayBuffer(): Promise { + const result = await this.fetchRange(); + + return result.arrayBuffer(); + } + + override async text(): Promise { + const result = await this.fetchRange(); + + return result.text(); + } + + override stream(): ReturnType { + const stream = new TransformStream(); + + this.fetchRange() + .then((response) => response.body?.pipeThrough(stream)) + .catch((error) => stream.writable.abort(error.message)); + + return stream.readable; + } + + private fetchRange(): Promise { + const fetch = this.fetch; // to avoid this.fetch() which is bound to the instance instead of globalThis + if (this.full) { + return fetch(this.url); + } + return fetch(this.url, { + headers: { + Range: `bytes=${this.start}-${this.end - 1}`, + }, + }); + } +} \ No newline at end of file diff --git a/packages/llama/test/index-browser.ts b/packages/llama/test/index-browser.ts new file mode 100644 index 00000000..9ae2931d --- /dev/null +++ b/packages/llama/test/index-browser.ts @@ -0,0 +1 @@ +export * from "./llama.ts"; diff --git a/packages/llama/test/index-node.ts b/packages/llama/test/index-node.ts new file mode 100644 index 00000000..9ae2931d --- /dev/null +++ b/packages/llama/test/index-node.ts @@ -0,0 +1 @@ +export * from "./llama.ts"; diff --git a/packages/llama/test/llama.ts b/packages/llama/test/llama.ts new file mode 100644 index 00000000..0347db67 --- /dev/null +++ b/packages/llama/test/llama.ts @@ -0,0 +1,67 @@ +import { expect } from "chai"; +import { Llama, WebBlob } from "@hpcc-js/wasm-llama"; + +describe.only("llama", function () { + it("version", async function () { + let llama = await Llama.load(); + let v = llama.version(); + const v1 = v; + expect(v).to.be.a.string; + expect(v).to.be.not.empty; + expect(v).to.equal("b3623"); // Update README.md with the new version!!! 
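+        // A second load() without an intervening unload() should reuse the cached module and report the same version.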
+ + llama = await Llama.load(); + v = llama.version(); + expect(v).to.be.a.string; + expect(v).to.be.not.empty; + expect(v).equals(v1); + Llama.unload(); + + llama = await Llama.load(); + v = llama.version(); + expect(v).to.be.a.string; + expect(v).to.be.not.empty; + expect(v).equals(v1); + Llama.unload(); + }); + + it("test", async function () { + this.timeout(10000); + let llama = await Llama.load(); + const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf"; + const webBlob: Blob = await WebBlob.create(new URL(model)); + expect(webBlob.type).to.be.a.string; + expect(webBlob.type).equals("binary/octet-stream"); + const data: ArrayBuffer = await webBlob.arrayBuffer(); + expect(data).to.be.instanceOf(ArrayBuffer); + expect(data.byteLength).to.be.greaterThan(0); + + const embeddings = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + expect(embeddings).to.be.instanceOf(Array); + expect(embeddings.length).equals(1); + expect(embeddings[0]).to.be.a.instanceOf(Array); + expect(embeddings[0].length).to.be.greaterThan(0); + expect(embeddings[0][0]).to.be.a("number"); + + const embeddings2 = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + expect(embeddings2).to.be.instanceOf(Array); + expect(embeddings2.length).equals(1); + expect(embeddings2[0]).to.be.a.instanceOf(Array); + expect(embeddings2[0].length).to.be.greaterThan(0); + expect(embeddings2[0][0]).to.be.a("number"); + + expect(embeddings).to.deep.equal(embeddings2); + + Llama.unload(); + llama = await Llama.load(); + + const embeddings3 = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + expect(embeddings3).to.be.instanceOf(Array); + expect(embeddings3.length).equals(1); + expect(embeddings3[0]).to.be.a.instanceOf(Array); + expect(embeddings3[0].length).to.be.greaterThan(0); + expect(embeddings3[0][0]).to.be.a("number"); + + expect(embeddings).to.deep.equal(embeddings3); + }); +}); diff --git a/packages/llama/tsconfig.json b/packages/llama/tsconfig.json new file mode 100644 index 00000000..20a93433 --- /dev/null +++ b/packages/llama/tsconfig.json @@ -0,0 +1,11 @@ +{ + "extends": "../tsconfig.json", + "compilerOptions": { + "rootDir": "./src", + "declarationDir": "./types" + }, + "include": [ + "./src/**/*" + ], + "references": [] +} \ No newline at end of file diff --git a/packages/tsconfig.json b/packages/tsconfig.json index 19ce58c9..ba0e6674 100644 --- a/packages/tsconfig.json +++ b/packages/tsconfig.json @@ -4,6 +4,7 @@ "target": "ESNext", "declaration": true, "emitDeclarationOnly": true, + "resolveJsonModule": true, "strict": true, "skipLibCheck": true, "allowImportingTsExtensions": true, diff --git a/src-cpp/CMakeLists.txt b/src-cpp/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/typedoc.json b/typedoc.json index 7acaff59..996337cd 100644 --- a/typedoc.json +++ b/typedoc.json @@ -5,6 +5,7 @@ "./packages/duckdb/src/duckdb.ts", "./packages/expat/src/expat.ts", "./packages/graphviz/src/graphviz.ts", + "./packages/llama/src/llama.ts", "./packages/zstd/src/zstd.ts", ], "out": "./docs", diff --git a/vcpkg-overlays/llama/portfile.cmake b/vcpkg-overlays/llama/portfile.cmake new file mode 100644 index 00000000..0f42b80c --- /dev/null +++ b/vcpkg-overlays/llama/portfile.cmake @@ -0,0 +1,28 @@ +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO ggerganov/llama.cpp + REF "${VERSION}" + SHA512 
f59c5b4b0f24ace3e997bbaf69239d1b0c09f640cfdc1730976e5333aff2300f1c822b4a464c6d7b765f92264d48c5f79ccedb153cbeeaa55793785905136130 + HEAD_REF master +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DLLAMA_BUILD_TESTS=OFF + -DLLAMA_BUILD_EXAMPLES=OFF + -DLLAMA_BUILD_SERVER=OFF + -DGGML_OPENMP=OFF +) + +vcpkg_cmake_install() + +vcpkg_copy_pdbs() +vcpkg_fixup_pkgconfig() +vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/llama) + +file(INSTALL ${SOURCE_PATH}/common DESTINATION ${CURRENT_PACKAGES_DIR}/share/${PORT}) + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") +file(INSTALL ${SOURCE_PATH}/LICENSE DESTINATION ${CURRENT_PACKAGES_DIR}/share/${PORT} RENAME copyright) diff --git a/vcpkg-overlays/llama/vcpkg.json b/vcpkg-overlays/llama/vcpkg.json new file mode 100644 index 00000000..6b27a210 --- /dev/null +++ b/vcpkg-overlays/llama/vcpkg.json @@ -0,0 +1,16 @@ +{ + "name": "llama", + "version-string": "b3623", + "homepage": "https://github.com/ggerganov/llama.cpp", + "description": "Inference of LLaMA model in pure C/C++.", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ] +} \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json index 48696c1f..5dd83a56 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -9,6 +9,9 @@ { "name": "graphviz" }, + { + "name": "llama" + }, { "name": "triangle" },
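Beyond the unit tests above, the embeddings returned by `Llama.embedding` can be compared directly on the client, for example with cosine similarity. The following is a minimal sketch assuming the package is consumed as `@hpcc-js/wasm-llama` (as in `packages/llama/test/llama.ts`); the `cosine` helper and the second prompt string are illustrative only:

```ts
import { Llama, WebBlob } from "@hpcc-js/wasm-llama";

// Illustrative helper: cosine similarity between two embedding vectors of equal length.
function cosine(a: number[], b: number[]): number {
    let dot = 0, na = 0, nb = 0;
    for (let i = 0; i < a.length; ++i) {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
    }
    return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

async function demo() {
    const llama = await Llama.load();

    // Same embedding model as used in the tests above.
    const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf";
    const data = await (await WebBlob.create(new URL(model))).arrayBuffer();

    const [hello] = llama.embedding("Hello and Welcome!", new Uint8Array(data));
    const [bye] = llama.embedding("Goodbye and farewell!", new Uint8Array(data));
    if (hello && bye) {
        console.log("cosine similarity:", cosine(hello, bye));
    }

    Llama.unload();
}

demo().catch(console.error);
```

Note that `embedding` executes synchronously inside the wasm module, so for larger models it may be worth driving it from a Web Worker to keep the UI responsive.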