From 9c20f5620e5c707c0eead50522a68739806d2fff Mon Sep 17 00:00:00 2001 From: Gordon Smith Date: Mon, 26 Aug 2024 13:47:21 +0100 Subject: [PATCH] feat: Add llama.cpp web assembly support Primarily for calculating embeddings on the client Signed-off-by: Gordon Smith --- .gitignore | 5 +- .vscode/settings.json | 4 +- CMakeLists.txt | 3 +- docs/.vitepress/config.js | 1 + docs/index.md | 3 + package-lock.json | 300 ++++++- package.json | 20 +- packages/llama/.vscode/launch.json | 44 + packages/llama/.vscode/tasks.json | 73 ++ packages/llama/CHANGELOG.md | 2 + packages/llama/README.md | 25 + packages/llama/esbuild.mjs | 9 + packages/llama/karma.conf.cjs | 20 + packages/llama/package.json | 57 ++ packages/llama/src-cpp/CMakeLists.txt | 50 ++ packages/llama/src-cpp/embedding.cpp | 428 +++++++++ packages/llama/src-cpp/main.cpp | 1199 +++++++++++++++++++++++++ packages/llama/src-cpp/util.cpp | 67 ++ packages/llama/src-cpp/util.hpp | 38 + packages/llama/src/index.ts | 2 + packages/llama/src/llama.ts | 96 ++ packages/llama/src/web-blob.ts | 113 +++ packages/llama/test/index-browser.ts | 1 + packages/llama/test/index-node.ts | 1 + packages/llama/test/llama.ts | 67 ++ packages/llama/tsconfig.json | 11 + packages/tsconfig.json | 1 + src-cpp/CMakeLists.txt | 0 typedoc.json | 1 + vcpkg-overlays/llama/portfile.cmake | 28 + vcpkg-overlays/llama/vcpkg.json | 16 + vcpkg.json | 3 + 32 files changed, 2641 insertions(+), 47 deletions(-) create mode 100644 packages/llama/.vscode/launch.json create mode 100644 packages/llama/.vscode/tasks.json create mode 100644 packages/llama/CHANGELOG.md create mode 100644 packages/llama/README.md create mode 100644 packages/llama/esbuild.mjs create mode 100644 packages/llama/karma.conf.cjs create mode 100644 packages/llama/package.json create mode 100644 packages/llama/src-cpp/CMakeLists.txt create mode 100644 packages/llama/src-cpp/embedding.cpp create mode 100644 packages/llama/src-cpp/main.cpp create mode 100644 packages/llama/src-cpp/util.cpp create mode 100644 packages/llama/src-cpp/util.hpp create mode 100644 packages/llama/src/index.ts create mode 100644 packages/llama/src/llama.ts create mode 100644 packages/llama/src/web-blob.ts create mode 100644 packages/llama/test/index-browser.ts create mode 100644 packages/llama/test/index-node.ts create mode 100644 packages/llama/test/llama.ts create mode 100644 packages/llama/tsconfig.json delete mode 100644 src-cpp/CMakeLists.txt create mode 100644 vcpkg-overlays/llama/portfile.cmake create mode 100644 vcpkg-overlays/llama/vcpkg.json diff --git a/.gitignore b/.gitignore index 25310caa..12581d52 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .nyc_output/ -/.vscode/c_cpp_properties.json -/.vscode/ipch +.vscode/c_cpp_properties.json +.vscode/ipch +.vscode/settings.json .nx/ bin/ build/ diff --git a/.vscode/settings.json b/.vscode/settings.json index 3f575e5e..e09c010a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,3 @@ { - "cmake.cmakePath": "${workspaceFolder}/scripts/cmake.sh", - "ecl.launchConfiguration": "not found", - "ecl.targetCluster": {} + "cmake.cmakePath": "${workspaceFolder}/scripts/cmake.sh" } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 68aea66b..5799d624 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ set(EM_LINK_FLAGS "-sUSE_GLFW=0" "-sALLOW_UNIMPLEMENTED_SYSCALLS=1" "-sINCOMING_MODULE_JS_API=\"['wasmBinary']\"" + "--no-entry" "--pre-js ${CMAKE_CURRENT_SOURCE_DIR}/src-cpp/src/pre.js" "--post-js 
${CMAKE_CURRENT_SOURCE_DIR}/src-cpp/src/post.js" ) @@ -79,8 +80,8 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(PACK_MODE "-d") endif () -add_subdirectory(src-cpp) add_subdirectory(packages/base91/src-cpp) add_subdirectory(packages/expat/src-cpp) add_subdirectory(packages/graphviz/src-cpp) +add_subdirectory(packages/llama/src-cpp) add_subdirectory(packages/zstd/src-cpp) diff --git a/docs/.vitepress/config.js b/docs/.vitepress/config.js index ff555074..e5353879 100644 --- a/docs/.vitepress/config.js +++ b/docs/.vitepress/config.js @@ -38,6 +38,7 @@ export default { { text: 'DuckDB', link: '/duckdb/src/duckdb/classes/DuckDB' }, { text: 'Expat', link: '/expat/src/expat/classes/Expat' }, { text: 'Graphviz', link: '/graphviz/src/graphviz/classes/Graphviz' }, + { text: 'Llama', link: '/llama/src/llama/classes/Llama' }, { text: 'Zstd', link: '/zstd/src/zstd/classes/Zstd' }, ] } diff --git a/docs/index.md b/docs/index.md index 97757c49..79f53b90 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,6 +35,9 @@ features: - title: GraphViz details: The Graphviz layout algorithms take descriptions of graphs in a simple text language, and make diagrams in useful formats, such as images and SVG for web pages or display in an interactive graph browser. link: /graphviz/src/graphviz/classes/Graphviz + - title: Llama + details: Inference of Meta's LLaMA model (and others) in pure C/C++. + link: /llama/src/llama/classes/Llama - title: Zstd details: Zstandard is a fast compression algorithm, providing high compression ratios and is backed by an extremely fast decoder. link: /zstd/src/zstd/classes/Zstd diff --git a/package-lock.json b/package-lock.json index e355c8dc..63b95a99 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,7 @@ ], "devDependencies": { "@eslint/js": "9.9.0", - "@hpcc-js/esbuild-plugins": "1.0.10", + "@hpcc-js/esbuild-plugins": "file:../Visualization/packages/esbuild-plugins", "@istanbuljs/nyc-config-typescript": "1.0.2", "@types/chai": "4.3.17", "@types/emscripten": "1.39.13", @@ -25,8 +25,10 @@ "assemblyscript": "0.27.29", "c8": "10.1.2", "chai": "5.1.1", + "chokidar-cli": "^3.0.0", "coveralls": "3.1.1", "eslint": "9.9.0", + "fzstd": "^0.1.1", "globals": "15.9.0", "karma": "6.4.4", "karma-chai": "0.1.0", @@ -50,6 +52,18 @@ "vitepress": "1.3.2" } }, + "../Visualization/packages/esbuild-plugins": { + "name": "@hpcc-js/esbuild-plugins", + "version": "1.1.0", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@hpcc-js/wasm-base91": "1.0.1", + "@hpcc-js/wasm-zstd": "1.0.1", + "fzstd": "0.1.1", + "yargs": "17.7.2" + } + }, "node_modules/@75lb/deep-merge": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@75lb/deep-merge/-/deep-merge-1.1.2.tgz", @@ -1507,37 +1521,8 @@ "license": "BSD-3-Clause" }, "node_modules/@hpcc-js/esbuild-plugins": { - "version": "1.0.10", - "resolved": "https://registry.npmjs.org/@hpcc-js/esbuild-plugins/-/esbuild-plugins-1.0.10.tgz", - "integrity": "sha512-sXJEo5GKDu7uMbuQ72XV8jZBflyKKappUYNSjs6wITxcMzbi3o+WsWyyPeUYOPhj9bgguJ4Biy6AAPd1ma3Mxw==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@hpcc-js/wasm-base91": "1.0.1", - "@hpcc-js/wasm-zstd": "1.0.1", - "fzstd": "0.1.1", - "yargs": "17.7.2" - } - }, - "node_modules/@hpcc-js/esbuild-plugins/node_modules/@hpcc-js/wasm-base91": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@hpcc-js/wasm-base91/-/wasm-base91-1.0.1.tgz", - "integrity": "sha512-MinpvSv+yTf1PRJ/cuhZzgi+vkle1YAVPkdD6CFzN7YqkW5K75WYfSo2YlnniKLh3SqksCO3uFqEIg8vBHhp1g==", - "dev": 
true, - "license": "Apache-2.0", - "dependencies": { - "yargs": "17.7.2" - } - }, - "node_modules/@hpcc-js/esbuild-plugins/node_modules/@hpcc-js/wasm-zstd": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@hpcc-js/wasm-zstd/-/wasm-zstd-1.0.1.tgz", - "integrity": "sha512-at8AGwzQVdyg6vSHX85HW1FtEeqU+mo7kzH3nsiWRi9QPyl0s2uu7YXkR9zCq6cEP0uPKImo3ip3xRAkPzNizA==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "yargs": "17.7.2" - } + "resolved": "../Visualization/packages/esbuild-plugins", + "link": true }, "node_modules/@hpcc-js/wasm": { "resolved": "packages/wasm", @@ -1563,6 +1548,10 @@ "resolved": "packages/graphviz-cli", "link": true }, + "node_modules/@hpcc-js/wasm-llama": { + "resolved": "packages/llama", + "link": true + }, "node_modules/@hpcc-js/wasm-zstd": { "resolved": "packages/zstd", "link": true @@ -5455,6 +5444,240 @@ "fsevents": "~2.3.2" } }, + "node_modules/chokidar-cli": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/chokidar-cli/-/chokidar-cli-3.0.0.tgz", + "integrity": "sha512-xVW+Qeh7z15uZRxHOkP93Ux8A0xbPzwK4GaqD8dQOYc34TlkqUhVSS59fK36DOp5WdJlrRzlYSy02Ht99FjZqQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "chokidar": "^3.5.2", + "lodash.debounce": "^4.0.8", + "lodash.throttle": "^4.1.1", + "yargs": "^13.3.0" + }, + "bin": { + "chokidar": "index.js" + }, + "engines": { + "node": ">= 8.10.0" + } + }, + "node_modules/chokidar-cli/node_modules/ansi-regex": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.1.tgz", + "integrity": "sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "license": "MIT", + "dependencies": { + "color-convert": "^1.9.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar-cli/node_modules/cliui": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-5.0.0.tgz", + "integrity": "sha512-PYeGSEmmHM6zvoef2w8TPzlrnNpXIjTipYK780YswmIP9vjxmd6Y2a3CB2Ks6/AU8NHjZugXvo8w3oWM2qnwXA==", + "dev": true, + "license": "ISC", + "dependencies": { + "string-width": "^3.1.0", + "strip-ansi": "^5.2.0", + "wrap-ansi": "^5.1.0" + } + }, + "node_modules/chokidar-cli/node_modules/color-convert": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", + "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", + "dev": true, + "license": "MIT", + "dependencies": { + "color-name": "1.1.3" + } + }, + "node_modules/chokidar-cli/node_modules/color-name": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", + "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==", + "dev": true, + "license": "MIT" + }, + "node_modules/chokidar-cli/node_modules/emoji-regex": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", + "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==", + "dev": true, + "license": "MIT" + }, + 
"node_modules/chokidar-cli/node_modules/find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "license": "MIT", + "dependencies": { + "locate-path": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/is-fullwidth-code-point": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha512-VHskAKYM8RfSFXwee5t5cbN5PZeq1Wrh6qd5bkyiXIf6UQcN6w/A0eXM9r6t8d+GYOh+o6ZhiEnb88LN/Y8m2w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar-cli/node_modules/locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/chokidar-cli/node_modules/p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-limit": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar-cli/node_modules/string-width": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", + "integrity": "sha512-vafcv6KjVZKSgz06oM/H6GDBrAtz8vdhQakGjFIvNrHA6y3HCF1CInLy+QLq8dTJPQ1b+KDUqDFctkdRW44e1w==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^7.0.1", + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^5.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/strip-ansi": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", + "integrity": "sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^4.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/wrap-ansi": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-5.1.0.tgz", + "integrity": "sha512-QC1/iN/2/RPVJ5jYK8BGttj5z83LmSKmvbvrXPNCLZSEb32KKVDJDl/MOt2N01qU2H/FkzEa9PKto1BqDjtd7Q==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "ansi-styles": "^3.2.0", + "string-width": "^3.0.0", + "strip-ansi": "^5.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/chokidar-cli/node_modules/y18n": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.3.tgz", + "integrity": "sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/chokidar-cli/node_modules/yargs": { + "version": "13.3.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-13.3.2.tgz", + "integrity": "sha512-AX3Zw5iPruN5ie6xGRIDgqkT+ZhnRlZMLMHAs8tg7nRruy2Nb+i5o9bwghAogtM08q1dpr2LVoS8KSTMYpWXUw==", + "dev": true, + "license": "MIT", + "dependencies": { + "cliui": "^5.0.0", + "find-up": "^3.0.0", + "get-caller-file": "^2.0.1", + "require-directory": "^2.1.1", + "require-main-filename": "^2.0.0", + "set-blocking": "^2.0.0", + "string-width": "^3.0.0", + "which-module": "^2.0.0", + "y18n": "^4.0.0", + "yargs-parser": "^13.1.2" + } + }, + "node_modules/chokidar-cli/node_modules/yargs-parser": { + "version": "13.1.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz", + "integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==", + "dev": true, + "license": "ISC", + "dependencies": { + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" + } + }, "node_modules/chokidar/node_modules/glob-parent": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", @@ -11236,6 +11459,13 @@ "dev": true, "license": "MIT" }, + "node_modules/lodash.debounce": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", + "integrity": "sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==", + "dev": true, + "license": "MIT" + }, "node_modules/lodash.flattendeep": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/lodash.flattendeep/-/lodash.flattendeep-4.4.0.tgz", @@ -18267,6 +18497,12 @@ }, "devDependencies": {} }, + "packages/llama": { + "name": "@hpcc-js/wasm-llama", + "version": "1.2.0", + "license": "Apache-2.0", + "devDependencies": {} + }, "packages/wasm": { "name": "@hpcc-js/wasm", "version": "2.20.0", diff --git a/package.json b/package.json index 8895e40c..0d7b59b8 100644 --- a/package.json +++ b/package.json @@ -36,13 +36,14 @@ "clean-root": "run-p clean-build clean-docs", "clean": "lerna run clean", "copy-res": "cp ./docs/*.png ./docs/.vitepress/dist", - "compile-asm": "asc ./src-asm/index.ts --target release", - "compile-cpp": "run-script-os", - "compile-cpp:linux": "./scripts/cpp-build.sh", - "compile-cpp:win32": "wsl -e ./scripts/cpp-build.sh", + "build-asm": "asc ./src-asm/index.ts --target release", "pack-duckdb": "lerna run pack-duckdb", "gen-docs": "typedoc", "gen-docs-watch": "typedoc --watch", + "build-cpp": "run-script-os", + "build-cpp:linux": "./scripts/cpp-build.sh", + "build-cpp:win32": "wsl -e ./scripts/cpp-build.sh", + "build-cpp-watch": "chokidar 'packages/*/src-cpp/**/*' -c 'npm run build-cpp'", "build-docs-vitepress": "vitepress build docs", "build-docs": "run-s pack-duckdb gen-docs build-docs-vitepress copy-res", "build-docs-watch": "vitepress dev docs", @@ -50,7 +51,7 @@ "build-docker-wasm": "npx -y mkdirp dist && docker run --rm -it --mount source=\"/hpcc-js-wasm/dist\",target=/usr/src/app/dist hpcc-js-wasm-build:latest", "build-docker": "run-s build-docker-image 
build-docker-wasm", "build-ws": "lerna run build", - "build": "run-s compile-cpp build-ws", + "build": "run-s build-cpp build-ws", "git-push": "git push --follow-tags upstream trunk", "serve-docs": "vitepress serve docs", "serve": "ws", @@ -68,9 +69,9 @@ "update-major-root": "npx -y npm-check-updates -u", "update-major": "npm run update-major-root && lerna run update-major" }, - "dependencies": {}, "devDependencies": { "@eslint/js": "9.9.0", + "@hpcc-js/esbuild-plugins": "file:../Visualization/packages/esbuild-plugins", "@istanbuljs/nyc-config-typescript": "1.0.2", "@types/chai": "4.3.17", "@types/emscripten": "1.39.13", @@ -80,11 +81,12 @@ "@typescript-eslint/eslint-plugin": "8.1.0", "@typescript-eslint/parser": "8.1.0", "assemblyscript": "0.27.29", + "c8": "10.1.2", "chai": "5.1.1", + "chokidar-cli": "^3.0.0", "coveralls": "3.1.1", - "c8": "10.1.2", - "@hpcc-js/esbuild-plugins": "1.0.10", "eslint": "9.9.0", + "fzstd": "^0.1.1", "globals": "15.9.0", "karma": "6.4.4", "karma-chai": "0.1.0", @@ -121,4 +123,4 @@ "url": "https://github.com/hpcc-systems/hpcc-js-wasm/issues" }, "homepage": "https://hpcc-systems.github.io/hpcc-js-wasm/" -} \ No newline at end of file +} diff --git a/packages/llama/.vscode/launch.json b/packages/llama/.vscode/launch.json new file mode 100644 index 00000000..a5e6e939 --- /dev/null +++ b/packages/llama/.vscode/launch.json @@ -0,0 +1,44 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "test-browser", + "type": "msedge", + "request": "launch", + "url": "file://${workspaceRoot}/test.html", + "webRoot": "${workspaceRoot}", + "sourceMapPathOverrides": { + "webpack:///./*": "${workspaceRoot}/*", + "webpack:///*": "/*" + }, + }, + { + "name": "test-node", + "type": "node", + "request": "launch", + "runtimeArgs": [ + "run-script", + "test-node" + ], + "runtimeExecutable": "npm", + "skipFiles": [ + "/**" + ], + "outFiles": [ + "${workspaceFolder}/**/*.js", + "${workspaceFolder}/**/*.c", + "!**/node_modules/**" + ], + }, + { + "name": "esbuild", + "type": "node", + "program": "${workspaceFolder}/esbuild.mjs", + "request": "launch", + "args": [], + "skipFiles": [ + "/**" + ] + } + ] +} \ No newline at end of file diff --git a/packages/llama/.vscode/tasks.json b/packages/llama/.vscode/tasks.json new file mode 100644 index 00000000..44163830 --- /dev/null +++ b/packages/llama/.vscode/tasks.json @@ -0,0 +1,73 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "npm", + "label": "build-types-watch", + "script": "build-types-watch", + "problemMatcher": [ + "$tsc-watch" + ], + "presentation": { + "group": "group-build" + } + }, + { + "type": "npm", + "label": "build-ts-watch", + "script": "build-ts-watch", + "problemMatcher": [ + "$tsc-watch" + ], + "presentation": { + "group": "group-build" + } + }, + { + "type": "npm", + "label": "build-cpp-watch", + "script": "build-cpp-watch", + "options": { + "cwd": "${workspaceFolder}/../.." 
+ }, + "problemMatcher": { + "owner": "cpp", + "fileLocation": [ + "relative", + "${workspaceFolder}" + ], + "pattern": { + "regexp": "^(.*):(\\d+):(\\d+):\\s+(warning|error):\\s+(.*)$", + "file": 1, + "line": 2, + "column": 3, + "severity": 4, + "message": 5 + } + }, + "presentation": { + "group": "group-build" + } + }, + { + "type": "npm", + "label": "Web Server", + "script": "serve", + "presentation": { + "group": "group-build" + } + }, + { + "label": "build", + "dependsOn": [ + "build-types-watch", + "build-ts-watch", + "build-cpp-watch" + ], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} \ No newline at end of file diff --git a/packages/llama/CHANGELOG.md b/packages/llama/CHANGELOG.md new file mode 100644 index 00000000..4dc68c6f --- /dev/null +++ b/packages/llama/CHANGELOG.md @@ -0,0 +1,2 @@ +# Changelog + diff --git a/packages/llama/README.md b/packages/llama/README.md new file mode 100644 index 00000000..7b98ee1a --- /dev/null +++ b/packages/llama/README.md @@ -0,0 +1,25 @@ +# @hpcc-js/wasm-llama + +## Installation + +```sh +npm install @hpcc-js/wasm-llama +``` + +## Quick Start + +```typescript +import { Llama, WebBlob } from "@hpcc-js/wasm-llama"; + +let llama = await Llama.load(); +const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf"; +const webBlob: Blob = await WebBlob.create(new URL(model)); + +const data: ArrayBuffer = await webBlob.arrayBuffer(); + +const embeddings = llama.embedding("Hello and Welcome!", new Uint8Array(data)); +``` + +## Reference + +* [API Documentation](https://hpcc-systems.github.io/hpcc-js-wasm/llama/src/llama/classes/llama.html) diff --git a/packages/llama/esbuild.mjs b/packages/llama/esbuild.mjs new file mode 100644 index 00000000..163a14d7 --- /dev/null +++ b/packages/llama/esbuild.mjs @@ -0,0 +1,9 @@ +import { browserTpl, neutralTpl, nodeTpl } from "@hpcc-js/esbuild-plugins"; + +// config --- +await neutralTpl("src/index.ts", "dist/index"); +await Promise.all([ + browserTpl("test/index-browser.ts", "dist-test/index.browser"), + nodeTpl("test/index-node.ts", "dist-test/index.node"), +]); + diff --git a/packages/llama/karma.conf.cjs b/packages/llama/karma.conf.cjs new file mode 100644 index 00000000..7910435e --- /dev/null +++ b/packages/llama/karma.conf.cjs @@ -0,0 +1,20 @@ +module.exports = function (config) { + config.set({ + frameworks: ['mocha'], + files: [ + { pattern: 'dist-test/index.browser.js', type: 'module' }, + { pattern: 'dist-test/*.js', watched: false, included: false, served: true, type: module } + ], + proxies: { + "/dist/": "/base/dist/", + "/dist-test/": "/base/dist-test/" + }, + reporters: ['spec'], + port: 9876, // karma web server port + colors: true, + logLevel: config.LOG_INFO, + browsers: ["Firefox", "ChromeHeadless"], + autoWatch: false, + concurrency: Infinity + }) +} \ No newline at end of file diff --git a/packages/llama/package.json b/packages/llama/package.json new file mode 100644 index 00000000..eb891382 --- /dev/null +++ b/packages/llama/package.json @@ -0,0 +1,57 @@ +{ + "name": "@hpcc-js/wasm-llama", + "version": "1.2.0", + "description": "hpcc-js - WASM llama", + "type": "module", + "exports": { + ".": { + "types": "./types/index.d.ts", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "types": "./types/index.d.ts", + "files": [ + "dist/**/*", + "src/**/*", + "types/**/*" + ], + "scripts": { + "clean": "rimraf ./dist ./dist-test ./types", + "build-cpp": "cmake --build ../../build --target llamalib", + 
"build-cpp-watch": "chokidar 'src-cpp/**.*' -c 'npm run build-cpp'", + "build-types": "tsc --project tsconfig.json --emitDeclarationOnly", + "build-types-watch": "npm run build-types -- --watch", + "build-ts": "node esbuild.mjs", + "build-ts-dev": "npm run build-ts -- --mode=development", + "build-ts-watch": "npm run build-ts-dev -- --watch", + "build-dev": "run-p build-types build-ts-dev", + "build": "run-p build-cpp build-types build-ts", + "lint-skypack": "npx -y @skypack/package-check", + "lint-eslint": "eslint src/**/*.ts", + "lint": "run-p lint-eslint", + "test-chrome": "karma start --single-run --browsers ChromiumHeadless karma.conf.cjs", + "test-firefox": "karma start --single-run --browsers Firefox karma.conf.cjs", + "test-node": "mocha ./dist-test/index.node.js --reporter spec", + "test": "run-s test-chrome test-node", + "update": "npx -y npm-check-updates -u -t minor", + "update-major": "npx -y npm-check-updates -u" + }, + "devDependencies": {}, + "keywords": [ + "graphviz", + "typescript", + "webassembly", + "wasm", + "dot", + "neato", + "twopi" + ], + "author": "hpcc-systems", + "repository": { + "type": "git", + "url": "git+https://github.com/hpcc-systems/hpcc-js-wasm.git" + }, + "homepage": "https://hpcc-systems.github.io/hpcc-js-wasm/", + "license": "Apache-2.0" +} \ No newline at end of file diff --git a/packages/llama/src-cpp/CMakeLists.txt b/packages/llama/src-cpp/CMakeLists.txt new file mode 100644 index 00000000..f4308c7e --- /dev/null +++ b/packages/llama/src-cpp/CMakeLists.txt @@ -0,0 +1,50 @@ +project(llamalib) + +set(CMAKE_CXX_STANDARD 11) + +find_package(Llama CONFIG REQUIRED) + +# See: https://github.com/emscripten-core/emscripten/blob/main/src/settings.js + +set(EM_CPP_FLAGS + ${EM_CPP_FLAGS} + "-fwasm-exceptions" +) +string(REPLACE ";" " " CPP_FLAGS "${EM_CPP_FLAGS}") + +set(EM_LINK_FLAGS + ${EM_LINK_FLAGS} + "-sEXPORT_NAME='${CMAKE_PROJECT_NAME}'" + "-sFILESYSTEM=1" + "-sFORCE_FILESYSTEM=1" + "-sWASMFS=1" + "-lembind" + "-fwasm-exceptions" + "--emit-tsd ${CMAKE_CURRENT_BINARY_DIR}/llamalib.d.ts" +) +string(REPLACE ";" " " LINK_FLAGS "${EM_LINK_FLAGS}") + +include_directories( + ${VCPKG_INCLUDE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ${Llama_DIR}/common +) + +add_executable(llamalib + main.cpp + embedding.cpp + util.cpp + ${Llama_DIR}/common/common.cpp + ${Llama_DIR}/common/sampling.cpp + ${Llama_DIR}/common/console.cpp + ${Llama_DIR}/common/grammar-parser.cpp + ${Llama_DIR}/common/json-schema-to-grammar.cpp + ${Llama_DIR}/common/build-info.cpp +) + +set_target_properties(llamalib PROPERTIES COMPILE_FLAGS "${CPP_FLAGS}") +set_target_properties(llamalib PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + +target_link_libraries(llamalib + PRIVATE llama +) diff --git a/packages/llama/src-cpp/embedding.cpp b/packages/llama/src-cpp/embedding.cpp new file mode 100644 index 00000000..33d9f806 --- /dev/null +++ b/packages/llama/src-cpp/embedding.cpp @@ -0,0 +1,428 @@ +// See: https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp --- + +#include "common.h" +#include "llama.h" + +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4244 4267) // possible loss of data +#endif + +namespace embedding +{ + + static std::vector split_lines(const std::string &s, const std::string &separator = "\n") + { + std::vector lines; + size_t start = 0; + size_t end = s.find(separator); + + while (end != std::string::npos) + { + lines.push_back(s.substr(start, end - start)); + start = end + separator.length(); + end = s.find(separator, start); + } + + 
lines.push_back(s.substr(start)); // Add the last part + + return lines; + } + + static void batch_add_seq(llama_batch &batch, const std::vector &tokens, llama_seq_id seq_id) + { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) + { + llama_batch_add(batch, tokens[i], i, {seq_id}, true); + } + } + + static void batch_decode(llama_context *ctx, llama_batch &batch, float *output, int n_seq, int n_embd, int embd_norm) + { + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + const struct llama_model *model = llama_get_model(ctx); + + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) + { + // encoder-only model + if (llama_encode(ctx, batch) < 0) + { + fprintf(stderr, "%s : failed to encode\n", __func__); + } + } + else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) + { + // decoder-only model + if (llama_decode(ctx, batch) < 0) + { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + } + + for (int i = 0; i < batch.n_tokens; i++) + { + if (!batch.logits[i]) + { + continue; + } + + const float *embd = nullptr; + int embd_pos = 0; + + if (pooling_type == LLAMA_POOLING_TYPE_NONE) + { + // try to get token embeddings + embd = llama_get_embeddings_ith(ctx, i); + embd_pos = i; + GGML_ASSERT(embd != NULL && "failed to get token embeddings"); + } + else + { + // try to get sequence embeddings - supported only when pooling_type is not NONE + embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + embd_pos = batch.seq_id[i][0]; + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); + } + + float *out = output + embd_pos * n_embd; + llama_embd_normalize(embd, out, n_embd, embd_norm); + } + } + + int main(int argc, char **argv) + { + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) + { + gpt_params_print_usage(argc, argv, params); + return 1; + } + + params.embedding = true; + // For non-causal models, batch size must be equal to ubatch size + params.n_ubatch = params.n_batch; + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) + { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model *model = llama_init.model; + llama_context *ctx = llama_init.context; + if (model == NULL) + { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + + if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) + { + fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); + return 1; + } + + if (n_ctx > n_ctx_train) + { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + } + + // split the prompt into lines + std::vector prompts = split_lines(params.prompt, params.embd_sep); + + // max 
batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch >= params.n_ctx); + + // tokenize the prompts and trim + std::vector> inputs; + for (const auto &prompt : prompts) + { + auto inp = ::llama_tokenize(ctx, prompt, true, false); + if (inp.size() > n_batch) + { + fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + __func__, (long long int)inp.size(), (long long int)n_batch); + return 1; + } + inputs.push_back(inp); + } + + // check if the last token is SEP + // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' + for (auto &inp : inputs) + { + if (inp.empty() || inp.back() != llama_token_sep(model)) + { + fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); + fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); + } + } + + // tokenization stats + if (params.verbose_prompt) + { + for (int i = 0; i < (int)inputs.size(); i++) + { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + for (int j = 0; j < (int)inputs[i].size(); j++) + { + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + } + fprintf(stderr, "\n\n"); + } + } + + // initialize batch + const int n_prompts = prompts.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + + // count number of embeddings + int n_embd_count = 0; + if (pooling_type == LLAMA_POOLING_TYPE_NONE) + { + for (int k = 0; k < n_prompts; k++) + { + n_embd_count += inputs[k].size(); + } + } + else + { + n_embd_count = n_prompts; + } + + // allocate output + const int n_embd = llama_n_embd(model); + std::vector embeddings(n_embd_count * n_embd, 0); + float *emb = embeddings.data(); + + // break into batches + int e = 0; // number of embeddings already stored + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_prompts; k++) + { + // clamp to n_batch tokens + auto &inp = inputs[k]; + + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) + { + float *out = emb + e * n_embd; + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); + e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; + s = 0; + llama_batch_clear(batch); + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; + } + + // final batch + float *out = emb + e * n_embd; + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); + + if (params.embd_out.empty()) + { + fprintf(stdout, "\n"); + + if (pooling_type == LLAMA_POOLING_TYPE_NONE) + { + for (int j = 0; j < n_embd_count; j++) + { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < std::min(3, n_embd); i++) + { + if (params.embd_normalize == 0) + { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } + else + { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, " ... 
"); + for (int i = n_embd - 3; i < n_embd; i++) + { + if (params.embd_normalize == 0) + { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } + else + { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); + } + } + else + { + // print the first part of the embeddings or for a single prompt, the full embedding + for (int j = 0; j < n_prompts; j++) + { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) + { + if (params.embd_normalize == 0) + { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } + else + { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + if (n_prompts > 1) + { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) + { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); + } + fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) + { + for (int j = 0; j < n_prompts; j++) + { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); + } + } + } + } + + if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") + { + const bool notArray = params.embd_out != "array"; + + fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + for (int j = 0;;) + { // at least one iteration (one prompt) + if (notArray) + fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ", j); + fprintf(stdout, "["); + for (int i = 0;;) + { // at least one iteration (n_embd > 0) + fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + i++; + if (i < n_embd) + fprintf(stdout, ","); + else + break; + } + fprintf(stdout, notArray ? "]\n }" : "]"); + j++; + if (j < n_embd_count) + fprintf(stdout, notArray ? ",\n" : ","); + else + break; + } + fprintf(stdout, notArray ? 
"\n ]" : "]\n"); + + if (params.embd_out == "json+" && n_prompts > 1) + { + fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + for (int i = 0;;) + { // at least two iteration (n_embd_count > 1) + fprintf(stdout, " ["); + for (int j = 0;;) + { // at least two iteration (n_embd_count > 1) + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f", sim); + j++; + if (j < n_embd_count) + fprintf(stdout, ", "); + else + break; + } + fprintf(stdout, " ]"); + i++; + if (i < n_embd_count) + fprintf(stdout, ",\n"); + else + break; + } + fprintf(stdout, "\n ]"); + } + + if (notArray) + fprintf(stdout, "\n}\n"); + } + + // clean up + llama_print_timings(ctx); + llama_batch_free(batch); + llama_free(ctx); + llama_free_model(model); + llama_backend_free(); + + return 0; + } +} +// --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- + +#include "util.hpp" +int embeddingMain(const std::vector &args, std::vector &retVal) +{ + ArgBuffer argBuffer(args); + int ret = 0; + { + OutErrRedirect outerr; + ret = embedding::main(argBuffer.argc, argBuffer.argv); + } + readOutFile(retVal); + readErrorFile(retVal); + + return ret; +} + +#include +EMSCRIPTEN_BINDINGS(llama_embedding) +{ + emscripten::function("embedding", &embeddingMain); +} diff --git a/packages/llama/src-cpp/main.cpp b/packages/llama/src-cpp/main.cpp new file mode 100644 index 00000000..71fbdb43 --- /dev/null +++ b/packages/llama/src-cpp/main.cpp @@ -0,0 +1,1199 @@ +// See: https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp --- + +#include "common.h" + +#include "console.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) +#include +#include +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable : 4244 4267) // possible loss of data +#endif + +namespace main +{ + static llama_context **g_ctx; + static llama_model **g_model; + static gpt_params *g_params; + static std::vector *g_input_tokens; + static std::ostringstream *g_output_ss; + static std::vector *g_output_tokens; + static bool is_interacting = false; + static bool need_insert_eot = false; + + static bool file_exists(const std::string &path) + { + std::ifstream f(path.c_str()); + return f.good(); + } + + static bool file_is_empty(const std::string &path) + { + std::ifstream f; + f.exceptions(std::ifstream::failbit | std::ifstream::badbit); + f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + return f.tellg() == 0; + } + + static void write_logfile( + const llama_context *ctx, const gpt_params ¶ms, const llama_model *model, + const std::vector &input_tokens, const std::string &output, + const std::vector &output_tokens) + { + if (params.logdir.empty()) + { + return; + } + + const std::string timestamp = string_get_sortable_timestamp(); + + const bool success = fs_create_directory_with_parents(params.logdir); + if (!success) + { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE *logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) + { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, 
logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + yaml_dump_string_multiline(logfile, "output", output.c_str()); + yaml_dump_vector_int(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); + } + +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) + static void sigint_handler(int signo) + { + if (signo == SIGINT) + { + if (!is_interacting && g_params->interactive) + { + is_interacting = true; + need_insert_eot = true; + } + else + { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } + } +#endif + + static void llama_log_callback_logTee(ggml_log_level level, const char *text, void *user_data) + { + (void)level; + (void)user_data; + LOG_TEE("%s", text); + } + + static std::string chat_add_and_format(struct llama_model *model, std::vector &chat_msgs, std::string role, std::string content) + { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single( + model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); + LOG("formatted: %s\n", formatted.c_str()); + return formatted; + } + + int main(int argc, char **argv) + { + gpt_params params; + g_params = ¶ms; + + if (!gpt_params_parse(argc, argv, params)) + { + gpt_params_print_usage(argc, argv, params); + return 1; + } + + llama_sampling_params &sparams = params.sparams; + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? 
+ // LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() + { console::cleanup(); }); + + if (params.logits_all) + { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) + { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) + { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) + { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) + { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + + if (params.seed == LLAMA_DEFAULT_SEED) + { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + LOG("%s: llama backend init\n", __func__); + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model *model; + llama_context *ctx; + llama_context *ctx_guidance = NULL; + std::vector chat_msgs; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; + if (sparams.cfg_scale > 1.f) + { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + + if (model == NULL) + { + LOG_TEE("%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) + { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print chat template example in conversation mode + if (params.conversation) + { + if (params.enable_chat_template) + { + LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + } + else + { + LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + + // print system information + { + LOG_TEE("\n"); + LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) + { + LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!file_exists(path_session)) + { + LOG_TEE("%s: session file does not exist, will create.\n", __func__); + } + else if (file_is_empty(path_session)) + { + LOG_TEE("%s: The session file is empty. 
A new session will be initialized.\n", __func__); + } + else + { + // The file exists and is not empty + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) + { + LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + } + } + + const bool add_bos = llama_add_bos_token(model); + if (!llama_model_has_encoder(model)) + { + GGML_ASSERT(!llama_add_eos_token(model)); + } + LOG("add_bos: %d\n", add_bos); + + std::vector embd_inp; + + { + auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) + ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + : params.prompt; + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) + { + LOG("tokenize the prompt\n"); + embd_inp = ::llama_tokenize(ctx, prompt, true, true); + } + else + { + LOG("use session tokens\n"); + embd_inp = session_tokens; + } + + LOG("prompt: \"%s\"\n", log_tostr(prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } + + // Should not run without any tokens + if (embd_inp.empty()) + { + if (add_bos) + { + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } + else + { + LOG_TEE("error: input is empty\n"); + return -1; + } + } + + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) + { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); + + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); + + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); + } + + if ((int)embd_inp.size() > n_ctx - 4) + { + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int)embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (!session_tokens.empty()) + { + for (llama_token id : session_tokens) + { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) + { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) + { + LOG_TEE("%s: using full prompt from session file\n", __func__); + } + else if (n_matching_session_tokens >= embd_inp.size()) + { + LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + } + else if (n_matching_session_tokens < (embd_inp.size() / 2)) + { + LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, 
embd_inp.size()); + } + else + { + LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + } + + LOGLN( + "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", + log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) + { + LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) + { + params.n_keep = (int)embd_inp.size(); + } + else + { + params.n_keep += add_bos; // always keep the BOS token + } + + if (params.conversation) + { + params.interactive_first = true; + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) + { + params.interactive = true; + } + + if (params.verbose_prompt) + { + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int)embd_inp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) + { + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int)guidance_inp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > add_bos) + { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) + { + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_TEE("'\n"); + } + LOG_TEE("\n"); + } + + // ctrl+C handling + { +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset(&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined(_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL + { + return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (params.interactive) + { + LOG_TEE("%s: interactive mode on.\n", __func__); + + if (!params.antiprompt.empty()) + { + for (const auto &antiprompt : params.antiprompt) + { + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) + { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int)tmp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + if (params.input_prefix_bos) + { + LOG_TEE("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) + { + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) + { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int)tmp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + + if (!params.input_suffix.empty()) + { + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) + { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int)tmp.size(); i++) + { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + // group-attention state + // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) + int ga_i = 0; + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) + { + GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT + // GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT + // GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + } + LOG_TEE("\n\n"); + + if (params.interactive) + { + const char *control_message; + if (params.multiline_input) + { + control_message = " - To return control to the AI, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } + else + { + control_message = " - Press Return to return control to the AI.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_TEE("== Running in interactive mode. 
==\n"); +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) + LOG_TEE(" - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_TEE("%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool display = true; + bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; + g_input_tokens = &input_tokens; + std::vector output_tokens; + g_output_tokens = &output_tokens; + std::ostringstream output_ss; + g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + display = params.display_prompt; + + std::vector embd; + std::vector embd_guidance; + + // tokenized antiprompts + std::vector> antiprompt_ids; + + antiprompt_ids.reserve(params.antiprompt.size()); + for (const std::string &antiprompt : params.antiprompt) + { + antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); + } + + struct llama_sampling_context *ctx_sampling = llama_sampling_init(sparams); + if (!ctx_sampling) + { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + if (llama_model_has_encoder(model)) + { + int enc_input_size = embd_inp.size(); + llama_token *enc_input_buf = embd_inp.data(); + + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) + { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) + { + decoder_start_token_id = llama_token_bos(model); + } + + embd_inp.clear(); + embd_inp.push_back(decoder_start_token_id); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) + { + // predict + if (!embd.empty()) + { + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) + { + const int skipped_tokens = (int)embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); + console::set_display(console::reset); + fflush(stdout); + } + + if (ga_n == 1) + { + // infinite text generation via context shifting + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int)embd.size() + std::max(0, guidance_offset) >= n_ctx) + { + if (params.n_predict == -2) + { + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left / 2; + + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm(ctx, 0, params.n_keep, params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + if (ctx_guidance) + { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + LOG("clear session path\n"); + path_session.clear(); + } + } + else + { + // context extension via Self-Extend + while (n_past >= ga_i + ga_w) + { + const int ib = (ga_n * ga_i) / ga_w; + const int bd = (ga_w / ga_n) * (ga_n - 1); + const int dd = (ga_w / ga_n) - ib * bd - ga_w; + + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib * bd, ga_i + ib * bd, n_past + ib * bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n, (ga_i + ib * bd) / ga_n, (ga_i + ib * bd + ga_w) / ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib * bd + ga_w, n_past + ib * bd, dd, ga_i + ib * bd + ga_w + dd, n_past + ib * bd + dd); + + llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib * bd); + llama_kv_cache_seq_div(ctx, 0, ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n); + llama_kv_cache_seq_add(ctx, 0, ga_i + ib * bd + ga_w, n_past + ib * bd, dd); + + n_past -= bd; + + ga_i += ga_w / ga_n; + + LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + } + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int)session_tokens.size()) + { + size_t i = 0; + for (; i < embd.size(); i++) + { + if (embd[i] != session_tokens[n_session_consumed]) + { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int)session_tokens.size()) + { + ++i; + break; + } + } + if (i > 0) + { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + if (ctx_guidance) + { + int input_size = 0; + llama_token *input_buf = NULL; + + if (n_past_guidance < (int)guidance_inp.size()) + { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) + { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end()); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); + } + else + { + 
input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) + { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) + { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int)embd.size(); i += params.n_batch) + { + int n_eval = (int)embd.size() - i; + if (n_eval > params.n_batch) + { + n_eval = params.n_batch; + } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) + { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.n_print > 0 && n_past % params.n_print == 0) + { + LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } + } + + if (!embd.empty() && !path_session.empty()) + { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + embd_guidance.clear(); + + if ((int)embd_inp.size() <= n_consumed && !is_interacting) + { + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) + { + need_to_save_session = false; + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG("saved session to %s\n", path_session.c_str()); + } + + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + + llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); + + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); + + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG("n_remain: %d\n", n_remain); + } + else + { + // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int)embd_inp.size(), n_consumed); + while ((int)embd_inp.size() > n_consumed) + { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); + + ++n_consumed; + if ((int)embd.size() >= params.n_batch) + { + break; + } + } + } + + // display text + if (input_echo && display) + { + for (auto id : embd) + { + const std::string token_str = llama_token_to_piece(ctx, id, params.special); + + // Console/Stream Output + fprintf(stdout, "%s", token_str.c_str()); + + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check + if (embd.size() > 1) + { + // Incoming Requested Tokens + input_tokens.push_back(id); + } + else + { + // Outgoing Generated Tokens + output_tokens.push_back(id); + output_ss << token_str; + } + + fflush(stdout); + } + } + + // reset color to default if there is no pending user input + if (input_echo && (int)embd_inp.size() == n_consumed) + { + console::set_display(console::reset); + display = true; + } + + // if not currently processing queued inputs; + if ((int)embd_inp.size() <= n_consumed) + { + // check for reverse prompt in the last n_prev 
tokens + if (!params.antiprompt.empty()) + { + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string &antiprompt : params.antiprompt) + { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) + { + if (params.interactive) + { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + // check for reverse prompt using special tokens + llama_token last_token = llama_sampling_last(ctx_sampling); + for (std::vector ids : antiprompt_ids) + { + if (ids.size() == 1 && last_token == ids[0]) + { + if (params.interactive) + { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + if (is_antiprompt) + { + LOG("found antiprompt: %s\n", last_output.c_str()); + } + } + + // deal with end of generation tokens in interactive mode + if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) + { + LOG("found an EOG token\n"); + + if (params.interactive) + { + if (!params.antiprompt.empty()) + { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + if (params.enable_chat_template) + { + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + } + is_interacting = true; + printf("\n"); + } + } + + // if current token is not EOG, we add it to current assistant message + if (params.conversation) + { + auto id = llama_sampling_last(ctx_sampling); + assistant_ss << llama_token_to_piece(ctx, id, false); + } + + if (n_past > 0 && is_interacting) + { + LOG("waiting for user input\n"); + + if (params.conversation) + { + printf("\n> "); + } + + if (params.input_prefix_bos) + { + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(model)); + } + + std::string buffer; + if (!params.input_prefix.empty() && !params.conversation) + { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + printf("%s", params.input_prefix.c_str()); + } + + // color user input only + console::set_display(console::user_input); + display = params.display_prompt; + + std::string line; + bool another_line = true; + do + { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + display = true; + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) + { + // append input suffix if any + if (!params.input_suffix.empty() && !params.conversation) + { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + printf("%s", params.input_suffix.c_str()); + } + + LOG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + if (params.escape) + { + string_process_escapes(buffer); + } + 
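+                    // conversation mode: run the raw user input through the model's chat template
+                    // before tokenizing the prefix / input / suffix below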
+ bool format_chat = params.conversation && params.enable_chat_template; + std::string user_inp = format_chat + ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) + { + llama_token eot = llama_token_eot(model); + embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + need_insert_eot = false; + } + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + for (size_t i = original_size; i < embd_inp.size(); ++i) + { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + // reset assistant message + assistant_ss.str(""); + + n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } + else + { + LOG("empty line, passing control back\n"); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) + { + if (is_interacting) + { + llama_sampling_reset(ctx_sampling); + } + is_interacting = false; + } + } + + // end of generation + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) + { + LOG_TEE(" [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
+ if (params.interactive && n_remain <= 0 && params.n_predict >= 0) + { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) + { + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) + { + llama_free(ctx_guidance); + } + llama_free(ctx); + llama_free_model(model); + + llama_sampling_free(ctx_sampling); + llama_backend_free(); + +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n"); +#endif // LOG_DISABLE_LOGS + + return 0; + } +} + +// --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- EMSCRIPTEN BINDINGS --- +#include "util.hpp" +int mainMain(std::vector &args, std::vector &retVal) +{ + args.insert(args.begin(), "llamalib.wasm"); + + int argc = args.size(); + char **argv = new char *[argc]; + for (int i = 0; i < argc; i++) + { + argv[i] = new char[args[i].size() + 1]; + strcpy(argv[i], args[i].c_str()); + } + + int ret = 0; + { + OutErrRedirect outerr(); + ret = main::main(argc, argv); + } + readOutFile(retVal); + readErrorFile(retVal); + + return ret; +} + +#include +EMSCRIPTEN_BINDINGS(llama_module) +{ + emscripten::register_vector("VectorString"); + emscripten::function("main", &mainMain); +} diff --git a/packages/llama/src-cpp/util.cpp b/packages/llama/src-cpp/util.cpp new file mode 100644 index 00000000..bbb68f3b --- /dev/null +++ b/packages/llama/src-cpp/util.cpp @@ -0,0 +1,67 @@ +#include "util.hpp" +#include +#include +#include +#include + +const char *const LLAMALIB_WASM = "llamalib.wasm"; + +ArgBuffer::ArgBuffer(const std::vector &args) +{ + argc = args.size() + 1; + argv = new char *[argc]; + argv[0] = const_cast(LLAMALIB_WASM); + for (int i = 1; i < argc; i++) + { + argv[i] = const_cast(args.at(i - 1).c_str()); + } +} + +ArgBuffer::~ArgBuffer() +{ + delete[] argv; +} + +OutErrRedirect::OutErrRedirect() +{ + fflush(stdout); + outBackup = dup(fileno(stdout)); + freopen("output.txt", "w", stdout); + + fflush(stderr); + errBackup = dup(fileno(stderr)); + freopen("error.txt", "w", stderr); +} + +OutErrRedirect::~OutErrRedirect() +{ + if (errBackup != -1) + { + fflush(stderr); + dup2(errBackup, fileno(stderr)); + close(errBackup); + } + + if (outBackup != -1) + { + fflush(stdout); + dup2(outBackup, fileno(stdout)); + close(outBackup); + } +} + +void readOutFile(std::vector &retVal) +{ + std::ifstream file("output.txt"); + std::stringstream output; + output << file.rdbuf(); + retVal.push_back(output.str()); +} + +void readErrorFile(std::vector &retVal) +{ + std::ifstream file("error.txt"); + std::stringstream output; + output << file.rdbuf(); + retVal.push_back(output.str()); +} diff --git a/packages/llama/src-cpp/util.hpp b/packages/llama/src-cpp/util.hpp new file mode 100644 index 00000000..d57a2684 --- /dev/null +++ b/packages/llama/src-cpp/util.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +class ArgBuffer +{ +public: + int argc; + char **argv; + + ArgBuffer(const std::vector &args); + ~ArgBuffer(); + + ArgBuffer(const ArgBuffer &) = delete; + ArgBuffer(ArgBuffer &&) = delete; + ArgBuffer &operator=(const ArgBuffer &) = delete; + ArgBuffer &operator=(ArgBuffer &&) = delete; +}; + +class OutErrRedirect +{ +private: + int outBackup = -1; + int errBackup = -1; + +public: + 
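+    // RAII helper: redirects stdout/stderr to output.txt / error.txt for the lifetime of the
+    // object and restores the original descriptors on destruction (see util.cpp)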
OutErrRedirect(); + ~OutErrRedirect(); + + OutErrRedirect(const OutErrRedirect &) = delete; + OutErrRedirect(OutErrRedirect &&) = delete; + OutErrRedirect &operator=(const OutErrRedirect &) = delete; + OutErrRedirect &operator=(OutErrRedirect &&) = delete; +}; + +void readOutFile(std::vector &retVal); +void readErrorFile(std::vector &retVal); diff --git a/packages/llama/src/index.ts b/packages/llama/src/index.ts new file mode 100644 index 00000000..c162fac5 --- /dev/null +++ b/packages/llama/src/index.ts @@ -0,0 +1,2 @@ +export * from "./llama.ts"; +export * from "./web-blob.ts"; \ No newline at end of file diff --git a/packages/llama/src/llama.ts b/packages/llama/src/llama.ts new file mode 100644 index 00000000..0af4155d --- /dev/null +++ b/packages/llama/src/llama.ts @@ -0,0 +1,96 @@ +// @ts-expect-error importing from a wasm file is resolved via a custom esbuild plugin +import load, { reset } from "../../../build/packages/llama/src-cpp/llamalib.wasm"; +import type { MainModule } from "../../../build/packages/llama/src-cpp/llamalib.js"; +import llamaMeta from "../../../vcpkg-overlays/llama/vcpkg.json" with { type: "json" }; + +// Ref: https://github.com/ggerganov/llama.cpp +// Ref: http://facebook.github.io/llama/llama_manual.html +// Ref: https://github.com/facebook/llama + +/** + * The llama WASM library, provides a simplified wrapper around the llama.cpp library. + * + * See [llama.cpp](https://github.com/ggerganov/llama.cpp) for more details. + * + * ```ts + * import { Llama, WebBlob } from "@hpcc-js/wasm-llama"; + * + * let llama = await Llama.load(); + * const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf"; + * const webBlob: Blob = await WebBlob.create(new URL(model)); + * + * const data: ArrayBuffer = await webBlob.arrayBuffer(); + * + * const embeddings = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + * ``` + */ +export class Llama { + + private constructor(protected _module: MainModule) { + } + + /** + * Compiles and instantiates the raw wasm. + * + * ::: info + * In general WebAssembly compilation is disallowed on the main thread if the buffer size is larger than 4KB, hence forcing `load` to be asynchronous; + * ::: + * + * @returns A promise to an instance of the Llama class. + */ + static load(): Promise { + return load().then((module: any) => { + return new Llama(module); + }); + } + + /** + * Unloades the compiled wasm instance. + */ + static unload() { + reset(); + } + + /** + * @returns The Llama c++ version + */ + version(): string { + return llamaMeta["version-string"]; + } + + /** + * Calculates the vector representation of the input text. + * + * @param text The input text. + * @param model The model to use for the embedding. + * + * @returns The embedding of the text using the model. + */ + embedding(text: string, model: Uint8Array): [number[]?] { + try { + this._module.FS_createDataFile("/", "embeddingModel.gguf", model, true, false, false); + } catch (e) { + console.error(e); + } + const args = new this._module.VectorString(); + args.push_back("-m"); args.push_back("/embeddingModel.gguf"); + args.push_back("--pooling"); args.push_back("mean"); + args.push_back("--log-disable"); + args.push_back("-p"); args.push_back(text); + args.push_back("--embd-output-format"); args.push_back("array"); + const embeddingResult = new this._module.VectorString(); + let retVal: [number[]?] 
= []; + try { + this._module.embedding(args, embeddingResult); + const cout = embeddingResult.get(0); + retVal = JSON.parse(cout); + } catch (e) { + console.error(e); + } finally { + embeddingResult.delete(); + args.delete(); + this._module.FS_unlink("/embeddingModel.gguf"); + } + return retVal; + } +} diff --git a/packages/llama/src/web-blob.ts b/packages/llama/src/web-blob.ts new file mode 100644 index 00000000..c5d2f4f7 --- /dev/null +++ b/packages/llama/src/web-blob.ts @@ -0,0 +1,113 @@ +// See: https://github.com/huggingface/huggingface.js/blob/main/packages/hub/src/utils/WebBlob.ts + +/** + * WebBlob is a Blob implementation for web resources that supports range requests. + */ + +interface WebBlobCreateOptions { + /** + * @default 1_000_000 + * + * Objects below that size will immediately be fetched and put in RAM, rather + * than streamed ad-hoc + */ + cacheBelow?: number; + /** + * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. + */ + fetch?: typeof fetch; +} + +export class WebBlob extends Blob { + static async create(url: URL, opts?: WebBlobCreateOptions): Promise { + const customFetch = opts?.fetch ?? fetch; + const response = await customFetch(url, { method: "HEAD" }); + + const size = Number(response.headers.get("content-length")); + const contentType = response.headers.get("content-type") || ""; + const supportRange = response.headers.get("accept-ranges") === "bytes"; + + if (!supportRange || size < (opts?.cacheBelow ?? 1_000_000)) { + return await (await customFetch(url)).blob(); + } + + return new WebBlob(url, 0, size, contentType, true, customFetch); + } + + private url: URL; + private start: number; + private end: number; + private contentType: string; + private full: boolean; + private fetch: typeof fetch; + + constructor(url: URL, start: number, end: number, contentType: string, full: boolean, customFetch: typeof fetch) { + super([]); + + this.url = url; + this.start = start; + this.end = end; + this.contentType = contentType; + this.full = full; + this.fetch = customFetch; + } + + override get size(): number { + return this.end - this.start; + } + + override get type(): string { + return this.contentType; + } + + override slice(start = 0, end = this.size): WebBlob { + if (start < 0 || end < 0) { + new TypeError("Unsupported negative start/end on FileBlob.slice"); + } + + const slice = new WebBlob( + this.url, + this.start + start, + Math.min(this.start + end, this.end), + this.contentType, + start === 0 && end === this.size ? 
this.full : false, + this.fetch + ); + + return slice; + } + + override async arrayBuffer(): Promise { + const result = await this.fetchRange(); + + return result.arrayBuffer(); + } + + override async text(): Promise { + const result = await this.fetchRange(); + + return result.text(); + } + + override stream(): ReturnType { + const stream = new TransformStream(); + + this.fetchRange() + .then((response) => response.body?.pipeThrough(stream)) + .catch((error) => stream.writable.abort(error.message)); + + return stream.readable; + } + + private fetchRange(): Promise { + const fetch = this.fetch; // to avoid this.fetch() which is bound to the instance instead of globalThis + if (this.full) { + return fetch(this.url); + } + return fetch(this.url, { + headers: { + Range: `bytes=${this.start}-${this.end - 1}`, + }, + }); + } +} \ No newline at end of file diff --git a/packages/llama/test/index-browser.ts b/packages/llama/test/index-browser.ts new file mode 100644 index 00000000..9ae2931d --- /dev/null +++ b/packages/llama/test/index-browser.ts @@ -0,0 +1 @@ +export * from "./llama.ts"; diff --git a/packages/llama/test/index-node.ts b/packages/llama/test/index-node.ts new file mode 100644 index 00000000..9ae2931d --- /dev/null +++ b/packages/llama/test/index-node.ts @@ -0,0 +1 @@ +export * from "./llama.ts"; diff --git a/packages/llama/test/llama.ts b/packages/llama/test/llama.ts new file mode 100644 index 00000000..0347db67 --- /dev/null +++ b/packages/llama/test/llama.ts @@ -0,0 +1,67 @@ +import { expect } from "chai"; +import { Llama, WebBlob } from "@hpcc-js/wasm-llama"; + +describe.only("llama", function () { + it("version", async function () { + let llama = await Llama.load(); + let v = llama.version(); + const v1 = v; + expect(v).to.be.a.string; + expect(v).to.be.not.empty; + expect(v).to.equal("b3623"); // Update README.md with the new version!!! 
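+        // A second load() without an intervening unload() should reuse the cached module and report the same version.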
+ + llama = await Llama.load(); + v = llama.version(); + expect(v).to.be.a.string; + expect(v).to.be.not.empty; + expect(v).equals(v1); + Llama.unload(); + + llama = await Llama.load(); + v = llama.version(); + expect(v).to.be.a.string; + expect(v).to.be.not.empty; + expect(v).equals(v1); + Llama.unload(); + }); + + it("test", async function () { + this.timeout(10000); + let llama = await Llama.load(); + const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf"; + const webBlob: Blob = await WebBlob.create(new URL(model)); + expect(webBlob.type).to.be.a.string; + expect(webBlob.type).equals("binary/octet-stream"); + const data: ArrayBuffer = await webBlob.arrayBuffer(); + expect(data).to.be.instanceOf(ArrayBuffer); + expect(data.byteLength).to.be.greaterThan(0); + + const embeddings = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + expect(embeddings).to.be.instanceOf(Array); + expect(embeddings.length).equals(1); + expect(embeddings[0]).to.be.a.instanceOf(Array); + expect(embeddings[0].length).to.be.greaterThan(0); + expect(embeddings[0][0]).to.be.a("number"); + + const embeddings2 = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + expect(embeddings2).to.be.instanceOf(Array); + expect(embeddings2.length).equals(1); + expect(embeddings2[0]).to.be.a.instanceOf(Array); + expect(embeddings2[0].length).to.be.greaterThan(0); + expect(embeddings2[0][0]).to.be.a("number"); + + expect(embeddings).to.deep.equal(embeddings2); + + Llama.unload(); + llama = await Llama.load(); + + const embeddings3 = llama.embedding("Hello and Welcome!", new Uint8Array(data)); + expect(embeddings3).to.be.instanceOf(Array); + expect(embeddings3.length).equals(1); + expect(embeddings3[0]).to.be.a.instanceOf(Array); + expect(embeddings3[0].length).to.be.greaterThan(0); + expect(embeddings3[0][0]).to.be.a("number"); + + expect(embeddings).to.deep.equal(embeddings3); + }); +}); diff --git a/packages/llama/tsconfig.json b/packages/llama/tsconfig.json new file mode 100644 index 00000000..20a93433 --- /dev/null +++ b/packages/llama/tsconfig.json @@ -0,0 +1,11 @@ +{ + "extends": "../tsconfig.json", + "compilerOptions": { + "rootDir": "./src", + "declarationDir": "./types" + }, + "include": [ + "./src/**/*" + ], + "references": [] +} \ No newline at end of file diff --git a/packages/tsconfig.json b/packages/tsconfig.json index 19ce58c9..ba0e6674 100644 --- a/packages/tsconfig.json +++ b/packages/tsconfig.json @@ -4,6 +4,7 @@ "target": "ESNext", "declaration": true, "emitDeclarationOnly": true, + "resolveJsonModule": true, "strict": true, "skipLibCheck": true, "allowImportingTsExtensions": true, diff --git a/src-cpp/CMakeLists.txt b/src-cpp/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/typedoc.json b/typedoc.json index 7acaff59..996337cd 100644 --- a/typedoc.json +++ b/typedoc.json @@ -5,6 +5,7 @@ "./packages/duckdb/src/duckdb.ts", "./packages/expat/src/expat.ts", "./packages/graphviz/src/graphviz.ts", + "./packages/llama/src/llama.ts", "./packages/zstd/src/zstd.ts", ], "out": "./docs", diff --git a/vcpkg-overlays/llama/portfile.cmake b/vcpkg-overlays/llama/portfile.cmake new file mode 100644 index 00000000..0f42b80c --- /dev/null +++ b/vcpkg-overlays/llama/portfile.cmake @@ -0,0 +1,28 @@ +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO ggerganov/llama.cpp + REF "${VERSION}" + SHA512 
f59c5b4b0f24ace3e997bbaf69239d1b0c09f640cfdc1730976e5333aff2300f1c822b4a464c6d7b765f92264d48c5f79ccedb153cbeeaa55793785905136130 + HEAD_REF master +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DLLAMA_BUILD_TESTS=OFF + -DLLAMA_BUILD_EXAMPLES=OFF + -DLLAMA_BUILD_SERVER=OFF + -DGGML_OPENMP=OFF +) + +vcpkg_cmake_install() + +vcpkg_copy_pdbs() +vcpkg_fixup_pkgconfig() +vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/llama) + +file(INSTALL ${SOURCE_PATH}/common DESTINATION ${CURRENT_PACKAGES_DIR}/share/${PORT}) + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") +file(INSTALL ${SOURCE_PATH}/LICENSE DESTINATION ${CURRENT_PACKAGES_DIR}/share/${PORT} RENAME copyright) diff --git a/vcpkg-overlays/llama/vcpkg.json b/vcpkg-overlays/llama/vcpkg.json new file mode 100644 index 00000000..6b27a210 --- /dev/null +++ b/vcpkg-overlays/llama/vcpkg.json @@ -0,0 +1,16 @@ +{ + "name": "llama", + "version-string": "b3623", + "homepage": "https://github.com/ggerganov/llama.cpp", + "description": "Inference of LLaMA model in pure C/C++.", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ] +} \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json index 48696c1f..5dd83a56 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -9,6 +9,9 @@ { "name": "graphviz" }, + { + "name": "llama" + }, { "name": "triangle" },
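Beyond the unit tests above, the embeddings returned by `Llama.embedding` can be compared directly on the client, for example with cosine similarity. The following is a minimal sketch assuming the package is consumed as `@hpcc-js/wasm-llama` (as in `packages/llama/test/llama.ts`); the `cosine` helper and the second prompt string are illustrative only:

```ts
import { Llama, WebBlob } from "@hpcc-js/wasm-llama";

// Illustrative helper: cosine similarity between two embedding vectors of equal length.
function cosine(a: number[], b: number[]): number {
    let dot = 0, na = 0, nb = 0;
    for (let i = 0; i < a.length; ++i) {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
    }
    return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

async function demo() {
    const llama = await Llama.load();

    // Same embedding model as used in the tests above.
    const model = "https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf";
    const data = await (await WebBlob.create(new URL(model))).arrayBuffer();

    const [hello] = llama.embedding("Hello and Welcome!", new Uint8Array(data));
    const [bye] = llama.embedding("Goodbye and farewell!", new Uint8Array(data));
    if (hello && bye) {
        console.log("cosine similarity:", cosine(hello, bye));
    }

    Llama.unload();
}

demo().catch(console.error);
```

Note that `embedding` executes synchronously inside the wasm module, so for larger models it may be worth driving it from a Web Worker to keep the UI responsive.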