This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

Commit

Merge pull request #27 from Atome-FE/feature/update-llama-rs
feat: upgrade to newest llama-rs so that we can support ggjt model
hlhr202 authored Apr 23, 2023
2 parents e7d1795 + 360a83b commit a545f3c
Showing 21 changed files with 512 additions and 152 deletions.
1 change: 1 addition & 0 deletions .github/workflows/llama-build.yml
@@ -7,6 +7,7 @@ on:
pull_request:
branches:
- master
- main
types:
- ready_for_review
- review_requested
36 changes: 23 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default.

57 changes: 47 additions & 10 deletions README-zh-CN.md
@@ -22,6 +22,8 @@ Large language model LLaMA running on Node.js.
- [Install](#安装)
- [Getting the weights](#模型获取)
- [Model versioning](#模型版本)
- [llama.cpp](#llamacpp)
- [llama-rs](#llama-rs)
- [Usage (llama.cpp backend)](#使用llamacpp后端)
- [Inference](#推理)
- [Tokenize](#分词)
@@ -87,13 +89,48 @@ llama-node calls llama-rs under the hood, and the model format it uses is derived from llama.cpp.

### Model versioning

There are currently 3 versions in the llama.cpp community:
#### llama.cpp

The following are the model types supported by llama.cpp, which can be found in the ggml.h source:

```c
enum ggml_type {
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q4_2 = 4,
GGML_TYPE_Q4_3 = 5,
GGML_TYPE_Q8_0 = 6,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
};
```

- GGML: legacy format, the earliest ggml tensor file format.
- GGMF: also a legacy format, newer than GGML but older than GGJT.
- GGJT: the mmap-able format.
#### llama-rs

The following are the model types supported by llama-rs, which can be found in llama-rs's ggml bindings:

```rust
pub enum Type {
/// Quantized 4-bit (type 0).
#[default]
Q4_0,
/// Quantized 4-bit (type 1); used by GPTQ.
Q4_1,
/// Integer 32-bit.
I32,
/// Float 16-bit.
F16,
/// Float 32-bit.
F32,
}
```

The llama-rs backend currently only supports GGML/GGMF models. The llama.cpp backend only supports GGJT models.
llama-rs also supports legacy ggml/ggmf models.
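
In practice, the chosen backend has to be able to read the container format of the model file: the llama.cpp backend only reads GGJT, while legacy GGML/GGMF files need the llama-rs backend. Below is a minimal sketch of that rule, not part of this commit; it assumes an `LLamaRS` export at `llama-node/dist/llm/llama-rs.js`, mirroring the `LLamaCpp` import used in the examples further down, so check the package typings before relying on it.

```typescript
// Minimal sketch: pick a llama-node backend class from the model's container
// format. The LLamaRS import path is an assumption (see note above).
import { LLama } from "llama-node";
import { LLamaCpp } from "llama-node/dist/llm/llama-cpp.js";
import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";

type ModelFormat = "ggml" | "ggmf" | "ggjt";

// Legacy GGML/GGMF files need the llama-rs backend; GGJT files can use llama.cpp.
const backendFor = (format: ModelFormat) =>
    format === "ggjt" ? LLamaCpp : LLamaRS;

const llama = new LLama(backendFor("ggjt"));
```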

---

@@ -110,7 +147,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -163,7 +200,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -195,7 +232,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -363,7 +400,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -446,5 +483,5 @@ run();
- [ ] more platforms and processor architectures (with the best possible performance)
- [ ] tweak the embedding API to make the end token configurable
- [ ] command-line tool
- [ ] update llama-rs to support more models https://github.com/rustformers/llama-rs/pull/85 https://github.com/rustformers/llama-rs/issues/75
- [ ] update llama-rs to support more models https://github.com/rustformers/llama-rs/pull/141
- [ ] support for more native inference backends (e.g. rwkv)!
57 changes: 47 additions & 10 deletions README.md
@@ -24,6 +24,8 @@ This project is in an early stage, the API for nodejs may change in the future,
- [Install](#install)
- [Getting the weights](#getting-the-weights)
- [Model versioning](#model-versioning)
- [llama.cpp](#llamacpp)
- [llama-rs](#llama-rs)
- [Usage (llama.cpp backend)](#usage-llamacpp-backend)
- [Inference](#inference)
- [Tokenize](#tokenize)
@@ -89,13 +91,48 @@ llama-node uses llama-rs under the hood and uses the model format derived from llama.cpp.

### Model versioning

There are now 3 versions from the llama.cpp community:
#### llama.cpp

For llama.cpp, the supported types can be found in the ggml.h source:

```c
enum ggml_type {
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q4_2 = 4,
GGML_TYPE_Q4_3 = 5,
GGML_TYPE_Q8_0 = 6,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
};
```

- GGML: legacy format, oldest ggml tensor file format
- GGMF: also legacy format, newer than GGML, older than GGJT
- GGJT: mmap-able format
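
The three container formats can usually be told apart by the file's first four bytes. The sketch below is an illustration only, not part of this commit; the magic values (`0x67676d6c` for GGML, `0x67676d66` for GGMF, `0x67676a74` for GGJT, read as a little-endian uint32) are taken from llama.cpp sources of roughly this period, so verify them against the version you actually use.

```typescript
// Sketch: guess the ggml container format from a model file's magic number.
// The magic constants are assumptions taken from llama.cpp of this era.
import { openSync, readSync, closeSync } from "fs";

const FORMAT_BY_MAGIC: Record<number, string> = {
    0x67676d6c: "GGML (legacy, unversioned)",
    0x67676d66: "GGMF (legacy, versioned)",
    0x67676a74: "GGJT (mmap-able)",
};

export function detectModelFormat(path: string): string {
    const fd = openSync(path, "r");
    try {
        const buf = Buffer.alloc(4);
        readSync(fd, buf, 0, 4, 0);
        // The magic is stored as a 32-bit integer in little-endian byte order.
        return FORMAT_BY_MAGIC[buf.readUInt32LE(0)] ?? "unknown";
    } finally {
        closeSync(fd);
    }
}
```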
#### llama-rs

For llama-rs, the supported model types can be found in the llama-rs ggml bindings:

```rust
pub enum Type {
/// Quantized 4-bit (type 0).
#[default]
Q4_0,
/// Quantized 4-bit (type 1); used by GPTQ.
Q4_1,
/// Integer 32-bit.
I32,
/// Float 16-bit.
F16,
/// Float 32-bit.
F32,
}
```

The llama-rs backend now only supports GGML/GGMF models, and the llama.cpp backend only supports GGJT models.
llama-rs also supports legacy llama.cpp models.

---

Expand All @@ -112,7 +149,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -165,7 +202,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -197,7 +234,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -366,7 +403,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -452,5 +489,5 @@ The following steps will allow you to compile the binary with best quality on yo
- [ ] more platforms and cross compile (performance related)
- [ ] tweak embedding API, make end token configurable
- [ ] cli and interactive
- [ ] support more open source models as llama-rs planned https://github.com/rustformers/llama-rs/pull/85 https://github.com/rustformers/llama-rs/issues/75
- [ ] support more open source models as llama-rs planned https://github.com/rustformers/llama-rs/pull/141
- [ ] more backends (e.g. rwkv) support!
2 changes: 1 addition & 1 deletion example/src/langchain/langchain.ts
@@ -4,7 +4,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

2 changes: 1 addition & 1 deletion example/src/llama-cpp/embedding.ts
@@ -2,7 +2,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

4 changes: 2 additions & 2 deletions example/src/llama-cpp/llama-cpp.ts
@@ -2,7 +2,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);

@@ -22,7 +22,7 @@ const config: LoadConfig = {

llama.load(config);

const template = `How are you`;
const template = `How are you?`;

const prompt = `### Human:
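
For context, the template above is interpolated into a Vicuna-style prompt and passed to the completion call later in this file, which the diff cuts off. The sketch below is a reconstruction rather than the file's actual contents: the `createCompletion` parameter names are assumptions modeled on other llama-node examples, so consult `example/src/llama-cpp/llama-cpp.ts` in the repository for the real code.

```typescript
// Reconstruction sketch only; assumes `llama` was created and loaded as in
// the excerpt above, and that createCompletion takes these parameter names.
const template = `How are you?`;

const prompt = `### Human:
${template}

### Assistant:`;

llama.createCompletion(
    {
        prompt,
        nThreads: 4,
        nTokPredict: 2048,
        topK: 40,
        topP: 0.1,
        temp: 0.2,
        repeatPenalty: 1,
    },
    (response) => {
        // Stream generated tokens to stdout as they arrive.
        process.stdout.write(response.token);
    }
);
```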
2 changes: 1 addition & 1 deletion example/src/llama-cpp/tokenize.ts
@@ -2,7 +2,7 @@ import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-4bit-rev1.bin");
const model = path.resolve(process.cwd(), "../ggml-vicuna-7b-1.1-q4_1.bin");

const llama = new LLama(LLamaCpp);
