From 2f3e5c38f5e50b2dbe51cd9d8b3a3f9b0256faba Mon Sep 17 00:00:00 2001
From: Holden <34213478+hodlen@users.noreply.github.com>
Date: Sat, 16 Dec 2023 00:40:55 +0800
Subject: [PATCH] add gpu index opts and update doc commands

---
 README.md                    | 53 +++++++++++++++++++-----------------
 common/common.cpp            | 11 ++++----
 common/common.h              |  2 +-
 examples/batched/batched.cpp | 10 +++----
 llama.cpp                    |  2 +-
 llama.h                      |  2 +-
 scripts/export-gpu-split.py  |  2 +-
 7 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index 03296808..d13f6b2b 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,6 @@ The SparseLLM Team is currently converting the Mistral-7B model to a sparser ver
 
 - [Installation](##setup--installation)
 - [Model Weights](##model-weights)
-- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
 
 ## Setup & Installation
 ### Get the Code
@@ -60,28 +59,24 @@ git clone https://github.com/hodlen/PowerInfer
 cd PowerInfer
 ```
 ### Build
-In order to build PowerInfer you have two different options.
-
-- Using `make`:
-  - On Linux or MacOS:
-    ```bash
-    make
-    ```
-- Using `CMake`:
-  - If you have one GPU:
-    ```bash
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CUBLAS=ON
-    cmake --build . --config Release
-    ```
-  - If you just CPU:
-    ```bash
-    mkdir build
-    cd build
-    cmake ..
-    cmake --build . --config Release
-    ```
+To build PowerInfer, you have two options. Run the following commands from the root directory of the project.
+
+Using `make` on Linux or macOS:
+```bash
+make
+```
+
+Using `CMake`:
+* If you have one GPU:
+```bash
+cmake -S . -B build -DLLAMA_CUBLAS=ON
+cmake --build build --config Release
+```
+* If you only have a CPU:
+```bash
+cmake -S . -B build
+cmake --build build --config Release
+```
 
 ## Model Weights
 
@@ -96,11 +91,19 @@ In order to build PowerInfer you have two different options.
 ```bash
 ./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
 ```
-- If you have CPU with one consumer grade GPU:
+- If you have a CPU with one GPU:
 ```bash
- ./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
+./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
 ```
+For now, running with a GPU requires an offline-generated "GPU index" file to split FFNs onto the GPU. To try it, generate the GPU index file with the following command:
+```bash
+python scripts/export-gpu-split.py $(activation_count_path) $(output_idx_path) solver
+```
+Then, run PowerInfer with the GPU index using the following command:
+```bash
+./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt) --gpu-index $(split_path)
+```
 
 ## Evaluation
 
diff --git a/common/common.cpp b/common/common.cpp
index 429f313a..936e114c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -471,12 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
-        } else if (arg == "--mlp-adapter") {
+        } else if (arg == "--gpu-index") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mlp_adapter = argv[i];
+            params.gpu_index = argv[i];
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -970,9 +970,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (llama_use_sparse_inference(model)) {
         fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
-        if (!params.mlp_adapter.empty()) {
-            fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
-            int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
+        if (!params.gpu_index.empty()) {
+            int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
             if (err != 0) {
                 fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
                 llama_free_model(model);
@@ -1358,7 +1357,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
         fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
+    fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/common/common.h b/common/common.h
index 20497018..c16eb86e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -91,7 +91,7 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
-    std::string mlp_adapter = ""; // sparse activation mlp adapter path
+    std::string gpu_index = ""; // path to the offline-generated GPU index used to split FFNs onto the GPU
 
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index b9a355b0..fcfe9140 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -49,11 +49,11 @@ int main(int argc, char ** argv) {
     }
 
     if (argc >= 8) {
-        params.mlp_adapter = argv[7];
+        params.gpu_index = argv[7];
     }
 
-    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
-        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
+    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
+        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());
 
     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
@@ -76,8 +76,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    if (!params.mlp_adapter.empty()) {
-        int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
+    if (!params.gpu_index.empty()) {
+        int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
             llama_free_model(model);
diff --git a/llama.cpp b/llama.cpp
index e2e466d0..5814a1ea 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9660,7 +9660,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }
 
-int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
+int llama_model_apply_gpu_idx_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
     llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
     if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
         LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);
diff --git a/llama.h b/llama.h
index 1d73b522..0ab02cca 100644
--- a/llama.h
+++ b/llama.h
@@ -342,7 +342,7 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    LLAMA_API int llama_model_apply_mlp_from_file(
+    LLAMA_API int llama_model_apply_gpu_idx_from_file(
             struct llama_model * model,
             const char * path_mlp,
             bool use_mmap);
diff --git a/scripts/export-gpu-split.py b/scripts/export-gpu-split.py
index c028f6a3..30f44fe8 100644
--- a/scripts/export-gpu-split.py
+++ b/scripts/export-gpu-split.py
@@ -134,7 +134,7 @@ def main(predictors_path: str, output_path: str, solver_path: str):
     parser.add_argument(
         "output_path",
         help="path to the output GGML adapter",
-        default="./ggml-mlp-adapters.bin",
+        default="./gpu-index.bin",
     )
     parser.add_argument("solver", help="path to the solver")
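
For downstream code built against this branch, here is a minimal sketch of how the renamed `llama_model_apply_gpu_idx_from_file` API can be called, mirroring the pattern in `examples/batched/batched.cpp` above. The model and index paths are placeholders, and the surrounding setup calls follow the stock llama.cpp C API in `llama.h`, so treat it as illustrative rather than a drop-in program.

```cpp
// Illustrative sketch only: paths are placeholders, setup calls follow llama.h.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(/*numa=*/false);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("/PATH/TO/MODEL", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Apply the offline-generated GPU index (the call renamed from
    // llama_model_apply_mlp_from_file in this patch); the last argument
    // enables mmap, as in examples/batched/batched.cpp.
    if (llama_model_apply_gpu_idx_from_file(model, "/PATH/TO/gpu-index.bin", true) != 0) {
        fprintf(stderr, "failed to apply GPU index\n");
        llama_free_model(model);
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```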