Support setting VRAM budget for examples/server (#106)
* support --vram-budget for server

* set default batch size to 32 to avoid CUDA OOM
hodlen authored Dec 29, 2023
1 parent 79986ec commit 74c5c58
Showing 3 changed files with 24 additions and 1 deletion.
2 changes: 1 addition & 1 deletion common/common.h
@@ -49,7 +49,7 @@ struct gpt_params {
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 32; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
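
A note on the default change above: prompt processing evaluates the prompt in chunks of at most n_batch tokens, so the transient per-batch buffers on the GPU scale with the chunk size rather than the prompt length, which is why lowering the default from 512 to 32 reduces the chance of a CUDA out-of-memory error at the cost of some prompt-processing throughput. A minimal sketch of that chunking loop, with eval_chunk() as a hypothetical stand-in for the real forward pass (illustrative only, not repository code):

#include <algorithm>
#include <cstdio>
#include <vector>

using llama_token = int;

// Hypothetical stand-in for the model's forward pass. The real forward pass
// allocates per-batch activation buffers sized by n_eval, which is capped at
// n_batch below.
static void eval_chunk(const llama_token *tokens, int n_eval, int n_past) {
    (void)tokens;
    printf("evaluating %d tokens starting at position %d\n", n_eval, n_past);
}

static void eval_prompt(const std::vector<llama_token> &prompt, int n_batch) {
    int n_past = 0;
    for (size_t i = 0; i < prompt.size(); i += (size_t)n_batch) {
        const int n_eval = (int)std::min(prompt.size() - i, (size_t)n_batch);
        eval_chunk(prompt.data() + i, n_eval, n_past); // peak memory tracks n_eval
        n_past += n_eval;
    }
}

int main() {
    std::vector<llama_token> prompt(100, 0); // pretend 100-token prompt
    eval_prompt(prompt, 32);                 // evaluated as 32 + 32 + 32 + 4 tokens
}
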
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -27,6 +27,7 @@ Command line options:
 - `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 - `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `--vram-budget N`: VRAM budget in GiB (default: -1, -1 = available VRAM)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 
 ## Build
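
To make the -1 sentinel in the new option concrete, here is an illustrative sketch of turning a GiB budget into a byte count; available_vram_bytes() is a hypothetical helper, not part of this repository:

#include <cstddef>
#include <cstdio>

// Hypothetical helper: in real code the free VRAM would be queried from the
// CUDA runtime rather than hard-coded.
static size_t available_vram_bytes() { return (size_t)8 << 30; } // pretend 8 GiB free

static size_t budget_to_bytes(float vram_budget_gb) {
    if (vram_budget_gb < 0.0f) {
        return available_vram_bytes();              // -1 sentinel: use what is available
    }
    return (size_t)(vram_budget_gb * (1ull << 30)); // GiB -> bytes
}

int main() {
    printf("%zu\n", budget_to_bytes(-1.0f)); // 8589934592 (all 8 GiB)
    printf("%zu\n", budget_to_bytes(4.0f));  // 4294967296 (explicit 4 GiB cap)
}
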
22 changes: 22 additions & 0 deletions examples/server/server.cpp
@@ -1804,6 +1804,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --vram-budget N VRAM budget in GiB (default: -1, -1 = available VRAM)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf("\n");
}
@@ -2150,6 +2151,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             );
             llama.process_system_prompt_data(json::parse(systm_content));
         }
+        else if (arg == "--vram-budget")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.vram_budget_gb = std::stof(argv[i]);
+#else
+            fprintf(stderr, "warning: PowerInfer was compiled without cuBLAS. It is not possible to set a VRAM budget.\n");
+#endif
+        }
+        else if (arg == "--reset-gpu-index")
+        {
+            params.reset_gpu_index = true;
+        }
+        else if (arg == "--disable-gpu-index")
+        {
+            params.disale_gpu_index = true;
+        }
         else if(arg == "--mmproj")
         {
             if (++i >= argc)
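
The new branch above follows the parser's existing pattern: advance i to consume the flag's value, mark the parameter invalid if argv ran out, and only honor the value when the build has cuBLAS. A self-contained sketch of that same pattern (illustrative only, not repository code):

#include <cstdio>
#include <string>

int main(int argc, char **argv) {
    float vram_budget_gb = -1.0f; // -1 = all available VRAM
    bool  invalid_param  = false;

    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--vram-budget") {
            if (++i >= argc) { // flag given without a value
                invalid_param = true;
                break;
            }
#ifdef GGML_USE_CUBLAS
            vram_budget_gb = std::stof(argv[i]); // parse the GiB value
#else
            fprintf(stderr, "warning: built without cuBLAS; --vram-budget has no effect.\n");
#endif
        }
    }

    if (invalid_param) {
        fprintf(stderr, "error: --vram-budget expects a value\n");
        return 1;
    }
    printf("vram budget: %.1f GiB\n", vram_budget_gb);
    return 0;
}
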
