Support setting VRAM budget for examples/server (#106)
* support --vram-budget for server

* set default batch size to 32 to avoid CUDA OOM
hodlen authored Dec 29, 2023
1 parent 79986ec commit 74c5c58
Showing 3 changed files with 24 additions and 1 deletion.
2 changes: 1 addition & 1 deletion common/common.h
@@ -49,7 +49,7 @@ struct gpt_params {
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 32; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
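
A note on the default change above: prompt processing evaluates the prompt in chunks of at most n_batch tokens, so the transient per-batch buffers on the GPU scale with the chunk size rather than the prompt length, which is why lowering the default from 512 to 32 reduces the chance of a CUDA out-of-memory error at the cost of some prompt-processing throughput. A minimal sketch of that chunking loop, with eval_chunk() as a hypothetical stand-in for the real forward pass (illustrative only, not repository code):

#include <algorithm>
#include <cstdio>
#include <vector>

using llama_token = int;

// Hypothetical stand-in for the model's forward pass. The real forward pass
// allocates per-batch activation buffers sized by n_eval, which is capped at
// n_batch below.
static void eval_chunk(const llama_token *tokens, int n_eval, int n_past) {
    (void)tokens;
    printf("evaluating %d tokens starting at position %d\n", n_eval, n_past);
}

static void eval_prompt(const std::vector<llama_token> &prompt, int n_batch) {
    int n_past = 0;
    for (size_t i = 0; i < prompt.size(); i += (size_t)n_batch) {
        const int n_eval = (int)std::min(prompt.size() - i, (size_t)n_batch);
        eval_chunk(prompt.data() + i, n_eval, n_past); // peak memory tracks n_eval
        n_past += n_eval;
    }
}

int main() {
    std::vector<llama_token> prompt(100, 0); // pretend 100-token prompt
    eval_prompt(prompt, 32);                 // evaluated as 32 + 32 + 32 + 4 tokens
}
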
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -27,6 +27,7 @@ Command line options:
 - `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 - `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `--vram-budget N`: VRAM budget in GiB (default: -1, -1 = available VRAM)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 
 ## Build
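
To make the -1 sentinel in the new option concrete, here is an illustrative sketch of turning a GiB budget into a byte count; available_vram_bytes() is a hypothetical helper, not part of this repository:

#include <cstddef>
#include <cstdio>

// Hypothetical helper: in real code the free VRAM would be queried from the
// CUDA runtime rather than hard-coded.
static size_t available_vram_bytes() { return (size_t)8 << 30; } // pretend 8 GiB free

static size_t budget_to_bytes(float vram_budget_gb) {
    if (vram_budget_gb < 0.0f) {
        return available_vram_bytes();              // -1 sentinel: use what is available
    }
    return (size_t)(vram_budget_gb * (1ull << 30)); // GiB -> bytes
}

int main() {
    printf("%zu\n", budget_to_bytes(-1.0f)); // 8589934592 (all 8 GiB)
    printf("%zu\n", budget_to_bytes(4.0f));  // 4294967296 (explicit 4 GiB cap)
}
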
22 changes: 22 additions & 0 deletions examples/server/server.cpp
@@ -1804,6 +1804,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --vram-budget N VRAM budget in GiB (default: -1, -1 = available VRAM)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf("\n");
}
@@ -2150,6 +2151,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             );
             llama.process_system_prompt_data(json::parse(systm_content));
         }
+        else if (arg == "--vram-budget")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.vram_budget_gb = std::stof(argv[i]);
+#else
+            fprintf(stderr, "warning: PowerInfer was compiled without cuBLAS. It is not possible to set a VRAM budget.\n");
+#endif
+        }
+        else if (arg == "--reset-gpu-index")
+        {
+            params.reset_gpu_index = true;
+        }
+        else if (arg == "--disable-gpu-index")
+        {
+            params.disale_gpu_index = true;
+        }
         else if(arg == "--mmproj")
         {
             if (++i >= argc)
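
The new branch above follows the parser's existing pattern: advance i to consume the flag's value, mark the parameter invalid if argv ran out, and only honor the value when the build has cuBLAS. A self-contained sketch of that same pattern (illustrative only, not repository code):

#include <cstdio>
#include <string>

int main(int argc, char **argv) {
    float vram_budget_gb = -1.0f; // -1 = all available VRAM
    bool  invalid_param  = false;

    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--vram-budget") {
            if (++i >= argc) { // flag given without a value
                invalid_param = true;
                break;
            }
#ifdef GGML_USE_CUBLAS
            vram_budget_gb = std::stof(argv[i]); // parse the GiB value
#else
            fprintf(stderr, "warning: built without cuBLAS; --vram-budget has no effect.\n");
#endif
        }
    }

    if (invalid_param) {
        fprintf(stderr, "error: --vram-budget expects a value\n");
        return 1;
    }
    printf("vram budget: %.1f GiB\n", vram_budget_gb);
    return 0;
}
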
