From 561780396ddf1c4abc9fc5e5ff32f685b3df1933 Mon Sep 17 00:00:00 2001 From: AlpinDale <52078762+AlpinDale@users.noreply.github.com> Date: Fri, 23 Feb 2024 19:31:54 +0000 Subject: [PATCH] server : add KV cache quantization options (#5684) --- examples/server/server.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 369121e885b27..524d0ada33ab0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1948,6 +1948,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: f16)\n"); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: f16)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); @@ -2386,6 +2390,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, ); llama.process_system_prompt_data(json::parse(systm_content)); } + else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } + else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + } else if(arg == "--mmproj") { if (++i >= argc)