From 473a84f5f7cb8860ccc33a21ededbd6097f4c66f Mon Sep 17 00:00:00 2001
From: Gadflyii
Date: Tue, 30 Sep 2025 10:21:49 -0500
Subject: [PATCH 1/3] implement --no-host to disable host buffer

---
 common/arg.cpp                    |  7 ++++++
 common/common.cpp                 |  1 +
 common/common.h                   |  1 +
 include/llama.h                   |  1 +
 src/llama-model.cpp               | 17 ++++++++------
 tools/llama-bench/llama-bench.cpp | 38 ++++++++++++++++++++++++++++---
 6 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 3c932264d0668..40edfc24393dd 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2642,6 +2642,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_extra_bufts = true;
         }
     ).set_env("LLAMA_ARG_NO_REPACK"));
+    add_opt(common_arg(
+        {"--no-host"},
+        "bypass host buffer allowing extra buffers to be used",
+        [](common_params & params) {
+            params.no_host = true;
+        }
+    ).set_env("LLAMA_ARG_NO_HOST"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
diff --git a/common/common.cpp b/common/common.cpp
index c1e736c44cf56..b0591e84b0668 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1133,6 +1133,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host         = params.no_host;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 40c6847f32ddb..789ea996a49a1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -392,6 +392,7 @@ struct common_params {
     bool check_tensors  = false; // validate tensor data
     bool no_op_offload  = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
+    bool no_host        = false; // bypass host buffer allowing extra buffers to be used
 
     bool single_turn = false; // single turn chat conversation
 
diff --git a/include/llama.h b/include/llama.h
index 452d9ec5bf285..4b0291b952ec6 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -296,6 +296,7 @@ extern "C" {
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
+        bool no_host;         // bypass host buffer allowing extra buffers to be used
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2470f87850f2b..213772224dd0f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -310,7 +310,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -331,11 +331,13 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     // generally, this will be done using the first device in the list
     // a better approach would be to handle this on a weight-by-weight basis using the offload_op
     // function of the device to determine if it would benefit from being stored in a host buffer
-    for (auto * dev : devices) {
-        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
-        if (buft) {
-            buft_list.emplace_back(dev, buft);
-            break;
+    if (!no_host) {
+        for (auto * dev : devices) {
+            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+            if (buft) {
+                buft_list.emplace_back(dev, buft);
+                break;
+            }
         }
     }
 
@@ -2062,7 +2064,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -19651,6 +19653,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
         /*.use_extra_bufts =*/ true,
+        /*.no_host         =*/ false,
     };
 
     return result;
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 275ba367c02f1..cfaebd2e11cdb 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -357,6 +357,7 @@ struct cmd_params {
     std::vector<bool>   use_mmap;
     std::vector<bool>   embeddings;
     std::vector<bool>   no_op_offload;
+    std::vector<bool>   no_host;
     ggml_numa_strategy  numa;
     int                 reps;
     ggml_sched_priority prio;
@@ -394,6 +395,7 @@ static const cmd_params cmd_params_defaults = {
    /* use_mmap         */ { true },
    /* embeddings       */ { false },
    /* no_op_offload    */ { false },
+   /* no_host          */ { false },
    /* numa             */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps             */ 5,
    /* prio             */ GGML_SCHED_PRIO_NORMAL,
@@ -474,6 +476,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
     printf("                                        (default: disabled)\n");
     printf("  -nopo, --no-op-offload <0|1>          (default: 0)\n");
+    printf("  --no-host <0|1>                       (default: %s)\n",
+           join(cmd_params_defaults.no_host, ",").c_str());
     printf("\n");
     printf(
         "Multiple values can be given for each parameter by separating them with ','\n"
@@ -803,6 +807,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
+        } else if (arg == "--no-host") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.no_host.insert(params.no_host.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1024,6 +1035,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.no_op_offload.empty()) {
         params.no_op_offload = cmd_params_defaults.no_op_offload;
     }
+    if (params.no_host.empty()) {
+        params.no_host = cmd_params_defaults.no_host;
+    }
     if (params.n_threads.empty()) {
         params.n_threads = cmd_params_defaults.n_threads;
     }
@@ -1065,6 +1079,7 @@ struct cmd_params_instance {
     bool               use_mmap;
     bool               embeddings;
     bool               no_op_offload;
+    bool               no_host;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -1077,6 +1092,7 @@ struct cmd_params_instance {
         mparams.main_gpu     = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap     = use_mmap;
+        mparams.no_host      = no_host;
 
         if (n_cpu_moe <= 0) {
             if (tensor_buft_overrides.empty()) {
@@ -1159,6 +1175,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
     for (const auto & nopo : params.no_op_offload)
+    for (const auto & noh : params.no_host)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
@@ -1199,6 +1216,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
                 /* .no_op_offload = */ nopo,
+                /* .no_host       = */ noh,
             };
             instances.push_back(instance);
         }
@@ -1232,6 +1250,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
                 /* .no_op_offload = */ nopo,
+                /* .no_host       = */ noh,
             };
             instances.push_back(instance);
         }
@@ -1265,6 +1284,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
                 /* .no_op_offload = */ nopo,
+                /* .no_host       = */ noh,
             };
             instances.push_back(instance);
         }
@@ -1303,6 +1323,7 @@ struct test {
     bool           use_mmap;
     bool           embeddings;
     bool           no_op_offload;
+    bool           no_host;
     int            n_prompt;
     int            n_gen;
     int            n_depth;
@@ -1339,6 +1360,7 @@ struct test {
         use_mmap       = inst.use_mmap;
         embeddings     = inst.embeddings;
         no_op_offload  = inst.no_op_offload;
+        no_host        = inst.no_host;
         n_prompt       = inst.n_prompt;
         n_gen          = inst.n_gen;
         n_depth        = inst.n_depth;
@@ -1386,8 +1408,8 @@ struct test {
             "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
             "tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
-            "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
-            "stddev_ns", "avg_ts", "stddev_ts"
+            "no_host", "n_prompt", "n_gen", "n_depth", "test_time",
+            "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
         };
         return fields;
     }
@@ -1402,7 +1424,7 @@ struct test {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
-            field == "use_mmap" || field == "embeddings") {
+            field == "use_mmap" || field == "embeddings" || field == "no_host") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -1477,6 +1499,7 @@ struct test {
             std::to_string(use_mmap),
             std::to_string(embeddings),
             std::to_string(no_op_offload),
+            std::to_string(no_host),
             std::to_string(n_prompt),
             std::to_string(n_gen),
             std::to_string(n_depth),
@@ -1665,6 +1688,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return 4;
         }
+        if (field == "no_host") {
+            return 4;
+        }
 
         int width = std::max((int) field.length(), 10);
 
@@ -1699,6 +1725,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "no_host") {
+            return "noh";
+        }
         if (field == "devices") {
             return "dev";
         }
@@ -1779,6 +1808,9 @@ struct markdown_printer : public printer {
         if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
             fields.emplace_back("no_op_offload");
         }
+        if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
+            fields.emplace_back("no_host");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
 

From dc4c64aa01ef8bd87daef7f494e65428ac946987 Mon Sep 17 00:00:00 2001
From: slaren
Date: Mon, 6 Oct 2025 19:46:24 +0200
Subject: [PATCH 2/3] fix equal_mparams

---
 tools/llama-bench/llama-bench.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index cfaebd2e11cdb..6b692eb5b00f9 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -1138,6 +1138,7 @@ struct cmd_params_instance {
                split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
                tensor_split == other.tensor_split && devices == other.devices &&
+               no_host == other.no_host &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 

From faf58ef072ce1bad22caf59773719e060e59c11c Mon Sep 17 00:00:00 2001
From: slaren
Date: Mon, 6 Oct 2025 19:50:19 +0200
Subject: [PATCH 3/3] move no-host enumeration order together with other model params

---
 tools/llama-bench/llama-bench.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 6b692eb5b00f9..ea34eb005a555 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -1174,9 +1174,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & noh : params.no_host)
     for (const auto & embd : params.embeddings)
     for (const auto & nopo : params.no_op_offload)
-    for (const auto & noh : params.no_host)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
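
Usage note: with these patches the common tools accept --no-host (env: LLAMA_ARG_NO_HOST), and the same switch is exposed through the C API as llama_model_params.no_host. As the comment in make_cpu_buft_list spells out, the CPU buffer-type priority is ACCEL -> GPU host -> CPU extra -> CPU, so when a GPU host (pinned) buffer is available it is picked before the CPU "extra" buffer types used for weight repacking; setting no_host simply leaves the host buffer out of that list so the extra buffer types can be selected. Below is a minimal sketch of driving the flag from the C API; it uses the standard llama.h lifecycle calls, the model path is a placeholder, and error handling is reduced to the bare minimum, so it is an illustration rather than part of the patches.

    #include "llama.h"

    int main(void) {
        llama_backend_init();

        // start from the defaults added in llama_model_default_params(), where no_host is false
        llama_model_params mparams = llama_model_default_params();
        mparams.no_host = true; // skip GPU host (pinned) buffers so CPU extra buffer types can be used

        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        llama_model_free(model);
        llama_backend_free();
        return 0;
    }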
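
llama-bench takes --no-host <0|1> like its other boolean parameters, so several values can be given separated by commas (for example --no-host 0,1 to benchmark both settings in one run). The two follow-up commits matter for that use: no_host is a model-level parameter, so it is added to equal_mparams and enumerated next to the other model parameters, which keeps instances that share the same model settings adjacent and lets llama-bench reuse the loaded model instead of reloading it for every test. The following standalone sketch illustrates that idea with hypothetical, simplified types rather than the real llama-bench ones.

    #include <cstdio>
    #include <vector>

    // hypothetical stand-in for llama-bench's cmd_params_instance
    struct instance { bool use_mmap; bool no_host; int n_prompt; };

    // the model is only reloaded when the model-level params differ from the previous instance
    static bool equal_mparams(const instance & a, const instance & b) {
        return a.use_mmap == b.use_mmap && a.no_host == b.no_host;
    }

    int main() {
        std::vector<instance> instances;
        for (bool mmp : { true })
        for (bool noh : { false, true })  // enumerated together with the other model params
        for (int  np  : { 512, 1024 }) {
            instances.push_back({ mmp, noh, np });
        }

        int loads = 0;
        for (size_t i = 0; i < instances.size(); ++i) {
            if (i == 0 || !equal_mparams(instances[i], instances[i - 1])) {
                loads++; // the model would be (re)loaded here
            }
            std::printf("no_host=%d n_prompt=%d\n", (int) instances[i].no_host, instances[i].n_prompt);
        }
        std::printf("model loads: %d\n", loads); // 2 loads for the 4 instances above
        return 0;
    }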