diff --git a/common/arg.cpp b/common/arg.cpp index 24c0e00750f2a..df127bc183a57 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -234,10 +234,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } } - postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch); + + postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams); + postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch); if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); @@ -252,7 +253,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & antiprompt : params.antiprompt) { string_process_escapes(antiprompt); } - for (auto & seq_breaker : params.sparams.dry_sequence_breakers) { + for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { string_process_escapes(seq_breaker); } } @@ -330,7 +331,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string sampler_type_chars; std::string sampler_type_names; - for (const auto & sampler : params.sparams.samplers) { + for (const auto & sampler : params.sampling.samplers) { sampler_type_chars += common_sampler_type_to_chr(sampler); sampler_type_names += common_sampler_type_to_str(sampler) + ";"; } @@ -408,26 +409,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } )); - add_opt(common_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.draft_cpuparams.n_threads = value; - if 
(params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.n_threads = value; - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", @@ -516,108 +497,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cpuparams_batch.poll = value; } )); - add_opt(common_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.draft_cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll])", - [](common_params & params, int value) { - params.draft_cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-batch-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--draft"}, "N", - string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](common_params & params, int value) { - params.n_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(common_arg( - {"-ps", "--p-split"}, "N", - string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](common_params & params, const std::string & value) { - params.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup 
cache to use for lookup decoding (not updated by generation)", @@ -702,7 +581,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), [](common_params & params) { params.no_perf = true; - params.sparams.no_perf = true; + params.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( @@ -884,155 +763,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); + params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", - string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), [](common_params & params, const std::string & value) { - params.sparams.seed = std::stoul(value); + params.sampling.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { - params.sparams.samplers = common_sampler_types_from_chars(value); + params.sampling.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", [](common_params & params) { - params.sparams.ignore_eos = true; + 
params.sampling.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--penalize-nl"}, - string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"), [](common_params & params) { - params.sparams.penalize_nl = true; + params.sampling.penalize_nl = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", - string_format("temperature (default: %.1f)", (double)params.sparams.temp), + string_format("temperature (default: %.1f)", (double)params.sampling.temp), [](common_params & params, const std::string & value) { - params.sparams.temp = std::stof(value); - params.sparams.temp = std::max(params.sparams.temp, 0.0f); + params.sampling.temp = std::stof(value); + params.sampling.temp = std::max(params.sampling.temp, 0.0f); } ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", - string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), [](common_params & params, int value) { - params.sparams.top_k = value; + params.sampling.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", - string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), [](common_params & params, const std::string & value) { - params.sparams.top_p = std::stof(value); + params.sampling.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", - string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), [](common_params & params, const std::string & value) { - params.sparams.min_p = std::stof(value); + params.sampling.min_p = 
std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", - string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), [](common_params & params, const std::string & value) { - params.sparams.xtc_probability = std::stof(value); + params.sampling.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", - string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), [](common_params & params, const std::string & value) { - params.sparams.xtc_threshold = std::stof(value); + params.sampling.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", - string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), [](common_params & params, const std::string & value) { - params.sparams.typ_p = std::stof(value); + params.sampling.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", - string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), [](common_params & params, int value) { - params.sparams.penalty_last_n = value; - params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + params.sampling.penalty_last_n = value; + params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); } ).set_sparam()); 
add_opt(common_arg( {"--repeat-penalty"}, "N", - string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), [](common_params & params, const std::string & value) { - params.sparams.penalty_repeat = std::stof(value); + params.sampling.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", - string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), [](common_params & params, const std::string & value) { - params.sparams.penalty_present = std::stof(value); + params.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", - string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), [](common_params & params, const std::string & value) { - params.sparams.penalty_freq = std::stof(value); + params.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", - string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier), + string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), [](common_params & params, const std::string & value) { - params.sparams.dry_multiplier = std::stof(value); + params.sampling.dry_multiplier = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-base"}, "N", - string_format("set DRY sampling base value (default: %.2f)", 
(double)params.sparams.dry_base), + string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), [](common_params & params, const std::string & value) { float potential_base = std::stof(value); if (potential_base >= 1.0f) { - params.sparams.dry_base = potential_base; + params.sampling.dry_base = potential_base; } } ).set_sparam()); add_opt(common_arg( {"--dry-allowed-length"}, "N", - string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length), + string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), [](common_params & params, int value) { - params.sparams.dry_allowed_length = value; + params.sampling.dry_allowed_length = value; } ).set_sparam()); add_opt(common_arg( {"--dry-penalty-last-n"}, "N", - string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n), + string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), [](common_params & params, int value) { - params.sparams.dry_penalty_last_n = value; + params.sampling.dry_penalty_last_n = value; } ).set_sparam()); add_opt(common_arg( {"--dry-sequence-breaker"}, "STRING", string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n", - params.sparams.dry_sequence_breakers.empty() ? "none" : - std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()), - params.sparams.dry_sequence_breakers.end(), - std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'", + params.sampling.dry_sequence_breakers.empty() ? 
"none" : + std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()), + params.sampling.dry_sequence_breakers.end(), + std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'", [](const std::string& a, const std::string& b) { std::string formatted_b = (b == "\n") ? "\\n" : b; return a + ", '" + formatted_b + "'"; @@ -1041,51 +920,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex static bool defaults_cleared = false; if (!defaults_cleared) { - params.sparams.dry_sequence_breakers.clear(); + params.sampling.dry_sequence_breakers.clear(); defaults_cleared = true; } if (value == "none") { - params.sparams.dry_sequence_breakers.clear(); + params.sampling.dry_sequence_breakers.clear(); } else { - params.sparams.dry_sequence_breakers.emplace_back(value); + params.sampling.dry_sequence_breakers.emplace_back(value); } } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", - string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), [](common_params & params, const std::string & value) { - params.sparams.dynatemp_range = std::stof(value); + params.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", - string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), [](common_params & params, const std::string & value) { - params.sparams.dynatemp_exponent = std::stof(value); + params.sampling.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat"}, "N", string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are 
ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), [](common_params & params, int value) { - params.sparams.mirostat = value; + params.sampling.mirostat = value; } ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", - string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), [](common_params & params, const std::string & value) { - params.sparams.mirostat_eta = std::stof(value); + params.sampling.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", - string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), [](common_params & params, const std::string & value) { - params.sparams.mirostat_tau = std::stof(value); + params.sampling.mirostat_tau = std::stof(value); } ).set_sparam()); add_opt(common_arg( @@ -1101,7 +980,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { const float bias = std::stof(value_str) * ((sign == '-') ? 
-1.0f : 1.0f); - params.sparams.logit_bias.push_back({key, bias}); + params.sampling.logit_bias.push_back({key, bias}); } else { throw std::invalid_argument("invalid input format"); } @@ -1112,9 +991,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--grammar"}, "GRAMMAR", - string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), [](common_params & params, const std::string & value) { - params.sparams.grammar = value; + params.sampling.grammar = value; } ).set_sparam()); add_opt(common_arg( @@ -1128,7 +1007,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::copy( std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), - std::back_inserter(params.sparams.grammar) + std::back_inserter(params.sampling.grammar) ); } ).set_sparam()); @@ -1136,7 +1015,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", [](common_params & params, const std::string & value) { - params.sparams.grammar = json_schema_to_grammar(json::parse(value)); + params.sampling.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); add_opt(common_arg( @@ -1445,17 +1324,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(common_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1594,13 +1462,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(common_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.model_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", @@ -2038,5 +1899,168 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_LOG_TIMESTAMPS")); + // speculative parameters + add_opt(common_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during 
generation (default: same as --threads)", + [](common_params & params, int value) { + params.speculative.cpuparams.n_threads = value; + if (params.speculative.cpuparams.n_threads <= 0) { + params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.n_threads = value; + if (params.speculative.cpuparams_batch.n_threads <= 0) { + params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](common_params & params, int value) { + params.speculative.cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](common_params & params, int value) { + params.speculative.cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-batch-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-max", "--draft", "--draft-n"}, "N", + string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), + [](common_params & params, int value) { + params.speculative.n_max = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--draft-min", "--draft-n-min"}, "N", + string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), + [](common_params & params, int value) { + params.speculative.n_min = value; + } + 
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--draft-p-split"}, "P", + string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), + [](common_params & params, const std::string & value) { + params.speculative.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-p-min"}, "P", + string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + [](common_params & params, const std::string & value) { + params.speculative.p_min = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-cd", "--ctx-size-draft"}, "N", + string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), + [](common_params & params, int value) { + params.speculative.n_ctx = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](common_params & params, int value) { + params.speculative.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](common_params & params, const std::string & value) { + params.speculative.model = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/common.cpp 
b/common/common.cpp index e3de343b0edaa..94f41a8ca48c6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -538,12 +538,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat [](const unsigned char c) { return !std::isprint(c); }), detokenized.end()); - buf << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); + buf << "\n" << std::to_string(i) + << ", token '" << detokenized << "'" + << ", pos " << std::to_string(batch.pos[i]) + << ", n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ", seq_id " << std::to_string(batch.seq_id[i][0]) + << ", logits " << std::to_string(batch.logits[i]); } buf << " ]"; @@ -927,9 +927,9 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapters_apply(lctx, iparams.lora_adapters); } - if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); - params.sparams.ignore_eos = false; + params.sampling.ignore_eos = false; } if (params.warmup) { @@ -1492,6 +1492,66 @@ void common_batch_add( batch.n_tokens++; } +// +// Token utils +// + +size_t common_lcp(const llama_tokens & a, const llama_tokens & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + +size_t common_lcs(const llama_tokens & a, const llama_tokens & b) { + // check for empty sequences + if (a.empty() || b.empty()) { + return 0; + } + + // get the lengths of the input sequences + size_t a_len = a.size(); + size_t b_len = b.size(); + + // initialize the maximum length of the longest common subsequence (LCS) + size_t max_length = 0; + + // use 
two rows instead of a 2D matrix to optimize space + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); + + // iterate through the elements of a + for (size_t i = 1; i <= a_len; i++) { + // iterate through the elements of b + for (size_t j = 1; j <= b_len; j++) { + // if elements at the current positions match + if (a[i - 1] == b[j - 1]) { + // if it's the first element of either sequences, set LCS length to 1 + if (i == 1 || j == 1) { + curr_row[j] = 1; + } else { + // increment LCS length by 1 compared to the previous element + curr_row[j] = prev_row[j - 1] + 1; + } + + // update max_length if necessary + if (curr_row[j] > max_length) { + max_length = curr_row[j]; + } + } else { + // reset LCS length if elements don't match + curr_row[j] = 0; + } + } + + // update the previous row for the next iteration + prev_row = curr_row; + } + + // return the maximum length of the LCS + return max_length; +} + // // Vocab utils // diff --git a/common/common.h b/common/common.h index bd35ccb79f249..9766ac5fd00c1 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info { struct llama_lora_adapter * adapter; }; +using llama_tokens = std::vector; + // build info struct common_control_vector_load_info; @@ -97,8 +99,8 @@ enum dimre_method { DIMRE_METHOD_MEAN, }; -// sampler parameters -struct common_sampler_params { +// sampling parameters +struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler int32_t n_prev = 64; // number of previous tokens to remember @@ -149,19 +151,30 @@ struct common_sampler_params { std::string print() const; }; +struct common_params_speculative { + int32_t n_ctx = 0; // draft context size + int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding + int32_t n_gpu_layers = -1; // number of 
layers to store in VRAM for the draft model (-1 - use default) + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.9f; // minimum speculative decoding probability (greedy) + + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + + std::string model = ""; // draft model for speculative decoding // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode - float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor @@ -178,8 +191,6 @@ struct common_params { struct cpu_params cpuparams; struct cpu_params cpuparams_batch; - struct cpu_params draft_cpuparams; - struct cpu_params draft_cpuparams_batch; ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -191,10 +202,10 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for 
embeddings - struct common_sampler_params sparams; + struct common_params_sampling sampling; + struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_draft = ""; // draft model for speculative decoding // NOLINT std::string model_alias = "unknown"; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT @@ -457,7 +468,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f // clear LoRA adapters from context, then apply new list of adapters void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +// // Batch utils +// void common_batch_clear(struct llama_batch & batch); @@ -468,6 +481,16 @@ void common_batch_add( const std::vector & seq_ids, bool logits); +// +// Token utils +// + +// longest common prefix +size_t common_lcp(const llama_tokens & a, const llama_tokens & b); + +// longet common subsequence +size_t common_lcs(const llama_tokens & a, const llama_tokens & b); + // // Vocab utils // diff --git a/common/sampling.cpp b/common/sampling.cpp index 7922fde47d369..0c4699a89c8b2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -99,7 +99,7 @@ struct ring_buffer { }; struct common_sampler { - common_sampler_params params; + common_params_sampling params; struct llama_sampler * grmr; struct llama_sampler * chain; @@ -125,7 +125,7 @@ struct common_sampler { } }; -std::string common_sampler_params::print() const { +std::string common_params_sampling::print() const { char result[1024]; snprintf(result, sizeof(result), @@ -141,7 +141,7 @@ std::string common_sampler_params::print() const { return std::string(result); } -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & 
params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; @@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co return cur_p.data[cur_p.selected].id; } +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { + GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); + + std::vector result; + result.reserve(idxs.size()); + + size_t i = 0; + for (; i < draft.size(); i++) { + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + common_sampler_accept(gsmpl, id, true); + + result.push_back(id); + + if (draft[i] != id) { + break; + } + } + + if (i == draft.size()) { + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + common_sampler_accept(gsmpl, id, true); + + result.push_back(id); + } + + return result; +} + +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { + std::vector idxs(draft.size() + 1); + for (size_t i = 0; i < idxs.size(); ++i) { + idxs[i] = i; + } + + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); +} + uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } diff --git a/common/sampling.h b/common/sampling.h index d37f25ad37c4a..348911b18888b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -36,7 +36,7 @@ struct common_sampler; // llama_sampler API overloads -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params); void 
common_sampler_free(struct common_sampler * gsmpl); @@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +// generalized version of common_sampler_sample +// +// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match +// if the sampler disagrees at some point, we stop and return the accepted tokens up to now +// +// common_sampler_sample_n(gsmpl, ctx, { idx }, {}); +// +// is equivalent to +// +// common_sampler_sample(gsmpl, ctx, idx); +// common_sampler_accept(gsmpl, token, true); +// +// requires: idxs.size() == draft.size() + 1 +// +// returns at least 1 token, up to idxs.size() +// +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); + +// assume idxs == [ 0, 1, 2, ..., draft.size() ] +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); + uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers diff --git a/common/speculative.cpp b/common/speculative.cpp new file mode 100644 index 0000000000000..fe315a2703e9c --- /dev/null +++ b/common/speculative.cpp @@ -0,0 +1,269 @@ +#include "speculative.h" + +#include "log.h" +#include "common.h" +#include "sampling.h" + +#include + +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 +#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 + +struct common_speculative { + struct llama_context * ctx; + struct common_sampler * smpl; + + llama_batch batch; + llama_tokens prompt; +}; + +struct common_speculative * common_speculative_init( + struct llama_context * ctx_dft) { + auto * result = new common_speculative { + /* .ctx = */ ctx_dft, + /* .smpl = */ 
nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), + /* .prompt = */ {}, + }; + + // TODO: optimize or pass from outside? +#if 0 + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 40; + params.top_p = 0.9; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_INFILL, + }; + + result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); + } +#else + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 10; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + }; + + result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); + } +#endif + + return result; +} + +void common_speculative_free(struct common_speculative * spec) { + common_sampler_free(spec->smpl); + + llama_batch_free(spec->batch); + + delete spec; +} + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft) { + const struct llama_model * model_tgt = llama_get_model(ctx_tgt); + const struct llama_model * model_dft = llama_get_model(ctx_dft); + + const bool vocab_type_tgt = llama_vocab_type(model_tgt); + LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); + + const bool vocab_type_dft = llama_vocab_type(model_dft); + LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); + + if (vocab_type_tgt != vocab_type_dft) { + LOG_ERR("%s: draft model vocab type must match target model to use speculation but " + "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); + return false; + } + + if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || + llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || + llama_token_bos(model_tgt) != llama_token_bos(model_dft) || + llama_token_eos(model_tgt) != llama_token_eos(model_dft) + ) { + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", 
__func__); + return false; + } + + { + const int n_vocab_tgt = llama_n_vocab(model_tgt); + const int n_vocab_dft = llama_n_vocab(model_dft); + + const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); + + if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " + "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + return false; + } + + for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { + const char * token_text_tgt = llama_token_get_text(model_tgt, i); + const char * token_text_dft = llama_token_get_text(model_dft, i); + if (std::strcmp(token_text_tgt, token_text_dft) != 0) { + LOG_ERR("%s: draft model vocab must match target model to use speculation but " + "token %d content differs - target '%s', draft '%s'\n", __func__, i, + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); + return false; + } + } + } + + return true; +} + +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt_tgt, + llama_token id_last) { + auto & batch = spec->batch; + auto & ctx = spec->ctx; + auto & smpl = spec->smpl; + auto & prompt = spec->prompt; + + int reuse_i = 0; + int reuse_n = 0; + + const int n_ctx = llama_n_ctx(ctx) - params.n_draft; + + const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); + + // reuse as much as possible from the old draft context + // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt + for (int i = 0; i < (int) prompt.size(); ++i) { + int cur = 0; + while (i_start + cur < (int) prompt_tgt.size() && + i + cur < (int) prompt.size() && + prompt_tgt[i_start + cur] == prompt[i + cur]) { + cur++; + 
} + + if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) { + reuse_i = i; + reuse_n = cur; + } + } + + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + + llama_tokens result; + result.reserve(params.n_draft); + + if (reuse_n == 0) { + llama_kv_cache_clear(ctx); + + prompt.clear(); + } else { + // this happens when a previous draft has been discarded (for example, due to being too small), but the + // target model agreed with it. in this case, we simply pass back the previous results to save compute + if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { + result.push_back(prompt[i]); + + if (params.n_draft <= (int) result.size()) { + break; + } + } + + return result; + } + + if (reuse_i > 0) { + llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + + prompt.erase(prompt.begin(), prompt.begin() + reuse_i); + } + + if (reuse_n < (int) prompt.size()) { + llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + + prompt.erase(prompt.begin() + reuse_n, prompt.end()); + } + } + + // prepare a batch to evaluate any new tokens in the prompt + common_batch_clear(batch); + + for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) { + //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); + common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); + + prompt.push_back(prompt_tgt[i]); + } + + // we should rarely end-up here during normal decoding + if (batch.n_tokens > 0) { + //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); + + llama_decode(ctx, batch); + } + + const llama_pos n_past = prompt.size(); + + LOG_DBG("%s: n_past = %d\n", __func__, n_past); + + common_batch_clear(batch); + common_batch_add (batch, id_last, n_past, 
{ 0 }, true); + + prompt.push_back(id_last); + + //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); + + llama_decode(ctx, batch); + + common_sampler_reset(smpl); + + // sample n_draft tokens from the draft model + for (int i = 0; i < params.n_draft; ++i) { + common_batch_clear(batch); + + common_sampler_sample(smpl, ctx, 0, true); + + const auto * cur_p = common_sampler_get_candidates(smpl); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str()); + } + + // add drafted token for each sequence + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + break; + } + + common_sampler_accept(smpl, id, true); + + result.push_back(id); + + if (params.n_draft <= (int) result.size()) { + break; + } + + common_batch_add(batch, id, n_past + i + 1, { 0 }, true); + + // evaluate the drafted tokens on the draft model + llama_decode(ctx, batch); + + prompt.push_back(id); + } + + return result; +} diff --git a/common/speculative.h b/common/speculative.h new file mode 100644 index 0000000000000..50ec0344618aa --- /dev/null +++ b/common/speculative.h @@ -0,0 +1,28 @@ +#pragma once + +#include "llama.h" +#include "common.h" + +struct common_speculative; + +struct common_speculative_params { + int n_draft = 16; // max drafted tokens + int n_reuse = 256; + + float p_min = 0.9f; // min probabiliy required to accept a token in the draft +}; + +struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); + +void common_speculative_free(struct common_speculative * spec); + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft); + +// sample up to n_draft tokens and add them to the batch using the draft model +llama_tokens 
common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt, + llama_token id_last); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9f4b8154b88a8..80a179b86af7e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2707,7 +2707,7 @@ def set_vocab(self): self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) if precompiled_charsmap: self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 3b554033e7ee4..ba219cd4b32ae 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -68,10 +68,10 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); if (ctx == NULL) { LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 1610985858fc9..2691c6e6b2dd2 100644 --- 
a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index cbecec343c640..e9cbb51ed90ab 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm LOG_INF("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); return smpl; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 341d3d5691cf5..25837d9502ff4 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -101,7 +101,7 @@ int main(int argc, char ** argv) { common_init(); - auto & sparams = params.sparams; + auto & sparams = params.sampling; // save choice to use color for later // (note for later: this is a slightly awkward choice) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 623e67b47fbca..6a06e657c3e44 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -176,7 +176,7 @@ struct server_slot { // sampling json json_schema; - struct common_sampler_params sparams; + struct common_params_sampling sparams; struct common_sampler * smpl = nullptr; llama_token sampled; @@ -688,7 +688,7 @@ struct server_context { SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - slot.sparams = params.sparams; + slot.sparams = params.sampling; slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); @@ -744,7 +744,7 
@@ struct server_context { } // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); + int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); @@ -789,7 +789,7 @@ struct server_context { bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - auto default_sparams = params.sparams; + auto default_sparams = params.sampling; const auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -1961,7 +1961,7 @@ struct server_context { if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); + slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position if (params.n_cache_reuse > 0) { diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c47ed3e47a76d..1665e9dc37db6 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -24,7 +24,6 @@ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" using json = nlohmann::ordered_json; -using llama_tokens = std::vector; #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) #define SLT_WRN(slot, fmt, ...) 
LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) @@ -439,62 +438,6 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - - return i; -} - -static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) { - // check for empty sequences - if (a.empty() || b.empty()) { - return 0; - } - - // get the lengths of the input sequences - size_t a_len = a.size(); - size_t b_len = b.size(); - - // initialize the maximum length of the longest common subsequence (LCS) - size_t max_length = 0; - - // use two rows instead of a 2D matrix to optimize space - std::vector prev_row(b_len + 1, 0); - std::vector curr_row(b_len + 1, 0); - - // iterate through the elements of a - for (size_t i = 1; i <= a_len; i++) { - // iterate through the elements of b - for (size_t j = 1; j <= b_len; j++) { - // if elements at the current positions match - if (a[i - 1] == b[j - 1]) { - // if it's the first element of either sequences, set LCS length to 1 - if (i == 1 || j == 1) { - curr_row[j] = 1; - } else { - // increment LCS length by 1 compared to the previous element - curr_row[j] = prev_row[j - 1] + 1; - } - - // update max_length if necessary - if (curr_row[j] > max_length) { - max_length = curr_row[j]; - } - } else { - // reset LCS length if elements don't match - curr_row[j] = 0; - } - } - - // update the previous row for the next iteration - prev_row = curr_row; - } - - // return the maximum length of the LCS - return max_length; -} - static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 811a63bdbf56b..680def09f7c8c 100644 --- 
a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13937,7 +13937,7 @@ int ggml_cpu_has_vsx(void) { } int ggml_cpu_has_neon(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_NEON) return ggml_arm_arch_features.has_neon; #else return 0; @@ -13945,7 +13945,7 @@ int ggml_cpu_has_neon(void) { } int ggml_cpu_has_sve(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.has_sve; #else return 0; @@ -13953,7 +13953,7 @@ int ggml_cpu_has_sve(void) { } int ggml_cpu_has_matmul_int8(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8) return ggml_arm_arch_features.has_i8mm; #else return 0; @@ -13961,7 +13961,7 @@ int ggml_cpu_has_matmul_int8(void) { } int ggml_cpu_get_sve_cnt(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.sve_cnt; #else return 0; diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index aab04eca7a385..5340eedc08916 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -1,57 +1,69 @@ -#include "common.cuh" +#include +#include + #include "argmax.cuh" +#include "common.cuh" #include "sum.cuh" -#include +static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) { + const int64_t row = blockIdx.x; -static __global__ void argmax_f32( - const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) { + float maxval = -FLT_MAX; + int argmax = -1; + const float * rowx = x + row * ncols; - int argmax_thread = 0; - const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE; + for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) { + const float val = rowx[col]; + if (val > maxval) { + maxval = val; + argmax = col; + } + } #pragma unroll - for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) { - const int64_t row = row0 + row1; - - if (row >= 
nrows) { - break; + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; } + } - float maxval = -FLT_MAX; - int argmax = -1; - - for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) { - const float val = x[row*ncols + col]; - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; + const int n_warps = blockDim.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + if (n_warps > 1) { + constexpr int max_warps = 1024 / WARP_SIZE; + __shared__ float shared_maxval[max_warps]; + __shared__ int shared_argmax[max_warps]; + if (lane_id == 0) { + shared_maxval[warp_id] = maxval; + shared_argmax[warp_id] = argmax; } + __syncthreads(); + + if (warp_id == 0) { + if (lane_id < n_warps) { + maxval = shared_maxval[lane_id]; + argmax = shared_argmax[lane_id]; + } #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE); - const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE); - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; + } + } } - - const int store = row1 == threadIdx.x; - argmax_thread += store*argmax; } - const int row = row0 + threadIdx.x; - - if (row >= nrows) { - return; + if (warp_id == 0 && lane_id == 0) { + dst[row] = argmax; } - - 
dst[row] = argmax_thread; } void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -70,10 +82,10 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { cudaStream_t stream = ctx.stream(); - const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE; - - const dim3 blocks_dim(WARP_SIZE, 1, 1); + const int64_t num_blocks = nrows; + const int64_t num_threads = std::min(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE); + const dim3 blocks_dim(num_threads, 1, 1); const dim3 blocks_num(num_blocks, 1, 1); - argmax_f32<<>>(src0_d, dst_d, ne00, nrows); + argmax_f32<<>>(src0_d, dst_d, ne00); } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e146c691c6f87..b0dd16066b4ba 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { return __reduce_add_sync(0xffffffff, x); #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE @@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; } static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); - a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, 
offset, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32); } return a; } @@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32); reinterpret_cast(a.x) += __low2half(a_other); reinterpret_cast(a.y) += __high2half(a_other); } return a; #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32)); } return a; #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) @@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; } @@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; #else diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 45408ce8684e4..1702e4ce2feba 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ 
b/ggml/src/ggml-cuda/quantize.cu @@ -69,8 +69,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange max. abs. value between vals_per_scale/4 threads. #pragma unroll - for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) { - amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE)); + for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE)); } float sum; @@ -79,8 +79,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange calculate sum across vals_per_sum/4 threads. #pragma unroll - for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) { - sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE); + for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { + sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5e07a3fb50543..965ef6ee3c0b8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2256,6 +2256,7 @@ struct ggml_tensor * ggml_argmax( struct ggml_context * ctx, struct ggml_tensor * a) { GGML_ASSERT(ggml_is_matrix(a)); + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); @@ -4139,6 +4140,7 @@ struct ggml_tensor * ggml_argsort( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_sort_order order) { + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index 321cbcd4c5507..962c27b204464 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -545,7 +545,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_size_label(self.size_label) if self.license is not None: - gguf_writer.add_license(self.license) + if isinstance(self.license, list): + gguf_writer.add_license(",".join(self.license)) 
+ else: + gguf_writer.add_license(self.license) if self.license_name is not None: gguf_writer.add_license_name(self.license_name) if self.license_link is not None: diff --git a/src/llama.cpp b/src/llama.cpp index f26898e6d58ec..5bb73a94f687e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7223,12 +7223,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_ADD: { - ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); op_tensor = ggml_add(ctx, a, w); } break; case GGML_OP_MUL: { - ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); op_tensor = ggml_mul(ctx, a, w); } break; case GGML_OP_DIV: @@ -18375,13 +18375,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { static void llama_kv_cache_update_internal(struct llama_context & lctx) { bool need_reserve = false; - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + if (lctx.kv_self.has_shift) { if (!llama_kv_cache_can_shift(&lctx)) { - GGML_ABORT("Deepseek2 does not support K-shift"); + GGML_ABORT("The current context does not support K-shift"); } - { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); @@ -20631,7 +20631,7 @@ void llama_kv_cache_update(struct llama_context * ctx) { } bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA } // deprecated