Skip to content

Commit

Permalink
Update llama-run to include temperature option
Browse files Browse the repository at this point in the history
This commit updates the `examples/run/README.md` file to include a new
option for setting the temperature and updates the `run.cpp` file to
parse this option.

Signed-off-by: Eric Curtin <[email protected]>
  • Loading branch information
ericcurtin committed Dec 19, 2024
1 parent a3c33b1 commit cd61ea0
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 14 deletions.
2 changes: 2 additions & 0 deletions examples/run/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ Options:
Context size (default: 2048)
-n, --ngl <value>
Number of GPU layers (default: 0)
--temp <value>
Temperature (default: 0.8)
-v, --verbose, --log-verbose
Set verbosity level to infinity (i.e. log all messages, useful for debugging)
-h, --help
Expand Down
63 changes: 49 additions & 14 deletions examples/run/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,19 @@ static int printe(const char * fmt, ...) {
class Opt {
public:
int init(int argc, const char ** argv) {
ctx_params_ = llama_context_default_params();
model_params_ = llama_model_default_params();
context_size_default = ctx_params_.n_batch;
ngl_default = model_params_.n_gpu_layers;
common_params_sampling sampling;
temperature_default = sampling.temp;

if (argc < 2) {
printe("Error: No arguments provided.\n");
help();
return 1;
}

// Parse arguments
if (parse(argc, argv)) {
printe("Error: Failed to parse arguments.\n");
Expand All @@ -68,15 +81,24 @@ class Opt {
return 2;
}

ctx_params_.n_batch = context_size_ >= 0 ? context_size_ : context_size_default;
model_params_.n_gpu_layers = ngl_ >= 0 ? ngl_ : ngl_default;
temperature_ = temperature_ >= 0 ? temperature_ : temperature_default;

return 0; // Success
}

llama_context_params ctx_params_;
llama_model_params model_params_;
std::string model_;
std::string user_;
int context_size_ = -1, ngl_ = -1;
std::string user_;
int context_size_ = -1, ngl_ = -1;
float temperature_ = -1;
bool verbose_ = false;

private:
int context_size_default = -1, ngl_default = -1;
float temperature_default = -1;
bool help_ = false;

bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
Expand All @@ -89,6 +111,17 @@ class Opt {
}

option_value = std::atoi(argv[++i]);

return 0;
}

// Parses the command-line value following option argv[i] as a float into
// option_value, advancing i past the consumed token.
// Returns 0 on success, 1 on error (no value follows the option, or the
// value is not a fully-numeric string).
// NOTE: std::atof provided no error detection — "--temp abc" silently
// became 0.0f; std::strtof with an end-pointer check lets us reject it
// through the same error-return path the callers already handle.
int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
    if (i + 1 >= argc) {
        return 1; // option is missing its value
    }

    const char * arg   = argv[++i];
    char *       end   = nullptr;
    const float parsed = std::strtof(arg, &end);
    if (end == arg || *end != '\0') {
        return 1; // no digits consumed, or trailing junk after the number
    }

    option_value = parsed;

    return 0;
}

Expand All @@ -103,6 +136,10 @@ class Opt {
if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
return 1;
}
} else if (options_parsing && strcmp(argv[i], "--temperature") == 0) {
if (handle_option_with_value(argc, argv, i, temperature_) == 1) {
return 1;
}
} else if (options_parsing &&
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose_ = true;
Expand Down Expand Up @@ -142,6 +179,8 @@ class Opt {
" Context size (default: %d)\n"
" -n, --ngl <value>\n"
" Number of GPU layers (default: %d)\n"
" --temp <value>\n"
" Temperature (default: %.1f)\n"
" -v, --verbose, --log-verbose\n"
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
" -h, --help\n"
Expand Down Expand Up @@ -170,7 +209,7 @@ class Opt {
" llama-run file://some-file3.gguf\n"
" llama-run --ngl 999 some-file4.gguf\n"
" llama-run --ngl 999 some-file5.gguf Hello World\n",
llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
context_size_default, ngl_default, temperature_default);
}
};

Expand Down Expand Up @@ -495,12 +534,12 @@ class LlamaData {
return 1;
}

context = initialize_context(model, opt.context_size_);
context = initialize_context(model, opt);
if (!context) {
return 1;
}

sampler = initialize_sampler();
sampler = initialize_sampler(opt);
return 0;
}

Expand Down Expand Up @@ -619,14 +658,12 @@ class LlamaData {
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
resolve_model(opt.model_);
printe(
"\r%*s"
"\rLoading model",
get_terminal_width(), " ");
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params_));
if (!model) {
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
}
Expand All @@ -636,10 +673,8 @@ class LlamaData {
}

// Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params_));
if (!context) {
printe("%s: error: failed to create the llama_context\n", __func__);
}
Expand All @@ -648,10 +683,10 @@ class LlamaData {
}

// Initializes and configures the sampler
llama_sampler_ptr initialize_sampler() {
llama_sampler_ptr initialize_sampler(const Opt & opt) {
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature_));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

return sampler;
Expand Down

0 comments on commit cd61ea0

Please sign in to comment.