From 2d05bf374bff2d563993d564d312b6e51bf9a541 Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Fri, 13 Dec 2024 22:46:13 +0000 Subject: [PATCH] Improve progress bar Set default width to whatever the terminal is. Also fixed a small bug around default n_gpu_layers value. Signed-off-by: Eric Curtin --- README.md | 4 +- examples/run/run.cpp | 335 +++++++++++++++++++++++++++++-------------- 2 files changed, 226 insertions(+), 113 deletions(-) diff --git a/README.md b/README.md index 54466c2501c081..42979f73c42c53 100644 --- a/README.md +++ b/README.md @@ -409,7 +409,7 @@ To learn more about model quantization, [read this documentation](examples/quant -[^1]: [examples/perplexity/README.md](examples/perplexity/README.md) +[^1]: [examples/perplexity/README.md](https://github.com/ggerganov/llama.cpp/blob/master/examples/perplexity/README.md) [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) ## [`llama-bench`](example/bench) @@ -446,7 +446,7 @@ To learn more about model quantization, [read this documentation](examples/quant -[^3]: [https://github.com/containers/ramalama](RamaLama) +[^3]: [RamaLama](https://github.com/containers/ramalama) ## [`llama-simple`](examples/simple) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 834ea8f7b4aeb4..c514c2c5ef0e40 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1,6 +1,7 @@ #if defined(_WIN32) # include #else +# include # include #endif @@ -8,6 +9,7 @@ # include #endif +#include #include #include #include @@ -21,15 +23,43 @@ #include "json.hpp" #include "llama-cpp.h" -#define printe(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - } while (0) +#if defined(_WIN32) +# define FORMAT_ATTR(fmt, args) +#else +# define FORMAT_ATTR(fmt, args) __attribute__((format(printf, fmt, args))) +#endif + +FORMAT_ATTR(1, 2) +static std::string fmt(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + const int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::string buf; + buf.resize(size); + const int size2 = vsnprintf(const_cast(buf.data()), buf.size() + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + + return buf; +} + +FORMAT_ATTR(1, 2) +static int printe(const char * fmt, ...) { + va_list args; + va_start(args, fmt); + const int ret = vfprintf(stderr, fmt, args); + va_end(args); + + return ret; +} class Opt { public: int init(int argc, const char ** argv) { - construct_help_str_(); // Parse arguments if (parse(argc, argv)) { printe("Error: Failed to parse arguments.\n"); @@ -48,14 +78,65 @@ class Opt { std::string model_; std::string user_; - int context_size_ = 2048, ngl_ = -1; + int context_size_ = -1, ngl_ = -1; + bool verbose_ = false; private: - std::string help_str_; bool help_ = false; - void construct_help_str_() { - help_str_ = + bool parse_option(int argc, const char ** argv, int & i, const char * short_opt, const char * long_opt, + int & option_var) { + if (strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0) { + if (i + 1 >= argc) { + return false; + } + + option_var = std::atoi(argv[++i]); + return true; + } + + return false; + } + + bool parse_flag(int argc, const char ** argv, int i, const char * short_opt, const char * long_opt) { + return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0; + } + + int parse(int argc, const char ** argv) { + int positional_args_i = 0; + bool options_parsing = true; + for (int i = 1; i < argc; ++i) { + if (options_parsing && (parse_option(argc, argv, i, "-c", "--context-size", context_size_) || + parse_option(argc, argv, i, "-n", "--ngl", ngl_))) { + continue; + } else if (options_parsing && parse_flag(argc, argv, i, "-v", "--verbose") || + parse_flag(argc, argv, i, "-v", "--log-verbose")) { + verbose_ = true; + } else if (options_parsing && parse_flag(argc, argv, i, "-h", "--help")) { + help_ = true; + return 0; + } else if (options_parsing && strcmp(argv[i], "--") == 0) { + options_parsing = false; + } else if (positional_args_i == 0) { + if (!argv[i][0] || argv[i][0] == '-') { + return 1; + } + + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user_ = argv[i]; + } else { + user_ += " " + std::string(argv[i]); + } + } + + return 0; + } + + void help() const { + printf( "Description:\n" " Runs a llm\n" "\n" @@ -64,15 +145,11 @@ class Opt { "\n" "Options:\n" " -c, --context-size \n" - " Context size (default: " + - std::to_string(context_size_); - help_str_ += - ")\n" + " Context size (default: %d)\n" " -n, --ngl \n" - " Number of GPU layers (default: " + - std::to_string(ngl_); - help_str_ += - ")\n" + " Number of GPU layers (default: %d)\n" + " -v, --verbose, --log-verbose\n" + " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" " -h, --help\n" " Show help message\n" "\n" @@ -92,53 +169,21 @@ class Opt { " llama-run ollama://granite-code\n" " llama-run ollama://smollm:135m\n" " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n" - " llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" + " llama-run " + "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" " llama-run https://example.com/some-file1.gguf\n" " llama-run some-file2.gguf\n" " llama-run file://some-file3.gguf\n" - " llama-run --ngl 99 some-file4.gguf\n" - " llama-run --ngl 99 some-file5.gguf Hello World\n"; - } - - int parse(int argc, const char ** argv) { - int positional_args_i = 0; - for (int i = 1; i < argc; ++i) { - if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) { - if (i + 1 >= argc) { - return 1; - } - - context_size_ = std::atoi(argv[++i]); - } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) { - if (i + 1 >= argc) { - return 1; - } - - ngl_ = std::atoi(argv[++i]); - } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { - help_ = true; - return 0; - } else if (!positional_args_i) { - ++positional_args_i; - model_ = argv[i]; - } else if (positional_args_i == 1) { - ++positional_args_i; - user_ = argv[i]; - } else { - user_ += " " + std::string(argv[i]); - } - } - - return model_.empty(); // model_ is the only required value + " llama-run --ngl 999 some-file4.gguf\n" + " llama-run --ngl 999 some-file5.gguf Hello World\n", + llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers); } - - void help() const { printf("%s", help_str_.c_str()); } }; struct progress_data { - size_t file_size = 0; + size_t file_size = 0; std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); - bool printed = false; + bool printed = false; }; struct FileDeleter { @@ -151,8 +196,20 @@ struct FileDeleter { typedef std::unique_ptr FILE_ptr; +static int get_terminal_width() { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO csbi; + GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); + return csbi.srWindow.Right - csbi.srWindow.Left + 1; +#else + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + return w.ws_col; +#endif +} + #ifdef LLAMA_USE_CURL -class CurlWrapper { +class HttpClient { public: int init(const std::string & url, const std::vector & headers, const std::string & output_file, const bool progress, std::string * response_str = nullptr) { @@ -181,7 +238,7 @@ class CurlWrapper { return 0; } - ~CurlWrapper() { + ~HttpClient() { if (chunk) { curl_slist_free_all(chunk); } @@ -219,7 +276,7 @@ class CurlWrapper { if (progress) { curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data); - curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback); + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress); } } @@ -255,37 +312,31 @@ class CurlWrapper { int mins = (static_cast(seconds) % 3600) / 60; int secs = static_cast(seconds) % 60; - std::ostringstream out; if (hrs > 0) { - out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0') - << secs << "s"; + return fmt("%dh %02dm %02ds", hrs, mins, secs); } else if (mins > 0) { - out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s"; + return fmt("%dm %02ds", mins, secs); } else { - out << secs << "s"; + return fmt("%ds", secs); } - - return out.str(); } static std::string human_readable_size(curl_off_t size) { static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" }; - char length = sizeof(suffix) / sizeof(suffix[0]); - int i = 0; - double dbl_size = size; + char length = sizeof(suffix) / sizeof(suffix[0]); + int i = 0; + double dbl_size = size; if (size > 1024) { for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) { dbl_size = size / 1024.0; } } - std::ostringstream out; - out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i]; - return out.str(); + return fmt("%.2f %s", dbl_size, suffix[i]); } - static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, - curl_off_t) { + static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, + curl_off_t) { progress_data * data = static_cast(ptr); if (total_to_download <= 0) { return 0; @@ -293,27 +344,68 @@ class CurlWrapper { total_to_download += data->file_size; const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size; - const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download; - const curl_off_t pos = (percentage / 5); + const curl_off_t percentage = calculate_percentage(now_downloaded_plus_file_size, total_to_download); + std::string progress_prefix = generate_progress_prefix(percentage); + + const double speed = calculate_speed(now_downloaded, data->start_time); + const double tim = (total_to_download - now_downloaded) / speed; + std::string progress_suffix = + generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim); + + int progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix); std::string progress_bar; - for (int i = 0; i < 20; ++i) { - progress_bar.append((i < pos) ? "█" : " "); - } + generate_progress_bar(progress_bar_width, percentage, progress_bar); - // Calculate download speed and estimated time to completion - const auto now = std::chrono::steady_clock::now(); - const std::chrono::duration elapsed_seconds = now - data->start_time; - const double speed = now_downloaded / elapsed_seconds.count(); - const double estimated_time = (total_to_download - now_downloaded) / speed; - printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(), - human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(), - speed / (1024 * 1024), human_readable_time(estimated_time).c_str()); - fflush(stderr); + print_progress(progress_prefix, progress_bar, progress_suffix); data->printed = true; return 0; } + static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) { + return (now_downloaded_plus_file_size * 100) / total_to_download; + } + + static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); } + + static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { + const auto now = std::chrono::steady_clock::now(); + const std::chrono::duration elapsed_seconds = now - start_time; + return now_downloaded / elapsed_seconds.count(); + } + + static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download, + double speed, double estimated_time) { + const int width = 10; + return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width, + human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width, + human_readable_time(estimated_time).c_str()); + } + + static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) { + int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3; + if (progress_bar_width < 1) { + progress_bar_width = 1; + } + + return progress_bar_width; + } + + static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage, + std::string & progress_bar) { + const curl_off_t pos = (percentage * progress_bar_width) / 100; + for (int i = 0; i < progress_bar_width; ++i) { + progress_bar.append((i < pos) ? "█" : " "); + } + + return progress_bar; + } + + static void print_progress(const std::string & progress_prefix, const std::string & progress_bar, + const std::string & progress_suffix) { + printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(), + progress_suffix.c_str()); + } // Function to write data to a file static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) { FILE * out = static_cast(stream); @@ -357,8 +449,8 @@ class LlamaData { #ifdef LLAMA_USE_CURL int download(const std::string & url, const std::vector & headers, const std::string & output_file, const bool progress, std::string * response_str = nullptr) { - CurlWrapper curl; - if (curl.init(url, headers, output_file, progress, response_str)) { + HttpClient http; + if (http.init(url, headers, output_file, progress, response_str)) { return 1; } @@ -467,19 +559,23 @@ class LlamaData { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers; resolve_model(opt.model_); + printe( + "\r%*s" + "\rLoading model", + get_terminal_width(), " "); llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params)); if (!model) { printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } + printe("\r%*s\r", static_cast(sizeof("Loading model")), " "); return model; } // Initializes the context with the specified parameters llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = n_ctx; - ctx_params.n_batch = n_ctx; + ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch; llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); if (!context) { printe("%s: error: failed to create the llama_context\n", __func__); @@ -609,16 +705,20 @@ static int read_user_input(std::string & user) { } // Function to generate a response based on the prompt -static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) { +static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response, + const bool stdout_a_terminal) { // Set response color - printf("\033[33m"); + if (stdout_a_terminal) { + printf("\033[33m"); + } + if (generate(llama_data, prompt, response)) { printe("failed to generate response\n"); return 1; } // End response with color reset and newline - printf("\n\033[0m"); + printf("\n%s", stdout_a_terminal ? "\033[0m" : ""); return 0; } @@ -642,15 +742,37 @@ static int handle_user_input(std::string & user_input, const std::string & user_ } printf( - "\r " - "\r\033[32m> \033[0m"); + "\r%*s" + "\r\033[32m> \033[0m", + get_terminal_width(), " "); return read_user_input(user_input); // Returns true if input ends the loop } +static bool is_stdin_a_terminal() { +#if defined(_WIN32) + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdin, &mode); +#else + return isatty(STDIN_FILENO); +#endif +} + +static bool is_stdout_a_terminal() { +#if defined(_WIN32) + HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdout, &mode); +#else + return isatty(STDOUT_FILENO); +#endif +} + // Function to tokenize the prompt static int chat_loop(LlamaData & llama_data, const std::string & user_) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); + static const bool stdout_a_terminal = is_stdout_a_terminal(); while (true) { // Get user input std::string user_input; @@ -665,7 +787,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) { std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); std::string response; - if (generate_response(llama_data, prompt, response)) { + if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { return 1; } @@ -682,22 +804,13 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) { return 0; } -static void log_callback(const enum ggml_log_level level, const char * text, void *) { - if (level == GGML_LOG_LEVEL_ERROR) { +static void log_callback(const enum ggml_log_level level, const char * text, void * p) { + const Opt * opt = static_cast(p); + if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) { printe("%s", text); } } -static bool is_stdin_a_terminal() { -#if defined(_WIN32) - HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; - return GetConsoleMode(hStdin, &mode); -#else - return isatty(STDIN_FILENO); -#endif -} - static std::string read_pipe_data() { std::ostringstream result; result << std::cin.rdbuf(); // Read all data from std::cin @@ -721,7 +834,7 @@ int main(int argc, const char ** argv) { opt.user_ += read_pipe_data(); } - llama_log_set(log_callback, nullptr); + llama_log_set(log_callback, &opt); LlamaData llama_data; if (llama_data.init(opt)) { return 1;