Introduce --gpu and --tinyblas flags
jart committed Jan 3, 2024
1 parent 8762f13 commit 04d6e93
Showing 14 changed files with 267 additions and 68 deletions.
14 changes: 12 additions & 2 deletions llama.cpp/common.cpp
@@ -523,6 +523,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.unsecure = true;
} else if (arg == "--nocompile") {
FLAG_nocompile = true;
} else if (arg == "--tinyblas") {
FLAG_tinyblas = true; // undocumented
} else if (arg == "--gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
FLAG_gpu = llamafile_gpu_parse(argv[i]);
if (FLAG_gpu == -1) {
fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
exit(1);
}
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
params.dump_kv_cache = true;
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
@@ -930,8 +942,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
printf(" --nocompile disables runtime compilation of gpu support\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
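The new --gpu handler above relies on llamafile_gpu_parse() to map the argument onto the LLAMAFILE_GPU_* backend constants used throughout this commit, with -1 signalling an unrecognized value (the case the error path checks). The following is only a sketch of such a parser; the constant values and the case-insensitive matching are assumptions for illustration, not llamafile's actual definitions:

    // Hypothetical sketch; the real constants live in llamafile's headers
    // and may differ in name and value.
    #include <strings.h>  // strcasecmp

    #define LLAMAFILE_GPU_ERROR  -1  // assumed: unrecognized --gpu value
    #define LLAMAFILE_GPU_AUTO    0  // assumed: pick a backend at runtime (default)
    #define LLAMAFILE_GPU_AMD     1  // assumed
    #define LLAMAFILE_GPU_APPLE   2  // assumed
    #define LLAMAFILE_GPU_NVIDIA  3  // assumed

    int llamafile_gpu_parse(const char *s) {
        if (!strcasecmp(s, "auto"))   return LLAMAFILE_GPU_AUTO;
        if (!strcasecmp(s, "amd"))    return LLAMAFILE_GPU_AMD;
        if (!strcasecmp(s, "apple"))  return LLAMAFILE_GPU_APPLE;
        if (!strcasecmp(s, "nvidia")) return LLAMAFILE_GPU_NVIDIA;
        return LLAMAFILE_GPU_ERROR;   // caller reports the error and exits
    }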
4 changes: 2 additions & 2 deletions llama.cpp/ggml.c
@@ -14168,7 +14168,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {
return;
@@ -16048,7 +16048,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
54 changes: 28 additions & 26 deletions llama.cpp/llama.cpp
@@ -734,25 +734,27 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
//

inline void * llama_host_malloc(size_t n) {
if (ggml_metal_supported()) {
return ggml_metal_host_malloc(n);
} else if (ggml_cublas_loaded()) {
return ggml_cuda_host_malloc(n);
} else {
return malloc(n);
switch (llamafile_gpu_supported()) {
case LLAMAFILE_GPU_APPLE:
return ggml_metal_host_malloc(n);
case LLAMAFILE_GPU_NVIDIA:
return ggml_cuda_host_malloc(n);
default:
return malloc(n);
}
#if GGML_USE_CPU_HBM
#error fix me
#endif
}

inline void llama_host_free(void * ptr) {
if (ggml_metal_supported()) {
return ggml_metal_host_free(ptr);
} else if (ggml_cublas_loaded()) {
return ggml_cuda_host_free(ptr);
} else {
return free(ptr);
switch (llamafile_gpu_supported()) {
case LLAMAFILE_GPU_APPLE:
return ggml_metal_host_free(ptr);
case LLAMAFILE_GPU_NVIDIA:
return ggml_cuda_host_free(ptr);
default:
return free(ptr);
}
#if GGML_USE_CPU_HBM
#error fix me
@@ -895,7 +897,7 @@ struct llama_mmap {

// report terminal progress of loading weights off the disk into
// the cpu. if we're using gpu inference, then don't even bother
if (!ggml_metal_supported() && !ggml_cublas_loaded()) {
if (!llamafile_gpu_supported()) {
llamafile_schlep(addr, size);
}
}
@@ -1276,7 +1278,7 @@ struct llama_kv_cache {
ggml_free(ctx);
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
for (size_t i = 0; i < k_l.size(); ++i) {
ggml_cuda_free_data(k_l[i]);
ggml_cuda_free_data(v_l[i]);
@@ -1387,7 +1389,7 @@ struct llama_model {
ggml_free(ctx);
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
@@ -1515,7 +1517,7 @@ static bool llama_kv_cache_init(
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
if (i >= i_gpu_start) {
if (offload) {
ggml_cuda_assign_buffers_no_scratch(k);
@@ -2923,7 +2925,7 @@ static void llm_load_tensors(
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);

@@ -3645,7 +3647,7 @@ static void llm_load_tensors(

LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3668,7 +3670,7 @@ }
}

(void) tensor_split;
if (!ggml_metal_supported() && ggml_cublas_loaded())
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA)
{
ggml_cuda_set_tensor_split(tensor_split);
}
@@ -5975,7 +5977,7 @@ static struct ggml_cgraph * llama_build_graph(

// this is needed for compatibility with Metal for example
static offload_func_t ggml_offload_gpu;
if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
} else {
ggml_offload_gpu = ggml_offload_nop;
@@ -6197,7 +6199,7 @@ static int llama_decode_internal(
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
for (int i = 0; i < gf->n_leafs; i++) {
ggml_tensor * node = gf->leafs[i];
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
@@ -8916,7 +8918,7 @@ static int llama_apply_lora_from_file_internal(
offload_func_t offload_func = ggml_offload_nop;
offload_func_t offload_func_force_inplace = ggml_offload_nop;

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
if (dest_t->type != GGML_TYPE_F16) {
ThrowRuntimeError(format(
@@ -9042,7 +9044,7 @@ struct llama_model_params llama_model_default_params() {
/*.use_mlock =*/ false,
};

if (ggml_metal_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
result.n_gpu_layers = 1;
}

@@ -9277,7 +9279,7 @@ struct llama_context * llama_new_context_with_model(
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

if (ggml_metal_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
if (model->n_gpu_layers > 0) {
ctx->ctx_metal = ggml_metal_init(1);
if (!ctx->ctx_metal) {
@@ -9303,7 +9305,7 @@ struct llama_context * llama_new_context_with_model(
if (ctx->ctx_metal) {
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
}
if (!ggml_metal_supported() && ggml_cuda_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
ggml_cuda_set_scratch_size(alloc_size);
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

@@ -9336,7 +9338,7 @@ struct llama_context * llama_new_context_with_model(
}
}

if (ggml_metal_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
if (model->n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers

38 changes: 38 additions & 0 deletions llama.cpp/main/main.1
@@ -353,6 +353,44 @@ Force system to keep model in RAM rather than swapping or compressing.
Do not memory-map model (slower load but may reduce pageouts if not using mlock).
.It Fl Fl numa
Attempt optimizations that help on some NUMA systems if run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437.
.It Fl Fl nocompile
Never compile GPU support at runtime.
.Pp
If
.Pa ~/.llamafile/ggml-cuda.dll
already exists on the file system (or .so for UNIX and .dylib for
MacOS), then it'll be linked as-is without question. Otherwise,
.Nm
will fall back to CPU inference.
.It Fl Fl gpu Ar GPU
Specifies which brand of GPU should be used. Valid choices are:
.Pp
.Bl -dash
.It
.Ar AUTO :
Use any GPU if possible, otherwise fall back to CPU inference (default)
.It
.Ar AMD :
Use AMD GPU. The AMD ROCm SDK must be installed and the HIP_PATH
environment variable must be defined. If an AMD GPU could not be used
for any reason, then a fatal error will be raised.
.It
.Ar APPLE :
Use Apple Metal GPU. This is only available on MacOS ARM64. If Metal
could not be used for any reason, then a fatal error will be raised.
.It
.Ar NVIDIA :
Use NVIDIA GPU. If an NVIDIA GPU could not be used for any reason, a
fatal error will be raised. On Windows, NVIDIA GPU support will use our
tinyBLAS library, since it works on stock Windows installs. If both MSVC
and CUDA are installed beforehand, and
.Nm
is run for the first time on the x64 command prompt, then llamafile will
use NVIDIA's faster cuBLAS library instead. On Linux and other systems,
the CUDA SDK must always be installed, so that native GPU support can be
compiled on the fly.
.El
.Pp
.It Fl ngl Ar N , Fl Fl n-gpu-layers Ar N
Number of layers to store in VRAM.
.It Fl ngld Ar N , Fl Fl n-gpu-layers-draft Ar N
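The --gpu entry above documents a two-tier policy: an explicitly requested brand must work or a fatal error is raised, while the default AUTO probes for a usable GPU and otherwise falls back to CPU inference, which is also what a zero return from llamafile_gpu_supported() means in the llama.cpp hunks. A minimal sketch of that policy, using made-up constant values and hypothetical probe helpers rather than llamafile's real ones:

    // Illustrative only. The probe helpers stand in for whatever hardware
    // detection llamafile actually performs.
    enum { GPU_AUTO = 0, GPU_AMD = 1, GPU_APPLE = 2, GPU_NVIDIA = 3 };  // assumed values

    static int probe_metal(void) { return 0; }  // stand-in: no Metal device found
    static int probe_cuda(void)  { return 0; }  // stand-in: no CUDA device found

    // Returns the backend to use, or 0 when inference stays on the CPU.
    static int pick_backend(int flag_gpu) {
        if (flag_gpu != GPU_AUTO)
            return flag_gpu;                  // explicit brand: use it or fail fatally later
        if (probe_metal()) return GPU_APPLE;  // AUTO: take any GPU that works...
        if (probe_cuda())  return GPU_NVIDIA;
        return 0;                             // ...otherwise fall back to the CPU
    }

On Windows, the NVIDIA path then chooses between the bundled tinyBLAS and NVIDIA's faster cuBLAS depending on whether MSVC and the CUDA SDK are installed, as the NVIDIA entry above explains; the undocumented --tinyblas flag added in common.cpp presumably forces the former, judging by its name.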
33 changes: 33 additions & 0 deletions llama.cpp/main/main.1.asc
@@ -324,6 +324,39 @@ OPTIONS
page cache before using this. See
https://github.com/ggerganov/llama.cpp/issues/1437.

--nocompile
Never compile GPU support at runtime.

If ~/.llamafile/ggml-cuda.dll already exists on the file system
(or .so for UNIX and .dylib for MacOS), then it'll be linked as-
is without question. Otherwise, llamafile will fall back to CPU
inference.

--gpu GPU
Specifies which brand of GPU should be used. Valid choices are:

- AUTO: Use any GPU if possible, otherwise fall back to CPU
inference (default)

- AMD: Use AMD GPU. The AMD ROCm SDK must be installed and the
HIP_PATH environment variable must be defined. If an AMD GPU
could not be used for any reason, then a fatal error will be
raised.

- APPLE: Use Apple Metal GPU. This is only available on MacOS
ARM64. If Metal could not be used for any reason, then a
fatal error will be raised.

- NVIDIA: Use NVIDIA GPU. If an NVIDIA GPU could not be used
for any reason, a fatal error will be raised. On Windows,
NVIDIA GPU support will use our tinyBLAS library, since it
works on stock Windows installs. If both MSVC and CUDA are
installed beforehand, and llamafile is run for the first time
on the x64 command prompt, then llamafile will use NVIDIA's
faster cuBLAS library instead. On Linux and other systems,
the CUDA SDK must always be installed, so that native GPU
support can be compiled on the fly.

-ngl N, --n-gpu-layers N
Number of layers to store in VRAM.

3 changes: 2 additions & 1 deletion llama.cpp/main/main.cpp
@@ -121,6 +121,7 @@ int main(int argc, char ** argv) {

if (has_argument(argc, argv, "--help")) {
llamafile_help("/zip/llama.cpp/main/main.1.asc");
__builtin_unreachable();
}

if (!has_argument(argc, argv, "--cli") &&
@@ -162,7 +163,7 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });

if (!params.unsecure && !ggml_metal_supported() && !ggml_cuda_supported()) {
if (!params.unsecure && !llamafile_gpu_supported()) {
// Enable pledge() security on Linux and OpenBSD.
// - We do this *after* opening the log file for writing.
// - We do this *before* loading any weights or graphdefs.
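The __builtin_unreachable() added after llamafile_help() above tells the compiler that execution never continues past the help path, presumably because llamafile_help() prints the manual and exits without returning. A self-contained illustration of the idiom, with made-up names:

    #include <stdio.h>
    #include <stdlib.h>

    // Stand-in for a helper that never returns, like llamafile_help()
    // displaying the manual and then exiting.
    __attribute__((noreturn)) static void show_help_and_exit(void) {
        puts("usage: demo [--help]");
        exit(0);
    }

    int main(int argc, char **argv) {
        (void)argv;
        if (argc > 1) {
            show_help_and_exit();
            __builtin_unreachable();  // hint: control cannot reach this point
        }
        puts("running");
        return 0;
    }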
19 changes: 15 additions & 4 deletions llama.cpp/server/server.cpp
@@ -1979,9 +1979,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --nobrowser Do not attempt to open a web browser tab at startup.\n");
printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
printf(" --nocompile disables runtime compilation of gpu support\n");
printf("\n");
}

@@ -2334,6 +2331,20 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
FLAG_nocompile = true;
}
else if (arg == "--gpu")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
FLAG_gpu = llamafile_gpu_parse(argv[i]);
if (FLAG_gpu == -1)
{
fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
exit(1);
}
}
else
{
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -3043,7 +3054,7 @@ int server_cli(int argc, char ** argv) {
llamafile_launch_browser(url);
}

if (!sparams.unsecure && !ggml_metal_supported() && !ggml_cuda_supported()) {
if (!sparams.unsecure && !llamafile_gpu_supported()) {
// Enables pledge() security on Linux and OpenBSD.
// - We do this *after* binding the server socket.
// - We do this *after* opening the log file for writing.
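Both main.cpp and server.cpp now gate pledge() sandboxing on the same condition: sandbox only when --unsecure was not given and llamafile_gpu_supported() reports zero, i.e. CPU-only inference, presumably because GPU backends need device and driver access that the sandbox would deny. A rough sketch of that guard follows; the promise string is illustrative, not llamafile's actual one, and the OpenBSD guard only keeps this standalone snippet portable (the help text removed above notes that pledge() sandboxing applies on Linux and OpenBSD in llamafile builds):

    #include <stdio.h>
    #include <unistd.h>

    // Sandbox only for CPU-only runs that did not pass --unsecure.
    static void maybe_sandbox(int unsecure, int gpu_backend) {
        if (unsecure || gpu_backend)   // gpu_backend == 0 means CPU-only
            return;
    #ifdef __OpenBSD__
        if (pledge("stdio rpath wpath cpath tty", NULL) == -1)
            perror("pledge");
    #endif
    }

    int main(void) {
        maybe_sandbox(/*unsecure=*/0, /*gpu_backend=*/0);
        puts("running with reduced privileges where supported");
        return 0;
    }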
