Introduce --gpu and --tinyblas flags
jart committed Jan 3, 2024
1 parent 8762f13 commit 04d6e93
Showing 14 changed files with 267 additions and 68 deletions.
14 changes: 12 additions & 2 deletions llama.cpp/common.cpp
@@ -523,6 +523,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.unsecure = true;
} else if (arg == "--nocompile") {
FLAG_nocompile = true;
} else if (arg == "--tinyblas") {
FLAG_tinyblas = true; // undocumented
} else if (arg == "--gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
FLAG_gpu = llamafile_gpu_parse(argv[i]);
if (FLAG_gpu == -1) {
fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
exit(1);
}
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
params.dump_kv_cache = true;
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
@@ -930,8 +942,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
printf(" --nocompile disables runtime compilation of gpu support\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
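The new --gpu handler above relies on llamafile_gpu_parse() to map the argument onto the LLAMAFILE_GPU_* backend constants used throughout this commit, with -1 signalling an unrecognized value (the case the error path checks). The following is only a sketch of such a parser; the constant values and the case-insensitive matching are assumptions for illustration, not llamafile's actual definitions:

    // Hypothetical sketch; the real constants live in llamafile's headers
    // and may differ in name and value.
    #include <strings.h>  // strcasecmp

    #define LLAMAFILE_GPU_ERROR  -1  // assumed: unrecognized --gpu value
    #define LLAMAFILE_GPU_AUTO    0  // assumed: pick a backend at runtime (default)
    #define LLAMAFILE_GPU_AMD     1  // assumed
    #define LLAMAFILE_GPU_APPLE   2  // assumed
    #define LLAMAFILE_GPU_NVIDIA  3  // assumed

    int llamafile_gpu_parse(const char *s) {
        if (!strcasecmp(s, "auto"))   return LLAMAFILE_GPU_AUTO;
        if (!strcasecmp(s, "amd"))    return LLAMAFILE_GPU_AMD;
        if (!strcasecmp(s, "apple"))  return LLAMAFILE_GPU_APPLE;
        if (!strcasecmp(s, "nvidia")) return LLAMAFILE_GPU_NVIDIA;
        return LLAMAFILE_GPU_ERROR;   // caller reports the error and exits
    }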
4 changes: 2 additions & 2 deletions llama.cpp/ggml.c
@@ -14168,7 +14168,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {
return;
@@ -16048,7 +16048,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
54 changes: 28 additions & 26 deletions llama.cpp/llama.cpp
@@ -734,25 +734,27 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
//

inline void * llama_host_malloc(size_t n) {
if (ggml_metal_supported()) {
return ggml_metal_host_malloc(n);
} else if (ggml_cublas_loaded()) {
return ggml_cuda_host_malloc(n);
} else {
return malloc(n);
switch (llamafile_gpu_supported()) {
case LLAMAFILE_GPU_APPLE:
return ggml_metal_host_malloc(n);
case LLAMAFILE_GPU_NVIDIA:
return ggml_cuda_host_malloc(n);
default:
return malloc(n);
}
#if GGML_USE_CPU_HBM
#error fix me
#endif
}

inline void llama_host_free(void * ptr) {
if (ggml_metal_supported()) {
return ggml_metal_host_free(ptr);
} else if (ggml_cublas_loaded()) {
return ggml_cuda_host_free(ptr);
} else {
return free(ptr);
switch (llamafile_gpu_supported()) {
case LLAMAFILE_GPU_APPLE:
return ggml_metal_host_free(ptr);
case LLAMAFILE_GPU_NVIDIA:
return ggml_cuda_host_free(ptr);
default:
return free(ptr);
}
#if GGML_USE_CPU_HBM
#error fix me
@@ -895,7 +897,7 @@ struct llama_mmap {

// report terminal progress of loading weights off the disk into
// the cpu. if we're using gpu inference, then don't even bother
if (!ggml_metal_supported() && !ggml_cublas_loaded()) {
if (!llamafile_gpu_supported()) {
llamafile_schlep(addr, size);
}
}
@@ -1276,7 +1278,7 @@ struct llama_kv_cache {
ggml_free(ctx);
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
for (size_t i = 0; i < k_l.size(); ++i) {
ggml_cuda_free_data(k_l[i]);
ggml_cuda_free_data(v_l[i]);
@@ -1387,7 +1389,7 @@ struct llama_model {
ggml_free(ctx);
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
@@ -1515,7 +1517,7 @@ static bool llama_kv_cache_init(
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
if (i >= i_gpu_start) {
if (offload) {
ggml_cuda_assign_buffers_no_scratch(k);
@@ -2923,7 +2925,7 @@ static void llm_load_tensors(
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);

@@ -3645,7 +3647,7 @@ static void llm_load_tensors(

LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3668,7 +3670,7 @@ }
}

(void) tensor_split;
if (!ggml_metal_supported() && ggml_cublas_loaded())
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA)
{
ggml_cuda_set_tensor_split(tensor_split);
}
@@ -5975,7 +5977,7 @@ static struct ggml_cgraph * llama_build_graph(

// this is needed for compatibility with Metal for example
static offload_func_t ggml_offload_gpu;
if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
} else {
ggml_offload_gpu = ggml_offload_nop;
@@ -6197,7 +6199,7 @@ static int llama_decode_internal(
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
for (int i = 0; i < gf->n_leafs; i++) {
ggml_tensor * node = gf->leafs[i];
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
@@ -8916,7 +8918,7 @@ static int llama_apply_lora_from_file_internal(
offload_func_t offload_func = ggml_offload_nop;
offload_func_t offload_func_force_inplace = ggml_offload_nop;

if (!ggml_metal_supported() && ggml_cublas_loaded()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
if (dest_t->type != GGML_TYPE_F16) {
ThrowRuntimeError(format(
@@ -9042,7 +9044,7 @@ struct llama_model_params llama_model_default_params() {
/*.use_mlock =*/ false,
};

if (ggml_metal_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
result.n_gpu_layers = 1;
}

@@ -9277,7 +9279,7 @@ struct llama_context * llama_new_context_with_model(
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

if (ggml_metal_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
if (model->n_gpu_layers > 0) {
ctx->ctx_metal = ggml_metal_init(1);
if (!ctx->ctx_metal) {
@@ -9303,7 +9305,7 @@ struct llama_context * llama_new_context_with_model(
if (ctx->ctx_metal) {
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
}
if (!ggml_metal_supported() && ggml_cuda_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
ggml_cuda_set_scratch_size(alloc_size);
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

@@ -9336,7 +9338,7 @@ struct llama_context * llama_new_context_with_model(
}
}

if (ggml_metal_supported()) {
if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
if (model->n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers

38 changes: 38 additions & 0 deletions llama.cpp/main/main.1
@@ -353,6 +353,44 @@ Force system to keep model in RAM rather than swapping or compressing.
Do not memory-map model (slower load but may reduce pageouts if not using mlock).
.It Fl Fl numa
Attempt optimizations that help on some NUMA systems if run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437.
.It Fl Fl nocompile
Never compile GPU support at runtime.
.Pp
If
.Pa ~/.llamafile/ggml-cuda.dll
already exists on the file system (or .so for UNIX and .dylib for
MacOS), then it'll be linked as-is without question. Otherwise,
.Nm
will fall back to CPU inference.
.It Fl Fl gpu Ar GPU
Specifies which brand of GPU should be used. Valid choices are:
.Pp
.Bl -dash
.It
.Ar AUTO :
Use any GPU if possible, otherwise fall back to CPU inference (default)
.It
.Ar AMD :
Use AMD GPU. The AMD ROCm SDK must be installed and the HIP_PATH
environment variable must be defined. If an AMD GPU could not be used
for any reason, then a fatal error will be raised.
.It
.Ar APPLE :
Use Apple Metal GPU. This is only available on MacOS ARM64. If Metal
could not be used for any reason, then a fatal error will be raised.
.It
.Ar NVIDIA :
Use NVIDIA GPU. If an NVIDIA GPU could not be used for any reason, a
fatal error will be raised. On Windows, NVIDIA GPU support will use our
tinyBLAS library, since it works on stock Windows installs. If both MSVC
and CUDA are installed beforehand, and
.Nm
is run for the first time on the x64 command prompt, then llamafile will
use NVIDIA's faster cuBLAS library instead. On Linux and other systems,
the CUDA SDK must always be installed, so that native GPU support can be
compiled on the fly.
.El
.Pp
.It Fl ngl Ar N , Fl Fl n-gpu-layers Ar N
Number of layers to store in VRAM.
.It Fl ngld Ar N , Fl Fl n-gpu-layers-draft Ar N
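The --gpu entry above documents a two-tier policy: an explicitly requested brand must work or a fatal error is raised, while the default AUTO probes for a usable GPU and otherwise falls back to CPU inference, which is also what a zero return from llamafile_gpu_supported() means in the llama.cpp hunks. A minimal sketch of that policy, using made-up constant values and hypothetical probe helpers rather than llamafile's real ones:

    // Illustrative only. The probe helpers stand in for whatever hardware
    // detection llamafile actually performs.
    enum { GPU_AUTO = 0, GPU_AMD = 1, GPU_APPLE = 2, GPU_NVIDIA = 3 };  // assumed values

    static int probe_metal(void) { return 0; }  // stand-in: no Metal device found
    static int probe_cuda(void)  { return 0; }  // stand-in: no CUDA device found

    // Returns the backend to use, or 0 when inference stays on the CPU.
    static int pick_backend(int flag_gpu) {
        if (flag_gpu != GPU_AUTO)
            return flag_gpu;                  // explicit brand: use it or fail fatally later
        if (probe_metal()) return GPU_APPLE;  // AUTO: take any GPU that works...
        if (probe_cuda())  return GPU_NVIDIA;
        return 0;                             // ...otherwise fall back to the CPU
    }

On Windows, the NVIDIA path then chooses between the bundled tinyBLAS and NVIDIA's faster cuBLAS depending on whether MSVC and the CUDA SDK are installed, as the NVIDIA entry above explains; the undocumented --tinyblas flag added in common.cpp presumably forces the former, judging by its name.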
33 changes: 33 additions & 0 deletions llama.cpp/main/main.1.asc
@@ -324,6 +324,39 @@ OPTIONS
page cache before using this. See
https://github.com/ggerganov/llama.cpp/issues/1437.

--nocompile
Never compile GPU support at runtime.

If ~/.llamafile/ggml-cuda.dll already exists on the file system
(or .so for UNIX and .dylib for MacOS), then it'll be linked as-
is without question. Otherwise, llamafile will fall back to CPU
inference.

--gpu GPU
Specifies which brand of GPU should be used. Valid choices are:

- AUTO: Use any GPU if possible, otherwise fall back to CPU
inference (default)

- AMD: Use AMD GPU. The AMD ROCm SDK must be installed and the
HIP_PATH environment variable must be defined. If an AMD GPU
could not be used for any reason, then a fatal error will be
raised.

- APPLE: Use Apple Metal GPU. This is only available on MacOS
ARM64. If Metal could not be used for any reason, then a
fatal error will be raised.

- NVIDIA: Use NVIDIA GPU. If an NVIDIA GPU could not be used
for any reason, a fatal error will be raised. On Windows,
NVIDIA GPU support will use our tinyBLAS library, since it
works on stock Windows installs. If both MSVC and CUDA are
installed beforehand, and llamafile is run for the first time
on the x64 command prompt, then llamafile will use NVIDIA's
faster cuBLAS library instead. On Linux and other systems,
the CUDA SDK must always be installed, so that native GPU
support can be compiled on the fly.

-ngl N, --n-gpu-layers N
Number of layers to store in VRAM.

3 changes: 2 additions & 1 deletion llama.cpp/main/main.cpp
@@ -121,6 +121,7 @@ int main(int argc, char ** argv) {

if (has_argument(argc, argv, "--help")) {
llamafile_help("/zip/llama.cpp/main/main.1.asc");
__builtin_unreachable();
}

if (!has_argument(argc, argv, "--cli") &&
@@ -162,7 +163,7 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });

if (!params.unsecure && !ggml_metal_supported() && !ggml_cuda_supported()) {
if (!params.unsecure && !llamafile_gpu_supported()) {
// Enable pledge() security on Linux and OpenBSD.
// - We do this *after* opening the log file for writing.
// - We do this *before* loading any weights or graphdefs.
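The __builtin_unreachable() added after llamafile_help() above tells the compiler that execution never continues past the help path, presumably because llamafile_help() prints the manual and exits without returning. A self-contained illustration of the idiom, with made-up names:

    #include <stdio.h>
    #include <stdlib.h>

    // Stand-in for a helper that never returns, like llamafile_help()
    // displaying the manual and then exiting.
    __attribute__((noreturn)) static void show_help_and_exit(void) {
        puts("usage: demo [--help]");
        exit(0);
    }

    int main(int argc, char **argv) {
        (void)argv;
        if (argc > 1) {
            show_help_and_exit();
            __builtin_unreachable();  // hint: control cannot reach this point
        }
        puts("running");
        return 0;
    }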
19 changes: 15 additions & 4 deletions llama.cpp/server/server.cpp
@@ -1979,9 +1979,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --nobrowser Do not attempt to open a web browser tab at startup.\n");
printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
printf(" --nocompile disables runtime compilation of gpu support\n");
printf("\n");
}

@@ -2334,6 +2331,20 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
FLAG_nocompile = true;
}
else if (arg == "--gpu")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
FLAG_gpu = llamafile_gpu_parse(argv[i]);
if (FLAG_gpu == -1)
{
fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
exit(1);
}
}
else
{
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -3043,7 +3054,7 @@ int server_cli(int argc, char ** argv) {
llamafile_launch_browser(url);
}

if (!sparams.unsecure && !ggml_metal_supported() && !ggml_cuda_supported()) {
if (!sparams.unsecure && !llamafile_gpu_supported()) {
// Enables pledge() security on Linux and OpenBSD.
// - We do this *after* binding the server socket.
// - We do this *after* opening the log file for writing.
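Both main.cpp and server.cpp now gate pledge() sandboxing on the same condition: sandbox only when --unsecure was not given and llamafile_gpu_supported() reports zero, i.e. CPU-only inference, presumably because GPU backends need device and driver access that the sandbox would deny. A rough sketch of that guard follows; the promise string is illustrative, not llamafile's actual one, and the OpenBSD guard only keeps this standalone snippet portable (the help text removed above notes that pledge() sandboxing applies on Linux and OpenBSD in llamafile builds):

    #include <stdio.h>
    #include <unistd.h>

    // Sandbox only for CPU-only runs that did not pass --unsecure.
    static void maybe_sandbox(int unsecure, int gpu_backend) {
        if (unsecure || gpu_backend)   // gpu_backend == 0 means CPU-only
            return;
    #ifdef __OpenBSD__
        if (pledge("stdio rpath wpath cpath tty", NULL) == -1)
            perror("pledge");
    #endif
    }

    int main(void) {
        maybe_sandbox(/*unsecure=*/0, /*gpu_backend=*/0);
        puts("running with reduced privileges where supported");
        return 0;
    }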
