diff --git a/rllm-cpp/src/rllm-cpp.rs b/rllm-cpp/src/rllm-cpp.rs
index f56d9b51..18276ce3 100644
--- a/rllm-cpp/src/rllm-cpp.rs
+++ b/rllm-cpp/src/rllm-cpp.rs
@@ -11,11 +11,16 @@ pub struct CppArgs {
     /// Name of .gguf file inside of the model folder/repo.
     #[arg(long, help_heading = "Model")]
     pub gguf: Option<String>,
+
+    /// How many model layers to offload to GPU (if available)
+    #[arg(long, short = 'g', help_heading = "Model")]
+    pub gpu_layers: Option<usize>,
 }
 
 #[actix_web::main]
 async fn main() -> () {
     let mut args = parse_with_settings::<CppArgs>();
     args.args.gguf = args.gguf;
+    args.args.n_gpu_layers = args.gpu_layers;
     rllm::server::server_main(args.args).await;
 }
diff --git a/rllm-cuda/src/lib.rs b/rllm-cuda/src/lib.rs
index 4deebf00..72285f5b 100644
--- a/rllm-cuda/src/lib.rs
+++ b/rllm-cuda/src/lib.rs
@@ -42,10 +42,12 @@ pub struct LoaderArgs {
     pub model_id: String,
     pub revision: Option<String>,
     pub local_weights: Option<String>,
-    pub gguf: Option<String>,
     pub alt: usize,
     pub aici: AiciConfig,
 
+    // llama.cpp-specific
+    pub gguf: Option<String>,
+    pub n_gpu_layers: Option<usize>,
 
     #[cfg(not(feature = "tch"))]
     pub(crate) cached_model: Option,
@@ -78,6 +80,7 @@ impl Default for LoaderArgs {
             alt: 0,
             dtype,
             device,
+            n_gpu_layers: None,
             #[cfg(not(feature = "tch"))]
             cached_model: None,
         }
diff --git a/rllm-cuda/src/llamacpp/loader.rs b/rllm-cuda/src/llamacpp/loader.rs
index fac72f51..34bfaf2e 100644
--- a/rllm-cuda/src/llamacpp/loader.rs
+++ b/rllm-cuda/src/llamacpp/loader.rs
@@ -46,12 +46,15 @@ fn do_load(args: &mut LoaderArgs) -> Result {
     let mut mparams = cpp::ModelParams::default();
     // TODO: make this configurable
     mparams.set_split_mode(cpp::SplitMode::None);
-    mparams.n_gpu_layers = 1000;
+    mparams.n_gpu_layers = args.n_gpu_layers.unwrap_or(0) as i32;
+    log::info!("{} layer(s) offloaded to GPU", mparams.n_gpu_layers);
 
     // don't GPU offload on Intel macs - it just fails there
     #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
     {
-        log::warn!("disabling GPU (Intel macOS)");
-        mparams.n_gpu_layers = 0;
+        if mparams.n_gpu_layers > 0 {
+            log::warn!("disabling GPU (Intel macOS)");
+            mparams.n_gpu_layers = 0;
+        }
     }
     let m = cpp::Model::from_file(file.to_str().unwrap(), mparams)?;
diff --git a/rllm-cuda/src/server/mod.rs b/rllm-cuda/src/server/mod.rs
index f7be50f2..b2b7bf13 100644
--- a/rllm-cuda/src/server/mod.rs
+++ b/rllm-cuda/src/server/mod.rs
@@ -22,9 +22,9 @@ use std::{
 };
 use tokio::sync::mpsc::{channel, error::TryRecvError, Receiver, Sender};
 
+mod api;
 mod completion;
 mod openai;
-mod api;
 
 #[derive(Debug)]
 pub struct APIError {
@@ -206,6 +206,8 @@ pub struct RllmCliArgs {
     // these are copied from command-specific parsers
     #[arg(skip)]
     pub gguf: Option<String>,
+    #[arg(skip)]
+    pub n_gpu_layers: Option<usize>,
 }
 
 #[actix_web::get("/v1/controllers/tags")]
@@ -594,6 +596,7 @@ pub async fn server_main(mut args: RllmCliArgs) -> () {
     loader_args.revision = args.revision.clone();
     loader_args.local_weights = args.local_weights.clone();
     loader_args.gguf = args.gguf.clone();
+    loader_args.n_gpu_layers = args.n_gpu_layers;
     if dtype.is_some() {
         loader_args.dtype = dtype;
     }
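
Note (not part of the patch): below is a minimal, standalone sketch of the layer-offload defaulting and the Intel-macOS guard introduced above. The effective_gpu_layers helper is hypothetical; in the patch the equivalent logic lives inline in do_load, where the old hard-coded n_gpu_layers = 1000 is replaced by unwrap_or(0) on the new optional setting.

// Hypothetical standalone sketch (not in the patch) of the same decision logic.
fn effective_gpu_layers(requested: Option<usize>) -> i32 {
    // Omitting --gpu-layers now means "no GPU offload" (previously hard-coded to 1000).
    #[allow(unused_mut)]
    let mut n = requested.unwrap_or(0) as i32;

    // Mirror of the guard in do_load(): GPU offload is known to fail on Intel macs,
    // so force it off there, but only warn if offload was actually requested.
    #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
    {
        if n > 0 {
            eprintln!("disabling GPU (Intel macOS)");
            n = 0;
        }
    }

    n
}

fn main() {
    assert_eq!(effective_gpu_layers(None), 0);
    // Prints 35 on most targets, 0 on Intel macOS.
    println!("{}", effective_gpu_layers(Some(35)));
}

Passing --gpu-layers 35 (or -g 35) to rllm-cpp corresponds to Some(35) here; leaving the flag off keeps the model fully on CPU instead of the previous always-offload default.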