add --gpu-layers to rllm-cpp; see #46
mmoskal committed Feb 1, 2024
1 parent 57e3c2f commit 86e2393
Showing 4 changed files with 19 additions and 5 deletions.
5 changes: 5 additions & 0 deletions rllm-cpp/src/rllm-cpp.rs
@@ -11,11 +11,16 @@ pub struct CppArgs {
/// Name of .gguf file inside of the model folder/repo.
#[arg(long, help_heading = "Model")]
pub gguf: Option<String>,

/// How many model layers to offload to GPU (if available)
#[arg(long, short = 'g', help_heading = "Model")]
pub gpu_layers: Option<usize>,
}

#[actix_web::main]
async fn main() -> () {
let mut args = parse_with_settings::<CppArgs>();
args.args.gguf = args.gguf;
args.args.n_gpu_layers = args.gpu_layers;
rllm::server::server_main(args.args).await;
}
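
For context, a minimal, self-contained sketch of how such an optional --gpu-layers / -g flag is declared with clap's derive API. The Args struct below is a hypothetical stand-in, not the repo's CppArgs (which also sets help_heading and goes through parse_with_settings):

use clap::Parser;

/// Hypothetical stand-in for CppArgs; it only shows how an optional
/// --gpu-layers / -g flag is declared with clap's derive API.
#[derive(Parser, Debug)]
struct Args {
    /// Name of .gguf file inside of the model folder/repo.
    #[arg(long)]
    gguf: Option<String>,

    /// How many model layers to offload to GPU (if available).
    #[arg(long, short = 'g')]
    gpu_layers: Option<usize>,
}

fn main() {
    let args = Args::parse();
    // None just means the flag was not given; the loader later treats that as 0 layers.
    println!("gpu_layers = {:?}", args.gpu_layers);
}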
5 changes: 4 additions & 1 deletion rllm-cuda/src/lib.rs
@@ -42,10 +42,12 @@ pub struct LoaderArgs {
pub model_id: String,
pub revision: Option<String>,
pub local_weights: Option<String>,
pub gguf: Option<String>,
pub alt: usize,
pub aici: AiciConfig,

// llama.cpp-specific
pub gguf: Option<String>,
pub n_gpu_layers: Option<usize>,
#[cfg(not(feature = "tch"))]
pub(crate) cached_model: Option<llamacpp::Model>,

@@ -78,6 +80,7 @@ impl Default for LoaderArgs {
alt: 0,
dtype,
device,
n_gpu_layers: None,
#[cfg(not(feature = "tch"))]
cached_model: None,
}
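
A hypothetical, heavily simplified stand-in for LoaderArgs (LoaderArgsSketch below is not the real struct, which has many more fields) illustrating the pattern this diff follows: the new n_gpu_layers field defaults to None, so code that builds the struct via Default keeps working, and only the server overrides it.

/// Hypothetical stand-in for LoaderArgs: the new optional field defaults to None.
#[derive(Debug)]
struct LoaderArgsSketch {
    gguf: Option<String>,
    n_gpu_layers: Option<usize>,
}

impl Default for LoaderArgsSketch {
    fn default() -> Self {
        Self {
            gguf: None,
            n_gpu_layers: None,
        }
    }
}

fn main() {
    let mut loader_args = LoaderArgsSketch::default();
    loader_args.n_gpu_layers = Some(35); // e.g. the value taken from --gpu-layers
    println!("{loader_args:?}");
}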
9 changes: 6 additions & 3 deletions rllm-cuda/src/llamacpp/loader.rs
@@ -46,12 +46,15 @@ fn do_load(args: &mut LoaderArgs) -> Result<cpp::Model> {
let mut mparams = cpp::ModelParams::default();
// TODO: make this configurable
mparams.set_split_mode(cpp::SplitMode::None);
mparams.n_gpu_layers = 1000;
mparams.n_gpu_layers = args.n_gpu_layers.unwrap_or(0) as i32;
log::info!("{} layer(s) offloaded to GPU", mparams.n_gpu_layers);
// don't GPU offload on Intel macs - it just fails there
#[cfg(all(target_os = "macos", target_arch = "x86_64"))]
{
log::warn!("disabling GPU (Intel macOS)");
mparams.n_gpu_layers = 0;
if mparams.n_gpu_layers > 0 {
log::warn!("disabling GPU (Intel macOS)");
mparams.n_gpu_layers = 0;
}
}

let m = cpp::Model::from_file(file.to_str().unwrap(), mparams)?;
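
A self-contained sketch of the new offload decision, assuming only n_gpu_layers matters here; ModelParamsSketch and gpu_layers_for are hypothetical names standing in for cpp::ModelParams and the relevant part of do_load:

/// Hypothetical stand-in for cpp::ModelParams; only n_gpu_layers is modeled.
struct ModelParamsSketch {
    n_gpu_layers: i32,
}

fn gpu_layers_for(requested: Option<usize>) -> i32 {
    // The default is now 0 (no offload) instead of the previous hard-coded 1000.
    #[allow(unused_mut)]
    let mut n = requested.unwrap_or(0) as i32;
    // GPU offload just fails on Intel macs, so it is forced off there; the
    // warning is only emitted when the user actually asked for offloaded layers.
    #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
    {
        if n > 0 {
            eprintln!("disabling GPU (Intel macOS)");
            n = 0;
        }
    }
    n
}

fn main() {
    let mparams = ModelParamsSketch {
        n_gpu_layers: gpu_layers_for(Some(35)),
    };
    println!("{} layer(s) offloaded to GPU", mparams.n_gpu_layers);
}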
5 changes: 4 additions & 1 deletion rllm-cuda/src/server/mod.rs
@@ -22,9 +22,9 @@ use std::{
};
use tokio::sync::mpsc::{channel, error::TryRecvError, Receiver, Sender};

mod api;
mod completion;
mod openai;
mod api;

#[derive(Debug)]
pub struct APIError {
@@ -206,6 +206,8 @@ pub struct RllmCliArgs {
// these are copied from command-specific parsers
#[arg(skip)]
pub gguf: Option<String>,
#[arg(skip)]
pub n_gpu_layers: Option<usize>,
}

#[actix_web::get("/v1/controllers/tags")]
@@ -594,6 +596,7 @@ pub async fn server_main(mut args: RllmCliArgs) -> () {
loader_args.revision = args.revision.clone();
loader_args.local_weights = args.local_weights.clone();
loader_args.gguf = args.gguf.clone();
loader_args.n_gpu_layers = args.n_gpu_layers;
if dtype.is_some() {
loader_args.dtype = dtype;
}
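
Finally, a hypothetical sketch of the #[arg(skip)] pattern that RllmCliArgs relies on: fields marked skip are never parsed by clap at this level and are instead copied in from the command-specific parser (rllm-cpp's --gpu-layers in this commit). SharedArgsSketch is an illustrative stand-in, not the real struct:

use clap::Parser;

/// Hypothetical stand-in for RllmCliArgs showing the #[arg(skip)] pattern.
#[derive(Parser, Debug)]
struct SharedArgsSketch {
    /// Parsed from the command line as usual.
    #[arg(long)]
    model: Option<String>,

    /// Not a flag at this level; copied over from the cpp-specific parser.
    #[arg(skip)]
    gguf: Option<String>,

    /// Same: populated from rllm-cpp's --gpu-layers before server_main runs.
    #[arg(skip)]
    n_gpu_layers: Option<usize>,
}

fn main() {
    let mut shared = SharedArgsSketch::parse();
    // In the real code this assignment happens in rllm-cpp's main().
    shared.n_gpu_layers = Some(35);
    println!("{shared:?}");
}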
