Skip to content

Commit

Permalink
split cpp/cuda code into folders
Browse files: browse the repository at this point in the history
  • Loading branch information
mmoskal committed Feb 7, 2024
1 parent 12ae462 commit 469cf64
Show file tree
Hide file tree
Showing 38 changed files with 112 additions and 61 deletions.
42 changes: 37 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ members = [
"pyctrl",
"jsctrl",
"uppercase",
"rllm-lib",
"rllm-cuda",
"rllm-cpp",
"tch-cuda",
Expand Down
3 changes: 2 additions & 1 deletion rllm-cpp/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ rust-version = "1.75.0"

[dependencies]
actix-web = "4.4.0"
anyhow = "1.0.79"
clap = { version = "4.4.18", features = ["derive"] }
llama_cpp_low = { path = "../llama-cpp-low" }
rllm = { path = "../rllm-cuda", default-features = false, features = ["llamacpp"] }
rllm = { path = "../rllm-lib" }

[[bin]]
name = "rllm-cpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{
use rllm::{
seq::{Sequence, SequenceGroup},
SchedulerOutputs, TBlockSpaceManager,
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::sync::Arc;

use crate::{config::ModelMeta, LoaderArgs, Repo, RllmEngine};
use rllm::{config::ModelMeta, LoaderArgs, Repo, RllmEngine};
use anyhow::{bail, Result};

use llama_cpp_low as cpp;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::TensorOps;
use rllm::TensorOps;

pub mod blocks;
pub mod loader;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::sync::Mutex;

use crate::{HashMap, SeqId, SequenceManager};
use rllm::{HashMap, SeqId, SequenceManager};
use llama_cpp_low as cpp;

pub struct CppSequenceManager {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{
use rllm::{
config::{ModelMeta, RllmConfig}, seq::SchedulingPhase, AiciBias, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps
};
use aicirt::{with_timer, TimerRef};
Expand Down Expand Up @@ -179,7 +179,7 @@ impl ModelExec for TModel {
fn load_rllm_engine(
args: LoaderArgs,
model_args: Self::ModelLoaderArgs,
) -> Result<crate::RllmEngine<Self>> {
) -> Result<rllm::RllmEngine<Self>> {
load_rllm_engine(args, model_args)
}

Expand Down
7 changes: 3 additions & 4 deletions rllm-cpp/src/rllm-cpp.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
mod llamacpp;
use clap::Parser;
use rllm::{
llamacpp::tmodel::{CppLoaderArgs, TModel},
util::parse_with_settings,
};
use llamacpp::tmodel::{CppLoaderArgs, TModel};
use rllm::util::parse_with_settings;

/// Serve LLMs with AICI over HTTP with llama.cpp backend.
#[derive(Parser, Debug)]
Expand Down
18 changes: 7 additions & 11 deletions rllm-cuda/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "rllm"
name = "rllm-cuda"
version = "0.1.0"
edition = "2021"
default-run = "rllm-server"
Expand All @@ -20,14 +20,13 @@ futures = "0.3.29"
uuid = { version = "1.6.1", features = ["v4"] }
derive_more = "0.99.17"

tch = { version = "0.14.0", optional = true }
torch-sys = { version = "0.14.0", optional = true }
tch = { version = "0.14.0" }
torch-sys = { version = "0.14.0" }

cudarc = { version = "0.10.0", features = ["f16"], optional = true }
tch-cuda = { path = "../tch-cuda", optional = true }

llama_cpp_low = { path = "../llama-cpp-low", optional = true }

rllm = { path = "../rllm-lib" }
aicirt = { path = "../aicirt" }
aici_abi = { path = "../aici_abi" }
libc = "0.2.150"
Expand All @@ -42,11 +41,8 @@ percent-encoding = "2.3.1"

[[bin]]
name = "rllm-server"
path = "src/driver.rs"
path = "src/rllm-cuda.rs"

[features]
#default = ["llamacpp"]
default = ["tch", "cuda"]
tch = ["dep:tch", "dep:torch-sys"]
cuda = ["tch", "dep:tch-cuda", "dep:cudarc", "llama_cpp_low?/cuda"]
llamacpp = ["dep:llama_cpp_low"]
default = ["cuda"]
cuda = ["dep:tch-cuda", "dep:cudarc"]
6 changes: 0 additions & 6 deletions rllm-cuda/SHIP_TODO.md

This file was deleted.

2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/config.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::config::{ModelMeta, RllmConfig};
use rllm::config::{ModelMeta, RllmConfig};
use aicirt::bail_user;
use anyhow::Result;
use tch::Device;
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use super::{
tmodel::TModel,
util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
};
use crate::{
use rllm::{
config::{ModelMeta, RllmConfig},
CacheSize, HashSet, LoaderArgs, Repo, RllmEngine,
};
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/paged/batch_info.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use super::super::{kernels::to_offsets, tmodel::TModel};
use super::cache_engine::CacheEngine;
use super::BlockAllocator;
use crate::{
use rllm::{
config::RllmConfig, seq::SchedulingPhase, util::pad_to_multiple, HashMap, SchedulerOutputs,
};
use aicirt::api::Token;
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/paged/blocks.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::super::tmodel::TModel;
use super::cache_engine::CacheEngine;
use crate::{
use rllm::{
config::RllmConfig,
seq::{SchedulingPhase, Sequence, SequenceGroup},
BlockLocation, CacheSize, HashMap, SchedulerOutputs, SeqId, SequenceManager,
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/paged/cache_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use super::super::{config::TchRllmConfig, kernels, tmodel::TModel};
use super::CacheIface;
use crate::{config::RllmConfig, CacheSize, HashMap};
use rllm::{config::RllmConfig, CacheSize, HashMap};
use std::sync::Arc;
use tch::{Device, Tensor};

Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/refkernels.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::util::{check_all_close_attn, to_vec1};
use crate::HashMap;
use rllm::HashMap;
use tch::{IndexOp, Kind, Tensor};

pub fn reshape_and_cache(
Expand Down
10 changes: 5 additions & 5 deletions rllm-cuda/src/llm/tmodel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use super::{
util::synchronize,
DType,
};
use crate::{
use rllm::{
config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps,
};
use aicirt::{with_timer, TimerRef};
Expand Down Expand Up @@ -45,9 +45,9 @@ impl ModelExec for TModel {
type SequenceManager = TchSeqMgr;

fn load_model_config(
args: &crate::LoaderArgs,
args: &rllm::LoaderArgs,
model_args: &mut Self::ModelLoaderArgs,
) -> Result<(crate::config::ModelMeta, Self::ModelConfig)> {
) -> Result<(rllm::config::ModelMeta, Self::ModelConfig)> {
let m = load_model_config(args, model_args)?;
Ok((m.meta.clone(), m))
}
Expand All @@ -57,9 +57,9 @@ impl ModelExec for TModel {
}

fn load_rllm_engine(
args: crate::LoaderArgs,
args: rllm::LoaderArgs,
model_args: Self::ModelLoaderArgs,
) -> Result<crate::RllmEngine<Self>> {
) -> Result<rllm::RllmEngine<Self>> {
load_rllm_engine(args, model_args)
}

Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/util.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::DType;
use crate::util::get_setting;
use rllm::util::get_setting;
use tch::{kind::Element, Device, IndexOp as _, Tensor};

#[cfg(feature = "cuda")]
Expand Down
18 changes: 11 additions & 7 deletions rllm-cuda/src/driver.rs → rllm-cuda/src/rllm-cuda.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
mod llm;

use clap::Parser;
use rllm::{
llm::{
tmodel::{TModel, TchLoaderArgs},
DType,
},
util::parse_with_settings,
use llm::{
tmodel::{TModel, TchLoaderArgs},
DType,
};
use rllm::util::parse_with_settings;
use tch::Device;

/// Serve LLMs with AICI over HTTP with tch (torch) backend.
Expand Down Expand Up @@ -48,6 +48,10 @@ async fn main() -> () {
_ => panic!("invalid dtype; try one of bf16, f16, f32"),
};

let model_args = TchLoaderArgs { device, dtype, profile_step_no: args.profile_step };
let model_args = TchLoaderArgs {
device,
dtype,
profile_step_no: args.profile_step,
};
rllm::server::server_main::<TModel>(args.args, model_args).await;
}
32 changes: 32 additions & 0 deletions rllm-lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[package]
name = "rllm"
version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = "1.0.75"
clap = "4.4.8"
hf-hub = "0.3.2"
tokenizers = { version = "0.15.0", features = ["hf-hub"] }
serde_json = "1.0.108"
serde = { version = "1.0.193", features = ["derive"] }
rand = "0.8.5"
half = "2.3.1"
log = "0.4.20"
actix-web = "4.4.0"
tokio = { version = "1.34.0", features = ["sync"] }
futures = "0.3.29"
uuid = { version = "1.6.1", features = ["v4"] }
derive_more = "0.99.17"

aicirt = { path = "../aicirt" }
aici_abi = { path = "../aici_abi" }
libc = "0.2.150"
base64 = "0.21.5"
indicatif = "0.17.7"
memmap2 = "0.9.0"
safetensors = "0.4.1"
lazy_static = "1.4.0"
fxhash = "0.2.1"
cfg-if = "1.0.0"
percent-encoding = "2.3.1"
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
8 changes: 0 additions & 8 deletions rllm-cuda/src/lib.rs → rllm-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,6 @@ pub use logits::LogitsProcessor;
pub use scheduler::*;
use std::sync::atomic::AtomicBool;

cfg_if::cfg_if! {
if #[cfg(feature = "tch")] {
pub mod llm;
} else {
pub mod llamacpp;
}
}

pub use fxhash::FxHashMap as HashMap;
pub use fxhash::FxHashSet as HashSet;

Expand Down
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion rllm-cuda/src/seq.rs → rllm-lib/src/seq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ pub struct Sequence {
pub prompt_len: usize,
pub(crate) output_ptr: usize,
pub(crate) output_pending: Vec<u8>,
pub(crate) num_kv_computed: usize,
pub num_kv_computed: usize,
pub(crate) has_aici: bool,
pub(crate) aici_sampling: AiciSampling,
pub aici_logs: Vec<SequenceResult>,
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 469cf64

Please sign in to comment.