split cpp/cuda code into folders

microsoft · Feb 7, 2024 · 469cf64
1 parent 12ae462
commit 469cf64
Show file tree

Hide file tree

Showing 38 changed files with 112 additions and 61 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -7,6 +7,7 @@ members = [
     "pyctrl",
     "jsctrl",
     "uppercase",
+    "rllm-lib",
     "rllm-cuda",
     "rllm-cpp",
     "tch-cuda",

diff --git a/rllm-cpp/Cargo.toml b/rllm-cpp/Cargo.toml
@@ -6,9 +6,10 @@ rust-version = "1.75.0"
 
 [dependencies]
 actix-web = "4.4.0"
+anyhow = "1.0.79"
 clap = { version = "4.4.18", features = ["derive"] }
 llama_cpp_low = { path = "../llama-cpp-low" }
-rllm = { path = "../rllm-cuda", default-features = false, features = ["llamacpp"] }
+rllm = { path = "../rllm-lib" }
 
 [[bin]]
 name = "rllm-cpp"

diff --git a/rllm-cuda/src/llamacpp/blocks.rs → rllm-cpp/src/llamacpp/blocks.rs b/rllm-cuda/src/llamacpp/blocks.rs → rllm-cpp/src/llamacpp/blocks.rs
@@ -1,4 +1,4 @@
-use crate::{
+use rllm::{
     seq::{Sequence, SequenceGroup},
     SchedulerOutputs, TBlockSpaceManager,
 };

diff --git a/rllm-cuda/src/llamacpp/loader.rs → rllm-cpp/src/llamacpp/loader.rs b/rllm-cuda/src/llamacpp/loader.rs → rllm-cpp/src/llamacpp/loader.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use crate::{config::ModelMeta, LoaderArgs, Repo, RllmEngine};
+use rllm::{config::ModelMeta, LoaderArgs, Repo, RllmEngine};
 use anyhow::{bail, Result};
 
 use llama_cpp_low as cpp;

diff --git a/rllm-cuda/src/llamacpp/mod.rs → rllm-cpp/src/llamacpp/mod.rs b/rllm-cuda/src/llamacpp/mod.rs → rllm-cpp/src/llamacpp/mod.rs
@@ -1,4 +1,4 @@
-use crate::TensorOps;
+use rllm::TensorOps;
 
 pub mod blocks;
 pub mod loader;

diff --git a/rllm-cuda/src/llamacpp/seqid.rs → rllm-cpp/src/llamacpp/seqid.rs b/rllm-cuda/src/llamacpp/seqid.rs → rllm-cpp/src/llamacpp/seqid.rs
@@ -1,6 +1,6 @@
 use std::sync::Mutex;
 
-use crate::{HashMap, SeqId, SequenceManager};
+use rllm::{HashMap, SeqId, SequenceManager};
 use llama_cpp_low as cpp;
 
 pub struct CppSequenceManager {

diff --git a/rllm-cuda/src/llamacpp/tmodel.rs → rllm-cpp/src/llamacpp/tmodel.rs b/rllm-cuda/src/llamacpp/tmodel.rs → rllm-cpp/src/llamacpp/tmodel.rs
@@ -1,4 +1,4 @@
-use crate::{
+use rllm::{
     config::{ModelMeta, RllmConfig}, seq::SchedulingPhase, AiciBias, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps
 };
 use aicirt::{with_timer, TimerRef};
@@ -179,7 +179,7 @@ impl ModelExec for TModel {
     fn load_rllm_engine(
         args: LoaderArgs,
         model_args: Self::ModelLoaderArgs,
-    ) -> Result<crate::RllmEngine<Self>> {
+    ) -> Result<rllm::RllmEngine<Self>> {
         load_rllm_engine(args, model_args)
     }
 

diff --git a/rllm-cpp/src/rllm-cpp.rs b/rllm-cpp/src/rllm-cpp.rs
@@ -1,8 +1,7 @@
+mod llamacpp;
 use clap::Parser;
-use rllm::{
-    llamacpp::tmodel::{CppLoaderArgs, TModel},
-    util::parse_with_settings,
-};
+use llamacpp::tmodel::{CppLoaderArgs, TModel};
+use rllm::util::parse_with_settings;
 
 /// Serve LLMs with AICI over HTTP with llama.cpp backend.
 #[derive(Parser, Debug)]

diff --git a/rllm-cuda/Cargo.toml b/rllm-cuda/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "rllm"
+name = "rllm-cuda"
 version = "0.1.0"
 edition = "2021"
 default-run = "rllm-server"
@@ -20,14 +20,13 @@ futures = "0.3.29"
 uuid = { version = "1.6.1", features = ["v4"] }
 derive_more = "0.99.17"
 
-tch = { version = "0.14.0", optional = true }
-torch-sys = { version = "0.14.0", optional = true }
+tch = { version = "0.14.0" }
+torch-sys = { version = "0.14.0" }
 
 cudarc = { version = "0.10.0", features = ["f16"], optional = true }
 tch-cuda = { path = "../tch-cuda", optional = true }
 
-llama_cpp_low = { path = "../llama-cpp-low", optional = true }
-
+rllm = { path = "../rllm-lib" }
 aicirt = { path = "../aicirt" }
 aici_abi = { path = "../aici_abi" }
 libc = "0.2.150"
@@ -42,11 +41,8 @@ percent-encoding = "2.3.1"
 
 [[bin]]
 name = "rllm-server"
-path = "src/driver.rs"
+path = "src/rllm-cuda.rs"
 
 [features]
-#default = ["llamacpp"]
-default = ["tch", "cuda"]
-tch = ["dep:tch", "dep:torch-sys"]
-cuda = ["tch", "dep:tch-cuda", "dep:cudarc", "llama_cpp_low?/cuda"]
-llamacpp = ["dep:llama_cpp_low"]
+default = ["cuda"]
+cuda = ["dep:tch-cuda", "dep:cudarc"]
diff --git a/rllm-cuda/SHIP_TODO.md b/rllm-cuda/SHIP_TODO.md
diff --git a/rllm-cuda/src/llm/config.rs b/rllm-cuda/src/llm/config.rs
@@ -1,4 +1,4 @@
-use crate::config::{ModelMeta, RllmConfig};
+use rllm::config::{ModelMeta, RllmConfig};
 use aicirt::bail_user;
 use anyhow::Result;
 use tch::Device;

diff --git a/rllm-cuda/src/llm/loader.rs b/rllm-cuda/src/llm/loader.rs
@@ -6,7 +6,7 @@ use super::{
     tmodel::TModel,
     util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
 };
-use crate::{
+use rllm::{
     config::{ModelMeta, RllmConfig},
     CacheSize, HashSet, LoaderArgs, Repo, RllmEngine,
 };

diff --git a/rllm-cuda/src/llm/paged/batch_info.rs b/rllm-cuda/src/llm/paged/batch_info.rs
@@ -1,7 +1,7 @@
 use super::super::{kernels::to_offsets, tmodel::TModel};
 use super::cache_engine::CacheEngine;
 use super::BlockAllocator;
-use crate::{
+use rllm::{
     config::RllmConfig, seq::SchedulingPhase, util::pad_to_multiple, HashMap, SchedulerOutputs,
 };
 use aicirt::api::Token;

diff --git a/rllm-cuda/src/llm/paged/blocks.rs b/rllm-cuda/src/llm/paged/blocks.rs
@@ -1,6 +1,6 @@
 use super::super::tmodel::TModel;
 use super::cache_engine::CacheEngine;
-use crate::{
+use rllm::{
     config::RllmConfig,
     seq::{SchedulingPhase, Sequence, SequenceGroup},
     BlockLocation, CacheSize, HashMap, SchedulerOutputs, SeqId, SequenceManager,

diff --git a/rllm-cuda/src/llm/paged/cache_engine.rs b/rllm-cuda/src/llm/paged/cache_engine.rs
@@ -2,7 +2,7 @@
 
 use super::super::{config::TchRllmConfig, kernels, tmodel::TModel};
 use super::CacheIface;
-use crate::{config::RllmConfig, CacheSize, HashMap};
+use rllm::{config::RllmConfig, CacheSize, HashMap};
 use std::sync::Arc;
 use tch::{Device, Tensor};
 

diff --git a/rllm-cuda/src/llm/refkernels.rs b/rllm-cuda/src/llm/refkernels.rs
@@ -1,5 +1,5 @@
 use super::util::{check_all_close_attn, to_vec1};
-use crate::HashMap;
+use rllm::HashMap;
 use tch::{IndexOp, Kind, Tensor};
 
 pub fn reshape_and_cache(

diff --git a/rllm-cuda/src/llm/tmodel.rs b/rllm-cuda/src/llm/tmodel.rs
@@ -5,7 +5,7 @@ use super::{
     util::synchronize,
     DType,
 };
-use crate::{
+use rllm::{
     config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps,
 };
 use aicirt::{with_timer, TimerRef};
@@ -45,9 +45,9 @@ impl ModelExec for TModel {
     type SequenceManager = TchSeqMgr;
 
     fn load_model_config(
-        args: &crate::LoaderArgs,
+        args: &rllm::LoaderArgs,
         model_args: &mut Self::ModelLoaderArgs,
-    ) -> Result<(crate::config::ModelMeta, Self::ModelConfig)> {
+    ) -> Result<(rllm::config::ModelMeta, Self::ModelConfig)> {
         let m = load_model_config(args, model_args)?;
         Ok((m.meta.clone(), m))
     }
@@ -57,9 +57,9 @@ impl ModelExec for TModel {
     }
 
     fn load_rllm_engine(
-        args: crate::LoaderArgs,
+        args: rllm::LoaderArgs,
         model_args: Self::ModelLoaderArgs,
-    ) -> Result<crate::RllmEngine<Self>> {
+    ) -> Result<rllm::RllmEngine<Self>> {
         load_rllm_engine(args, model_args)
     }
 

diff --git a/rllm-cuda/src/llm/util.rs b/rllm-cuda/src/llm/util.rs
@@ -1,5 +1,5 @@
 use super::DType;
-use crate::util::get_setting;
+use rllm::util::get_setting;
 use tch::{kind::Element, Device, IndexOp as _, Tensor};
 
 #[cfg(feature = "cuda")]

diff --git a/rllm-cuda/src/driver.rs → rllm-cuda/src/rllm-cuda.rs b/rllm-cuda/src/driver.rs → rllm-cuda/src/rllm-cuda.rs
@@ -1,11 +1,11 @@
+mod llm;
+
 use clap::Parser;
-use rllm::{
-    llm::{
-        tmodel::{TModel, TchLoaderArgs},
-        DType,
-    },
-    util::parse_with_settings,
+use llm::{
+    tmodel::{TModel, TchLoaderArgs},
+    DType,
 };
+use rllm::util::parse_with_settings;
 use tch::Device;
 
 /// Serve LLMs with AICI over HTTP with tch (torch) backend.
@@ -48,6 +48,10 @@ async fn main() -> () {
         _ => panic!("invalid dtype; try one of bf16, f16, f32"),
     };
 
-    let model_args = TchLoaderArgs { device, dtype, profile_step_no: args.profile_step };
+    let model_args = TchLoaderArgs {
+        device,
+        dtype,
+        profile_step_no: args.profile_step,
+    };
     rllm::server::server_main::<TModel>(args.args, model_args).await;
 }
diff --git a/rllm-lib/Cargo.toml b/rllm-lib/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "rllm"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = "1.0.75"
+clap = "4.4.8"
+hf-hub = "0.3.2"
+tokenizers = { version = "0.15.0", features = ["hf-hub"] }
+serde_json = "1.0.108"
+serde = { version = "1.0.193", features = ["derive"] }
+rand = "0.8.5"
+half = "2.3.1"
+log = "0.4.20"
+actix-web = "4.4.0"
+tokio = { version = "1.34.0", features = ["sync"] }
+futures = "0.3.29"
+uuid = { version = "1.6.1", features = ["v4"] }
+derive_more = "0.99.17"
+
+aicirt = { path = "../aicirt" }
+aici_abi = { path = "../aici_abi" }
+libc = "0.2.150"
+base64 = "0.21.5"
+indicatif = "0.17.7"
+memmap2 = "0.9.0"
+safetensors = "0.4.1"
+lazy_static = "1.4.0"
+fxhash = "0.2.1"
+cfg-if = "1.0.0"
+percent-encoding = "2.3.1"
diff --git a/rllm-cuda/src/config.rs → rllm-lib/src/config.rs b/rllm-cuda/src/config.rs → rllm-lib/src/config.rs
diff --git a/rllm-cuda/src/engine.rs → rllm-lib/src/engine.rs b/rllm-cuda/src/engine.rs → rllm-lib/src/engine.rs
diff --git a/rllm-cuda/src/exec.rs → rllm-lib/src/exec.rs b/rllm-cuda/src/exec.rs → rllm-lib/src/exec.rs
diff --git a/rllm-cuda/src/expected.rs → rllm-lib/src/expected.rs b/rllm-cuda/src/expected.rs → rllm-lib/src/expected.rs
diff --git a/rllm-cuda/src/iface.rs → rllm-lib/src/iface.rs b/rllm-cuda/src/iface.rs → rllm-lib/src/iface.rs
diff --git a/rllm-cuda/src/lib.rs → rllm-lib/src/lib.rs b/rllm-cuda/src/lib.rs → rllm-lib/src/lib.rs
@@ -18,14 +18,6 @@ pub use logits::LogitsProcessor;
 pub use scheduler::*;
 use std::sync::atomic::AtomicBool;
 
-cfg_if::cfg_if! {
-    if #[cfg(feature = "tch")] {
-        pub mod llm;
-    } else {
-        pub mod llamacpp;
-    }
-}
-
 pub use fxhash::FxHashMap as HashMap;
 pub use fxhash::FxHashSet as HashSet;
 

diff --git a/rllm-cuda/src/logits.rs → rllm-lib/src/logits.rs b/rllm-cuda/src/logits.rs → rllm-lib/src/logits.rs
diff --git a/rllm-cuda/src/scheduler.rs → rllm-lib/src/scheduler.rs b/rllm-cuda/src/scheduler.rs → rllm-lib/src/scheduler.rs
diff --git a/rllm-cuda/src/seq.rs → rllm-lib/src/seq.rs b/rllm-cuda/src/seq.rs → rllm-lib/src/seq.rs
@@ -75,7 +75,7 @@ pub struct Sequence {
     pub prompt_len: usize,
     pub(crate) output_ptr: usize,
     pub(crate) output_pending: Vec<u8>,
-    pub(crate) num_kv_computed: usize,
+    pub num_kv_computed: usize,
     pub(crate) has_aici: bool,
     pub(crate) aici_sampling: AiciSampling,
     pub aici_logs: Vec<SequenceResult>,

diff --git a/rllm-cuda/src/server/api.rs → rllm-lib/src/server/api.rs b/rllm-cuda/src/server/api.rs → rllm-lib/src/server/api.rs
diff --git a/rllm-cuda/src/server/completion.rs → rllm-lib/src/server/completion.rs b/rllm-cuda/src/server/completion.rs → rllm-lib/src/server/completion.rs
diff --git a/rllm-cuda/src/server/mod.rs → rllm-lib/src/server/mod.rs b/rllm-cuda/src/server/mod.rs → rllm-lib/src/server/mod.rs
diff --git a/rllm-cuda/src/server/openai/LICENSE → rllm-lib/src/server/openai/LICENSE b/rllm-cuda/src/server/openai/LICENSE → rllm-lib/src/server/openai/LICENSE
diff --git a/rllm-cuda/src/server/openai/mod.rs → rllm-lib/src/server/openai/mod.rs b/rllm-cuda/src/server/openai/mod.rs → rllm-lib/src/server/openai/mod.rs
diff --git a/rllm-cuda/src/server/openai/requests.rs → rllm-lib/src/server/openai/requests.rs b/rllm-cuda/src/server/openai/requests.rs → rllm-lib/src/server/openai/requests.rs
diff --git a/rllm-cuda/src/server/openai/responses.rs → rllm-lib/src/server/openai/responses.rs b/rllm-cuda/src/server/openai/responses.rs → rllm-lib/src/server/openai/responses.rs
diff --git a/rllm-cuda/src/util.rs → rllm-lib/src/util.rs b/rllm-cuda/src/util.rs → rllm-lib/src/util.rs