diff --git a/rllm-cuda/src/engine.rs b/rllm-cuda/src/engine.rs index 77e20bb4..128d00c3 100644 --- a/rllm-cuda/src/engine.rs +++ b/rllm-cuda/src/engine.rs @@ -1,14 +1,13 @@ use crate::{ config::{CacheConfig, ParallelConfig, RllmConfig, SamplingParams, SchedulerConfig}, iface::AiciRtIface, - paged::{CacheSize, Scheduler, SchedulerOutputs}, seq::{ AiciSampling, FinishReason, RequestOutput, SchedulingPhase, SeqOutput, Sequence, SequenceGroup, Token, TokenUsage, }, util::get_setting, - AiciBias as _, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SequenceManager, - TBlockSpaceManager as _, TensorOps, + AiciBias as _, CacheSize, HashMap, LoaderArgs, LogitsProcessor, ModelExec, Scheduler, + SchedulerOutputs, SequenceManager, TBlockSpaceManager as _, TensorOps, }; use aici_abi::toktree::TokTrie; use aicirt::{ diff --git a/rllm-cuda/src/exec.rs b/rllm-cuda/src/exec.rs index dd85df67..caa6d86c 100644 --- a/rllm-cuda/src/exec.rs +++ b/rllm-cuda/src/exec.rs @@ -5,7 +5,7 @@ use anyhow::Result; use crate::{ config::{ModelMeta, RllmConfig}, - paged::{CacheSize, SchedulerOutputs}, + scheduler::{CacheSize, SchedulerOutputs}, seq::{Sequence, SequenceGroup}, HashMap, LoaderArgs, LogitsProcessor, RllmEngine, }; diff --git a/rllm-cuda/src/lib.rs b/rllm-cuda/src/lib.rs index ea29efc2..ecf42d1e 100644 --- a/rllm-cuda/src/lib.rs +++ b/rllm-cuda/src/lib.rs @@ -1,4 +1,3 @@ -pub mod paged; pub mod seq; // vllm modules @@ -8,6 +7,7 @@ mod exec; mod expected; pub mod iface; mod logits; +mod scheduler; pub mod server; pub mod util; @@ -15,12 +15,13 @@ use config::AiciConfig; pub use engine::*; pub use exec::*; pub use logits::LogitsProcessor; +pub use scheduler::*; use std::sync::atomic::AtomicBool; cfg_if::cfg_if! { if #[cfg(feature = "tch")] { pub mod llm; - pub(crate) use paged::BlockRef; + pub(crate) use llm::paged::BlockRef; } else { pub mod llamacpp; pub use llamacpp::BlockRef; diff --git a/rllm-cuda/src/llm/kernels.rs b/rllm-cuda/src/llm/kernels.rs index 17086d01..819c9896 100644 --- a/rllm-cuda/src/llm/kernels.rs +++ b/rllm-cuda/src/llm/kernels.rs @@ -1,5 +1,5 @@ #[cfg(not(feature = "cuda"))] -pub use crate::llm::refkernels::*; +pub use super::refkernels::*; use tch::{Device, Tensor}; #[cfg(feature = "cuda")] pub use tch_cuda::flash_attn_varlen as varlen_attn; diff --git a/rllm-cuda/src/llm/llama.rs b/rllm-cuda/src/llm/llama.rs index a7f31138..8f27b3b8 100644 --- a/rllm-cuda/src/llm/llama.rs +++ b/rllm-cuda/src/llm/llama.rs @@ -2,13 +2,17 @@ use super::{ config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig}, - linear_no_bias, varlen_attn, RmsNorm, RotaryEmbedding, + linear_no_bias, + paged::BatchInfo, + varlen_attn, RmsNorm, RotaryEmbedding, }; -use crate::paged::BatchInfo; use anyhow::Result; use serde::Deserialize; use std::rc::Rc; -use tch::{nn::{self, Module, Path}, Tensor}; +use tch::{ + nn::{self, Module, Path}, + Tensor, +}; use super::tmodel::TModelInner; diff --git a/rllm-cuda/src/llm/loader.rs b/rllm-cuda/src/llm/loader.rs index 177fefa9..3dc75b27 100644 --- a/rllm-cuda/src/llm/loader.rs +++ b/rllm-cuda/src/llm/loader.rs @@ -1,13 +1,14 @@ +use super::{ + config::ModelType, + llama, + paged::{BatchInfoBuilder, CacheEngine}, + phi, + tmodel::TModel, + util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats}, +}; use crate::{ config::{ModelMeta, RllmConfig}, - llm::{ - config::ModelType, - llama, phi, - tmodel::TModel, - util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats}, - }, - paged::{BatchInfoBuilder, CacheEngine, CacheSize}, - HashSet, LoaderArgs, Repo, RllmEngine, + CacheSize, HashSet, LoaderArgs, Repo, RllmEngine, }; use anyhow::{bail, Result}; use safetensors::Dtype; diff --git a/rllm-cuda/src/llm/mod.rs b/rllm-cuda/src/llm/mod.rs index 762db8cd..c89c8547 100644 --- a/rllm-cuda/src/llm/mod.rs +++ b/rllm-cuda/src/llm/mod.rs @@ -7,17 +7,16 @@ pub mod refkernels; pub mod seqid; pub mod tmodel; pub mod util; +pub mod paged; use self::config::ModelConfig; -use crate::{ - llm::util::{check_all_close, check_all_close_attn}, - paged::BatchInfo, -}; +use paged::BatchInfo; use std::rc::Rc; use tch::{ nn::{self, Module, Path}, IndexOp, Tensor, }; +use util::{check_all_close, check_all_close_attn}; // note that this doesn't work for phi-2 - it seems particularly numerically unstable const CHECK: bool = false; diff --git a/rllm-cuda/src/paged/batch_info.rs b/rllm-cuda/src/llm/paged/batch_info.rs similarity index 99% rename from rllm-cuda/src/paged/batch_info.rs rename to rllm-cuda/src/llm/paged/batch_info.rs index c8baa564..f8a6a36b 100644 --- a/rllm-cuda/src/paged/batch_info.rs +++ b/rllm-cuda/src/llm/paged/batch_info.rs @@ -1,10 +1,10 @@ -use super::{cache_engine::CacheEngine, scheduler::SchedulerOutputs}; +use super::cache_engine::CacheEngine; use crate::{ config::RllmConfig, llm::{kernels::to_offsets, tmodel::TModel}, seq::SchedulingPhase, util::pad_to_multiple, - HashMap, + HashMap, SchedulerOutputs, }; use aicirt::api::Token; use std::{ diff --git a/rllm-cuda/src/paged/blocks.rs b/rllm-cuda/src/llm/paged/blocks.rs similarity index 98% rename from rllm-cuda/src/paged/blocks.rs rename to rllm-cuda/src/llm/paged/blocks.rs index 711b778a..e04fc906 100644 --- a/rllm-cuda/src/paged/blocks.rs +++ b/rllm-cuda/src/llm/paged/blocks.rs @@ -1,9 +1,9 @@ +use super::cache_engine::CacheEngine; use crate::{ config::RllmConfig, llm::tmodel::TModel, - paged::{cache_engine::CacheEngine, scheduler::SchedulerOutputs, CacheSize}, seq::{SchedulingPhase, Sequence, SequenceGroup}, - BlockLocation, HashMap, TBlockSpaceManager, + BlockLocation, CacheSize, HashMap, SchedulerOutputs, TBlockSpaceManager, }; use std::{ sync::{Arc, Mutex}, diff --git a/rllm-cuda/src/paged/cache_engine.rs b/rllm-cuda/src/llm/paged/cache_engine.rs similarity index 97% rename from rllm-cuda/src/paged/cache_engine.rs rename to rllm-cuda/src/llm/paged/cache_engine.rs index e3607f0e..2ac2462c 100644 --- a/rllm-cuda/src/paged/cache_engine.rs +++ b/rllm-cuda/src/llm/paged/cache_engine.rs @@ -7,13 +7,11 @@ use tch::{Device, Tensor}; use tch_cuda::{CudaEvent, CudaStream}; use crate::{ - config::RllmConfig, - llm::{config::TchRllmConfig, kernels, tmodel::TModel}, - HashMap, + config::RllmConfig, llm::{config::TchRllmConfig, kernels, tmodel::TModel}, CacheSize, HashMap }; use std::sync::Arc; -use super::{CacheIface, CacheSize}; +use super::CacheIface; type KVCache = (Tensor, Tensor); diff --git a/rllm-cuda/src/paged/cuda_stub.rs b/rllm-cuda/src/llm/paged/cuda_stub.rs similarity index 100% rename from rllm-cuda/src/paged/cuda_stub.rs rename to rllm-cuda/src/llm/paged/cuda_stub.rs diff --git a/rllm-cuda/src/llm/paged/mod.rs b/rllm-cuda/src/llm/paged/mod.rs new file mode 100644 index 00000000..84f62a1f --- /dev/null +++ b/rllm-cuda/src/llm/paged/mod.rs @@ -0,0 +1,10 @@ +#[cfg(not(feature = "cuda"))] +mod cuda_stub; + +mod batch_info; +mod blocks; +mod cache_engine; + +pub use batch_info::*; +pub use blocks::*; +pub use cache_engine::*; diff --git a/rllm-cuda/src/llm/phi.rs b/rllm-cuda/src/llm/phi.rs index ea5f1e92..76a556e1 100644 --- a/rllm-cuda/src/llm/phi.rs +++ b/rllm-cuda/src/llm/phi.rs @@ -1,8 +1,9 @@ use super::{ config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig}, - layer_norm, linear, varlen_attn, RotaryEmbedding, + layer_norm, linear, + paged::BatchInfo, + varlen_attn, RotaryEmbedding, }; -use crate::paged::BatchInfo; use serde::Deserialize; use std::rc::Rc; use tch::{ diff --git a/rllm-cuda/src/llm/refkernels.rs b/rllm-cuda/src/llm/refkernels.rs index 93881591..2286f69f 100644 --- a/rllm-cuda/src/llm/refkernels.rs +++ b/rllm-cuda/src/llm/refkernels.rs @@ -1,4 +1,4 @@ -use crate::llm::util::{check_all_close_attn, to_vec1}; +use super::util::{check_all_close_attn, to_vec1}; use crate::HashMap; use tch::{IndexOp, Kind, Tensor}; diff --git a/rllm-cuda/src/llm/tmodel.rs b/rllm-cuda/src/llm/tmodel.rs index 5f3cfdc5..dd77276d 100644 --- a/rllm-cuda/src/llm/tmodel.rs +++ b/rllm-cuda/src/llm/tmodel.rs @@ -1,24 +1,20 @@ -use std::{sync::Arc, time::Instant}; - -use aicirt::{with_timer, TimerRef}; -use anyhow::Result; -use rand::distributions::Distribution as _; -use tch::{Device, IndexOp, Tensor}; - -use crate::{ - config::RllmConfig, - llm::{loader::load_model_config, util::synchronize, DType}, - paged::{ - BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface, SchedulerOutputs, - }, - AiciBias, LogitsProcessor, ModelExec, TensorOps, -}; - use super::{ config::{self, TchRllmConfig}, + loader::load_model_config, loader::load_rllm_engine, + paged::{BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface}, seqid::TchSeqMgr, + util::synchronize, + DType, +}; +use crate::{ + config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps, }; +use aicirt::{with_timer, TimerRef}; +use anyhow::Result; +use rand::distributions::Distribution as _; +use std::{sync::Arc, time::Instant}; +use tch::{Device, IndexOp, Tensor}; pub trait TModelInner { fn forward(&self, batch_info: &mut BatchInfo) -> Tensor; diff --git a/rllm-cuda/src/llm/util.rs b/rllm-cuda/src/llm/util.rs index 488f9507..fe5ed830 100644 --- a/rllm-cuda/src/llm/util.rs +++ b/rllm-cuda/src/llm/util.rs @@ -1,4 +1,5 @@ -use crate::{llm::DType, util::get_setting}; +use super::DType; +use crate::util::get_setting; use tch::{kind::Element, Device, IndexOp as _, Tensor}; #[cfg(feature = "cuda")] diff --git a/rllm-cuda/src/paged/mod.rs b/rllm-cuda/src/paged/mod.rs deleted file mode 100644 index abe3e10e..00000000 --- a/rllm-cuda/src/paged/mod.rs +++ /dev/null @@ -1,23 +0,0 @@ -mod scheduler; - -pub use scheduler::*; - -cfg_if::cfg_if! { - if #[cfg(feature = "tch")] { - #[cfg(not(feature = "cuda"))] - mod cuda_stub; - - mod blocks; - mod cache_engine; - mod batch_info; - - pub use batch_info::*; - pub use cache_engine::*; - pub use blocks::*; - } -} - -pub struct CacheSize { - pub gpu: usize, - pub cpu: usize, -} diff --git a/rllm-cuda/src/paged/scheduler.rs b/rllm-cuda/src/scheduler.rs similarity index 98% rename from rllm-cuda/src/paged/scheduler.rs rename to rllm-cuda/src/scheduler.rs index 96a7cb1b..d135ac42 100644 --- a/rllm-cuda/src/paged/scheduler.rs +++ b/rllm-cuda/src/scheduler.rs @@ -1,5 +1,8 @@ use crate::{ - config::RllmConfig, paged::CacheSize, seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup}, util::limit_str, HashMap, ModelExec, SequenceManager, TBlockSpaceManager + config::RllmConfig, + seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup}, + util::limit_str, + HashMap, ModelExec, SequenceManager, TBlockSpaceManager, }; use aicirt::api::SequenceResult; use std::{ @@ -484,3 +487,8 @@ impl Scheduler { } } } + +pub struct CacheSize { + pub gpu: usize, + pub cpu: usize, +}