Move paged into llm/ (scheduler.rs and CacheSize move up to the crate root)

microsoft · Feb 6, 2024 · b0254f8 · b0254f8
1 parent 662fd23
commit b0254f8
Show file tree

Hide file tree

Showing 18 changed files with 69 additions and 74 deletions.
diff --git a/rllm-cuda/src/engine.rs b/rllm-cuda/src/engine.rs
@@ -1,14 +1,13 @@
 use crate::{
     config::{CacheConfig, ParallelConfig, RllmConfig, SamplingParams, SchedulerConfig},
     iface::AiciRtIface,
-    paged::{CacheSize, Scheduler, SchedulerOutputs},
     seq::{
         AiciSampling, FinishReason, RequestOutput, SchedulingPhase, SeqOutput, Sequence,
         SequenceGroup, Token, TokenUsage,
     },
     util::get_setting,
-    AiciBias as _, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SequenceManager,
-    TBlockSpaceManager as _, TensorOps,
+    AiciBias as _, CacheSize, HashMap, LoaderArgs, LogitsProcessor, ModelExec, Scheduler,
+    SchedulerOutputs, SequenceManager, TBlockSpaceManager as _, TensorOps,
 };
 use aici_abi::toktree::TokTrie;
 use aicirt::{

diff --git a/rllm-cuda/src/exec.rs b/rllm-cuda/src/exec.rs
@@ -5,7 +5,7 @@ use anyhow::Result;
 
 use crate::{
     config::{ModelMeta, RllmConfig},
-    paged::{CacheSize, SchedulerOutputs},
+    scheduler::{CacheSize, SchedulerOutputs},
     seq::{Sequence, SequenceGroup},
     HashMap, LoaderArgs, LogitsProcessor, RllmEngine,
 };

diff --git a/rllm-cuda/src/lib.rs b/rllm-cuda/src/lib.rs
@@ -1,4 +1,3 @@
-pub mod paged;
 pub mod seq;
 
 // vllm modules
@@ -8,19 +7,21 @@ mod exec;
 mod expected;
 pub mod iface;
 mod logits;
+mod scheduler;
 pub mod server;
 pub mod util;
 
 use config::AiciConfig;
 pub use engine::*;
 pub use exec::*;
 pub use logits::LogitsProcessor;
+pub use scheduler::*;
 use std::sync::atomic::AtomicBool;
 
 cfg_if::cfg_if! {
     if #[cfg(feature = "tch")] {
         pub mod llm;
-        pub(crate) use paged::BlockRef;
+        pub(crate) use llm::paged::BlockRef;
     } else {
         pub mod llamacpp;
         pub use llamacpp::BlockRef;

diff --git a/rllm-cuda/src/llm/kernels.rs b/rllm-cuda/src/llm/kernels.rs
@@ -1,5 +1,5 @@
 #[cfg(not(feature = "cuda"))]
-pub use crate::llm::refkernels::*;
+pub use super::refkernels::*;
 use tch::{Device, Tensor};
 #[cfg(feature = "cuda")]
 pub use tch_cuda::flash_attn_varlen as varlen_attn;

diff --git a/rllm-cuda/src/llm/llama.rs b/rllm-cuda/src/llm/llama.rs
@@ -2,13 +2,17 @@
 
 use super::{
     config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig},
-    linear_no_bias, varlen_attn, RmsNorm, RotaryEmbedding,
+    linear_no_bias,
+    paged::BatchInfo,
+    varlen_attn, RmsNorm, RotaryEmbedding,
 };
-use crate::paged::BatchInfo;
 use anyhow::Result;
 use serde::Deserialize;
 use std::rc::Rc;
-use tch::{nn::{self, Module, Path}, Tensor};
+use tch::{
+    nn::{self, Module, Path},
+    Tensor,
+};
 
 use super::tmodel::TModelInner;
 

diff --git a/rllm-cuda/src/llm/loader.rs b/rllm-cuda/src/llm/loader.rs
@@ -1,13 +1,14 @@
+use super::{
+    config::ModelType,
+    llama,
+    paged::{BatchInfoBuilder, CacheEngine},
+    phi,
+    tmodel::TModel,
+    util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
+};
 use crate::{
     config::{ModelMeta, RllmConfig},
-    llm::{
-        config::ModelType,
-        llama, phi,
-        tmodel::TModel,
-        util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
-    },
-    paged::{BatchInfoBuilder, CacheEngine, CacheSize},
-    HashSet, LoaderArgs, Repo, RllmEngine,
+    CacheSize, HashSet, LoaderArgs, Repo, RllmEngine,
 };
 use anyhow::{bail, Result};
 use safetensors::Dtype;

diff --git a/rllm-cuda/src/llm/mod.rs b/rllm-cuda/src/llm/mod.rs
@@ -7,17 +7,16 @@ pub mod refkernels;
 pub mod seqid;
 pub mod tmodel;
 pub mod util;
+pub mod paged;
 
 use self::config::ModelConfig;
-use crate::{
-    llm::util::{check_all_close, check_all_close_attn},
-    paged::BatchInfo,
-};
+use paged::BatchInfo;
 use std::rc::Rc;
 use tch::{
     nn::{self, Module, Path},
     IndexOp, Tensor,
 };
+use util::{check_all_close, check_all_close_attn};
 
 // note that this doesn't work for phi-2 - it seems particularly numerically unstable
 const CHECK: bool = false;

diff --git a/rllm-cuda/src/paged/batch_info.rs → rllm-cuda/src/llm/paged/batch_info.rs b/rllm-cuda/src/paged/batch_info.rs → rllm-cuda/src/llm/paged/batch_info.rs
@@ -1,10 +1,10 @@
-use super::{cache_engine::CacheEngine, scheduler::SchedulerOutputs};
+use super::cache_engine::CacheEngine;
 use crate::{
     config::RllmConfig,
     llm::{kernels::to_offsets, tmodel::TModel},
     seq::SchedulingPhase,
     util::pad_to_multiple,
-    HashMap,
+    HashMap, SchedulerOutputs,
 };
 use aicirt::api::Token;
 use std::{

diff --git a/rllm-cuda/src/paged/blocks.rs → rllm-cuda/src/llm/paged/blocks.rs b/rllm-cuda/src/paged/blocks.rs → rllm-cuda/src/llm/paged/blocks.rs
@@ -1,9 +1,9 @@
+use super::cache_engine::CacheEngine;
 use crate::{
     config::RllmConfig,
     llm::tmodel::TModel,
-    paged::{cache_engine::CacheEngine, scheduler::SchedulerOutputs, CacheSize},
     seq::{SchedulingPhase, Sequence, SequenceGroup},
-    BlockLocation, HashMap, TBlockSpaceManager,
+    BlockLocation, CacheSize, HashMap, SchedulerOutputs, TBlockSpaceManager,
 };
 use std::{
     sync::{Arc, Mutex},

diff --git a/rllm-cuda/src/paged/cache_engine.rs → rllm-cuda/src/llm/paged/cache_engine.rs b/rllm-cuda/src/paged/cache_engine.rs → rllm-cuda/src/llm/paged/cache_engine.rs
@@ -7,13 +7,11 @@ use tch::{Device, Tensor};
 use tch_cuda::{CudaEvent, CudaStream};
 
 use crate::{
-    config::RllmConfig,
-    llm::{config::TchRllmConfig, kernels, tmodel::TModel},
-    HashMap,
+    config::RllmConfig, llm::{config::TchRllmConfig, kernels, tmodel::TModel}, CacheSize, HashMap
 };
 use std::sync::Arc;
 
-use super::{CacheIface, CacheSize};
+use super::CacheIface;
 
 type KVCache = (Tensor, Tensor);
 

diff --git a/rllm-cuda/src/paged/cuda_stub.rs → rllm-cuda/src/llm/paged/cuda_stub.rs b/rllm-cuda/src/paged/cuda_stub.rs → rllm-cuda/src/llm/paged/cuda_stub.rs
diff --git a/rllm-cuda/src/llm/paged/mod.rs b/rllm-cuda/src/llm/paged/mod.rs
@@ -0,0 +1,10 @@
+#[cfg(not(feature = "cuda"))]
+mod cuda_stub;
+
+mod batch_info;
+mod blocks;
+mod cache_engine;
+
+pub use batch_info::*;
+pub use blocks::*;
+pub use cache_engine::*;
diff --git a/rllm-cuda/src/llm/phi.rs b/rllm-cuda/src/llm/phi.rs
@@ -1,8 +1,9 @@
 use super::{
     config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig},
-    layer_norm, linear, varlen_attn, RotaryEmbedding,
+    layer_norm, linear,
+    paged::BatchInfo,
+    varlen_attn, RotaryEmbedding,
 };
-use crate::paged::BatchInfo;
 use serde::Deserialize;
 use std::rc::Rc;
 use tch::{

diff --git a/rllm-cuda/src/llm/refkernels.rs b/rllm-cuda/src/llm/refkernels.rs
@@ -1,4 +1,4 @@
-use crate::llm::util::{check_all_close_attn, to_vec1};
+use super::util::{check_all_close_attn, to_vec1};
 use crate::HashMap;
 use tch::{IndexOp, Kind, Tensor};
 

diff --git a/rllm-cuda/src/llm/tmodel.rs b/rllm-cuda/src/llm/tmodel.rs
@@ -1,24 +1,20 @@
-use std::{sync::Arc, time::Instant};
-
-use aicirt::{with_timer, TimerRef};
-use anyhow::Result;
-use rand::distributions::Distribution as _;
-use tch::{Device, IndexOp, Tensor};
-
-use crate::{
-    config::RllmConfig,
-    llm::{loader::load_model_config, util::synchronize, DType},
-    paged::{
-        BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface, SchedulerOutputs,
-    },
-    AiciBias, LogitsProcessor, ModelExec, TensorOps,
-};
-
 use super::{
     config::{self, TchRllmConfig},
+    loader::load_model_config,
     loader::load_rllm_engine,
+    paged::{BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface},
     seqid::TchSeqMgr,
+    util::synchronize,
+    DType,
+};
+use crate::{
+    config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps,
 };
+use aicirt::{with_timer, TimerRef};
+use anyhow::Result;
+use rand::distributions::Distribution as _;
+use std::{sync::Arc, time::Instant};
+use tch::{Device, IndexOp, Tensor};
 
 pub trait TModelInner {
     fn forward(&self, batch_info: &mut BatchInfo) -> Tensor;

diff --git a/rllm-cuda/src/llm/util.rs b/rllm-cuda/src/llm/util.rs
@@ -1,4 +1,5 @@
-use crate::{llm::DType, util::get_setting};
+use super::DType;
+use crate::util::get_setting;
 use tch::{kind::Element, Device, IndexOp as _, Tensor};
 
 #[cfg(feature = "cuda")]

diff --git a/rllm-cuda/src/paged/mod.rs b/rllm-cuda/src/paged/mod.rs
diff --git a/rllm-cuda/src/paged/scheduler.rs → rllm-cuda/src/scheduler.rs b/rllm-cuda/src/paged/scheduler.rs → rllm-cuda/src/scheduler.rs
@@ -1,5 +1,8 @@
 use crate::{
-    config::RllmConfig, paged::CacheSize, seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup}, util::limit_str, HashMap, ModelExec, SequenceManager, TBlockSpaceManager
+    config::RllmConfig,
+    seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup},
+    util::limit_str,
+    HashMap, ModelExec, SequenceManager, TBlockSpaceManager,
 };
 use aicirt::api::SequenceResult;
 use std::{
@@ -484,3 +487,8 @@ impl<ME: ModelExec> Scheduler<ME> {
         }
     }
 }
+
+pub struct CacheSize { // Pair of `usize` sizes, one labelled GPU and one CPU; exported from the crate root via `pub use scheduler::*`.
+    pub gpu: usize, // GPU-side size; the unit is not visible in this diff (presumably cache blocks — TODO confirm).
+    pub cpu: usize, // CPU-side size; presumably the same unit as `gpu` — TODO confirm.
+}