diff --git a/rllm-cuda/src/engine.rs b/rllm-cuda/src/engine.rs
index 77e20bb4..128d00c3 100644
--- a/rllm-cuda/src/engine.rs
+++ b/rllm-cuda/src/engine.rs
@@ -1,14 +1,13 @@
 use crate::{
     config::{CacheConfig, ParallelConfig, RllmConfig, SamplingParams, SchedulerConfig},
     iface::AiciRtIface,
-    paged::{CacheSize, Scheduler, SchedulerOutputs},
     seq::{
         AiciSampling, FinishReason, RequestOutput, SchedulingPhase, SeqOutput, Sequence,
         SequenceGroup, Token, TokenUsage,
     },
     util::get_setting,
-    AiciBias as _, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SequenceManager,
-    TBlockSpaceManager as _, TensorOps,
+    AiciBias as _, CacheSize, HashMap, LoaderArgs, LogitsProcessor, ModelExec, Scheduler,
+    SchedulerOutputs, SequenceManager, TBlockSpaceManager as _, TensorOps,
 };
 use aici_abi::toktree::TokTrie;
 use aicirt::{
diff --git a/rllm-cuda/src/exec.rs b/rllm-cuda/src/exec.rs
index dd85df67..caa6d86c 100644
--- a/rllm-cuda/src/exec.rs
+++ b/rllm-cuda/src/exec.rs
@@ -5,7 +5,7 @@ use anyhow::Result;
 
 use crate::{
     config::{ModelMeta, RllmConfig},
-    paged::{CacheSize, SchedulerOutputs},
+    scheduler::{CacheSize, SchedulerOutputs},
     seq::{Sequence, SequenceGroup},
     HashMap, LoaderArgs, LogitsProcessor, RllmEngine,
 };
diff --git a/rllm-cuda/src/lib.rs b/rllm-cuda/src/lib.rs
index ea29efc2..ecf42d1e 100644
--- a/rllm-cuda/src/lib.rs
+++ b/rllm-cuda/src/lib.rs
@@ -1,4 +1,3 @@
-pub mod paged;
 pub mod seq;
 
 // vllm modules
@@ -8,6 +7,7 @@ mod exec;
 mod expected;
 pub mod iface;
 mod logits;
+mod scheduler;
 pub mod server;
 pub mod util;
 
@@ -15,12 +15,13 @@ use config::AiciConfig;
 pub use engine::*;
 pub use exec::*;
 pub use logits::LogitsProcessor;
+pub use scheduler::*;
 use std::sync::atomic::AtomicBool;
 
 cfg_if::cfg_if! {
     if #[cfg(feature = "tch")] {
         pub mod llm;
-        pub(crate) use paged::BlockRef;
+        pub(crate) use llm::paged::BlockRef;
     } else {
         pub mod llamacpp;
         pub use llamacpp::BlockRef;
diff --git a/rllm-cuda/src/llm/kernels.rs b/rllm-cuda/src/llm/kernels.rs
index 17086d01..819c9896 100644
--- a/rllm-cuda/src/llm/kernels.rs
+++ b/rllm-cuda/src/llm/kernels.rs
@@ -1,5 +1,5 @@
 #[cfg(not(feature = "cuda"))]
-pub use crate::llm::refkernels::*;
+pub use super::refkernels::*;
 use tch::{Device, Tensor};
 #[cfg(feature = "cuda")]
 pub use tch_cuda::flash_attn_varlen as varlen_attn;
diff --git a/rllm-cuda/src/llm/llama.rs b/rllm-cuda/src/llm/llama.rs
index a7f31138..8f27b3b8 100644
--- a/rllm-cuda/src/llm/llama.rs
+++ b/rllm-cuda/src/llm/llama.rs
@@ -2,13 +2,17 @@
 
 use super::{
     config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig},
-    linear_no_bias, varlen_attn, RmsNorm, RotaryEmbedding,
+    linear_no_bias,
+    paged::BatchInfo,
+    varlen_attn, RmsNorm, RotaryEmbedding,
 };
-use crate::paged::BatchInfo;
 use anyhow::Result;
 use serde::Deserialize;
 use std::rc::Rc;
-use tch::{nn::{self, Module, Path}, Tensor};
+use tch::{
+    nn::{self, Module, Path},
+    Tensor,
+};
 
 use super::tmodel::TModelInner;
 
diff --git a/rllm-cuda/src/llm/loader.rs b/rllm-cuda/src/llm/loader.rs
index 177fefa9..3dc75b27 100644
--- a/rllm-cuda/src/llm/loader.rs
+++ b/rllm-cuda/src/llm/loader.rs
@@ -1,13 +1,14 @@
+use super::{
+    config::ModelType,
+    llama,
+    paged::{BatchInfoBuilder, CacheEngine},
+    phi,
+    tmodel::TModel,
+    util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
+};
 use crate::{
     config::{ModelMeta, RllmConfig},
-    llm::{
-        config::ModelType,
-        llama, phi,
-        tmodel::TModel,
-        util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
-    },
-    paged::{BatchInfoBuilder, CacheEngine, CacheSize},
-    HashSet, LoaderArgs, Repo, RllmEngine,
+    CacheSize, HashSet, LoaderArgs, Repo, RllmEngine,
 };
 use anyhow::{bail, Result};
 use safetensors::Dtype;
diff --git a/rllm-cuda/src/llm/mod.rs b/rllm-cuda/src/llm/mod.rs
index 762db8cd..c89c8547 100644
--- a/rllm-cuda/src/llm/mod.rs
+++ b/rllm-cuda/src/llm/mod.rs
@@ -7,17 +7,16 @@ pub mod refkernels;
 pub mod seqid;
 pub mod tmodel;
 pub mod util;
+pub mod paged;
 
 use self::config::ModelConfig;
-use crate::{
-    llm::util::{check_all_close, check_all_close_attn},
-    paged::BatchInfo,
-};
+use paged::BatchInfo;
 use std::rc::Rc;
 use tch::{
     nn::{self, Module, Path},
     IndexOp, Tensor,
 };
+use util::{check_all_close, check_all_close_attn};
 
 // note that this doesn't work for phi-2 - it seems particularly numerically unstable
 const CHECK: bool = false;
diff --git a/rllm-cuda/src/paged/batch_info.rs b/rllm-cuda/src/llm/paged/batch_info.rs
similarity index 99%
rename from rllm-cuda/src/paged/batch_info.rs
rename to rllm-cuda/src/llm/paged/batch_info.rs
index c8baa564..f8a6a36b 100644
--- a/rllm-cuda/src/paged/batch_info.rs
+++ b/rllm-cuda/src/llm/paged/batch_info.rs
@@ -1,10 +1,10 @@
-use super::{cache_engine::CacheEngine, scheduler::SchedulerOutputs};
+use super::cache_engine::CacheEngine;
 use crate::{
     config::RllmConfig,
     llm::{kernels::to_offsets, tmodel::TModel},
     seq::SchedulingPhase,
     util::pad_to_multiple,
-    HashMap,
+    HashMap, SchedulerOutputs,
 };
 use aicirt::api::Token;
 use std::{
diff --git a/rllm-cuda/src/paged/blocks.rs b/rllm-cuda/src/llm/paged/blocks.rs
similarity index 98%
rename from rllm-cuda/src/paged/blocks.rs
rename to rllm-cuda/src/llm/paged/blocks.rs
index 711b778a..e04fc906 100644
--- a/rllm-cuda/src/paged/blocks.rs
+++ b/rllm-cuda/src/llm/paged/blocks.rs
@@ -1,9 +1,9 @@
+use super::cache_engine::CacheEngine;
 use crate::{
     config::RllmConfig,
     llm::tmodel::TModel,
-    paged::{cache_engine::CacheEngine, scheduler::SchedulerOutputs, CacheSize},
     seq::{SchedulingPhase, Sequence, SequenceGroup},
-    BlockLocation, HashMap, TBlockSpaceManager,
+    BlockLocation, CacheSize, HashMap, SchedulerOutputs, TBlockSpaceManager,
 };
 use std::{
     sync::{Arc, Mutex},
diff --git a/rllm-cuda/src/paged/cache_engine.rs b/rllm-cuda/src/llm/paged/cache_engine.rs
similarity index 97%
rename from rllm-cuda/src/paged/cache_engine.rs
rename to rllm-cuda/src/llm/paged/cache_engine.rs
index e3607f0e..2ac2462c 100644
--- a/rllm-cuda/src/paged/cache_engine.rs
+++ b/rllm-cuda/src/llm/paged/cache_engine.rs
@@ -7,13 +7,11 @@ use tch::{Device, Tensor};
 use tch_cuda::{CudaEvent, CudaStream};
 
 use crate::{
-    config::RllmConfig,
-    llm::{config::TchRllmConfig, kernels, tmodel::TModel},
-    HashMap,
+    config::RllmConfig, llm::{config::TchRllmConfig, kernels, tmodel::TModel}, CacheSize, HashMap
 };
 use std::sync::Arc;
 
-use super::{CacheIface, CacheSize};
+use super::CacheIface;
 
 type KVCache = (Tensor, Tensor);
 
diff --git a/rllm-cuda/src/paged/cuda_stub.rs b/rllm-cuda/src/llm/paged/cuda_stub.rs
similarity index 100%
rename from rllm-cuda/src/paged/cuda_stub.rs
rename to rllm-cuda/src/llm/paged/cuda_stub.rs
diff --git a/rllm-cuda/src/llm/paged/mod.rs b/rllm-cuda/src/llm/paged/mod.rs
new file mode 100644
index 00000000..84f62a1f
--- /dev/null
+++ b/rllm-cuda/src/llm/paged/mod.rs
@@ -0,0 +1,10 @@
+#[cfg(not(feature = "cuda"))]
+mod cuda_stub;
+
+mod batch_info;
+mod blocks;
+mod cache_engine;
+
+pub use batch_info::*;
+pub use blocks::*;
+pub use cache_engine::*;
diff --git a/rllm-cuda/src/llm/phi.rs b/rllm-cuda/src/llm/phi.rs
index ea5f1e92..76a556e1 100644
--- a/rllm-cuda/src/llm/phi.rs
+++ b/rllm-cuda/src/llm/phi.rs
@@ -1,8 +1,9 @@
 use super::{
     config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig},
-    layer_norm, linear, varlen_attn, RotaryEmbedding,
+    layer_norm, linear,
+    paged::BatchInfo,
+    varlen_attn, RotaryEmbedding,
 };
-use crate::paged::BatchInfo;
 use serde::Deserialize;
 use std::rc::Rc;
 use tch::{
diff --git a/rllm-cuda/src/llm/refkernels.rs b/rllm-cuda/src/llm/refkernels.rs
index 93881591..2286f69f 100644
--- a/rllm-cuda/src/llm/refkernels.rs
+++ b/rllm-cuda/src/llm/refkernels.rs
@@ -1,4 +1,4 @@
-use crate::llm::util::{check_all_close_attn, to_vec1};
+use super::util::{check_all_close_attn, to_vec1};
 use crate::HashMap;
 use tch::{IndexOp, Kind, Tensor};
 
diff --git a/rllm-cuda/src/llm/tmodel.rs b/rllm-cuda/src/llm/tmodel.rs
index 5f3cfdc5..dd77276d 100644
--- a/rllm-cuda/src/llm/tmodel.rs
+++ b/rllm-cuda/src/llm/tmodel.rs
@@ -1,24 +1,20 @@
-use std::{sync::Arc, time::Instant};
-
-use aicirt::{with_timer, TimerRef};
-use anyhow::Result;
-use rand::distributions::Distribution as _;
-use tch::{Device, IndexOp, Tensor};
-
-use crate::{
-    config::RllmConfig,
-    llm::{loader::load_model_config, util::synchronize, DType},
-    paged::{
-        BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface, SchedulerOutputs,
-    },
-    AiciBias, LogitsProcessor, ModelExec, TensorOps,
-};
-
 use super::{
     config::{self, TchRllmConfig},
+    loader::load_model_config,
     loader::load_rllm_engine,
+    paged::{BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface},
     seqid::TchSeqMgr,
+    util::synchronize,
+    DType,
+};
+use crate::{
+    config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps,
 };
+use aicirt::{with_timer, TimerRef};
+use anyhow::Result;
+use rand::distributions::Distribution as _;
+use std::{sync::Arc, time::Instant};
+use tch::{Device, IndexOp, Tensor};
 
 pub trait TModelInner {
     fn forward(&self, batch_info: &mut BatchInfo) -> Tensor;
diff --git a/rllm-cuda/src/llm/util.rs b/rllm-cuda/src/llm/util.rs
index 488f9507..fe5ed830 100644
--- a/rllm-cuda/src/llm/util.rs
+++ b/rllm-cuda/src/llm/util.rs
@@ -1,4 +1,5 @@
-use crate::{llm::DType, util::get_setting};
+use super::DType;
+use crate::util::get_setting;
 use tch::{kind::Element, Device, IndexOp as _, Tensor};
 
 #[cfg(feature = "cuda")]
diff --git a/rllm-cuda/src/paged/mod.rs b/rllm-cuda/src/paged/mod.rs
deleted file mode 100644
index abe3e10e..00000000
--- a/rllm-cuda/src/paged/mod.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-mod scheduler;
-
-pub use scheduler::*;
-
-cfg_if::cfg_if! {
-    if #[cfg(feature = "tch")] {
-        #[cfg(not(feature = "cuda"))]
-        mod cuda_stub;
-
-        mod blocks;
-        mod cache_engine;
-        mod batch_info;
-
-        pub use batch_info::*;
-        pub use cache_engine::*;
-        pub use blocks::*;
-    }
-}
-
-pub struct CacheSize {
-    pub gpu: usize,
-    pub cpu: usize,
-}
diff --git a/rllm-cuda/src/paged/scheduler.rs b/rllm-cuda/src/scheduler.rs
similarity index 98%
rename from rllm-cuda/src/paged/scheduler.rs
rename to rllm-cuda/src/scheduler.rs
index 96a7cb1b..d135ac42 100644
--- a/rllm-cuda/src/paged/scheduler.rs
+++ b/rllm-cuda/src/scheduler.rs
@@ -1,5 +1,8 @@
 use crate::{
-    config::RllmConfig, paged::CacheSize, seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup}, util::limit_str, HashMap, ModelExec, SequenceManager, TBlockSpaceManager
+    config::RllmConfig,
+    seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup},
+    util::limit_str,
+    HashMap, ModelExec, SequenceManager, TBlockSpaceManager,
 };
 use aicirt::api::SequenceResult;
 use std::{
@@ -484,3 +487,8 @@ impl<ME: ModelExec> Scheduler<ME> {
         }
     }
 }
+
+pub struct CacheSize {
+    pub gpu: usize,
+    pub cpu: usize,
+}