Limit usage of `llm::` paths (use relative `super::` imports instead of `crate::llm::`)

microsoft · Feb 6, 2024 · e8e32d1 · e8e32d1
1 parent b0254f8
commit e8e32d1
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 18 deletions.
diff --git a/rllm-cuda/src/lib.rs b/rllm-cuda/src/lib.rs
@@ -25,10 +25,6 @@ cfg_if::cfg_if! {
     } else {
         pub mod llamacpp;
         pub use llamacpp::BlockRef;
-        // pub use llamacpp as llm;
-        // pub use llm::{Device, DType, Tensor};
-        // pub(crate) use llamacpp::BlockRef;
-        // pub(crate) use llamacpp::blocks::CppBlockSpaceManager;
     }
 }
 

diff --git a/rllm-cuda/src/llm/paged/batch_info.rs b/rllm-cuda/src/llm/paged/batch_info.rs
@@ -1,10 +1,7 @@
+use super::super::{kernels::to_offsets, tmodel::TModel};
 use super::cache_engine::CacheEngine;
 use crate::{
-    config::RllmConfig,
-    llm::{kernels::to_offsets, tmodel::TModel},
-    seq::SchedulingPhase,
-    util::pad_to_multiple,
-    HashMap, SchedulerOutputs,
+    config::RllmConfig, seq::SchedulingPhase, util::pad_to_multiple, HashMap, SchedulerOutputs,
 };
 use aicirt::api::Token;
 use std::{

diff --git a/rllm-cuda/src/llm/paged/blocks.rs b/rllm-cuda/src/llm/paged/blocks.rs
@@ -1,7 +1,7 @@
+use super::super::tmodel::TModel;
 use super::cache_engine::CacheEngine;
 use crate::{
     config::RllmConfig,
-    llm::tmodel::TModel,
     seq::{SchedulingPhase, Sequence, SequenceGroup},
     BlockLocation, CacheSize, HashMap, SchedulerOutputs, TBlockSpaceManager,
 };

diff --git a/rllm-cuda/src/llm/paged/cache_engine.rs b/rllm-cuda/src/llm/paged/cache_engine.rs
@@ -1,18 +1,16 @@
 // based on https://github.com/vllm-project/vllm/blob/b9fe4616f98b77b4b9458bce203aa6544cb31ef2/vllm/worker/cache_engine.py
 
+use super::super::{config::TchRllmConfig, kernels, tmodel::TModel};
+use super::CacheIface;
+use crate::{config::RllmConfig, CacheSize, HashMap};
+use std::sync::Arc;
+use tch::{Device, Tensor};
+
 #[cfg(not(feature = "cuda"))]
 use super::cuda_stub::{CudaEvent, CudaStream};
-use tch::{Device, Tensor};
 #[cfg(feature = "cuda")]
 use tch_cuda::{CudaEvent, CudaStream};
 
-use crate::{
-    config::RllmConfig, llm::{config::TchRllmConfig, kernels, tmodel::TModel}, CacheSize, HashMap
-};
-use std::sync::Arc;
-
-use super::CacheIface;
-
 type KVCache = (Tensor, Tensor);
 
 pub struct CacheEngine {