Skip to content

Commit

Permalink
move paged into llm/
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Feb 6, 2024
1 parent 662fd23 commit b0254f8
Show file tree
Hide file tree
Showing 18 changed files with 69 additions and 74 deletions.
5 changes: 2 additions & 3 deletions rllm-cuda/src/engine.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use crate::{
config::{CacheConfig, ParallelConfig, RllmConfig, SamplingParams, SchedulerConfig},
iface::AiciRtIface,
paged::{CacheSize, Scheduler, SchedulerOutputs},
seq::{
AiciSampling, FinishReason, RequestOutput, SchedulingPhase, SeqOutput, Sequence,
SequenceGroup, Token, TokenUsage,
},
util::get_setting,
AiciBias as _, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SequenceManager,
TBlockSpaceManager as _, TensorOps,
AiciBias as _, CacheSize, HashMap, LoaderArgs, LogitsProcessor, ModelExec, Scheduler,
SchedulerOutputs, SequenceManager, TBlockSpaceManager as _, TensorOps,
};
use aici_abi::toktree::TokTrie;
use aicirt::{
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use anyhow::Result;

use crate::{
config::{ModelMeta, RllmConfig},
paged::{CacheSize, SchedulerOutputs},
scheduler::{CacheSize, SchedulerOutputs},
seq::{Sequence, SequenceGroup},
HashMap, LoaderArgs, LogitsProcessor, RllmEngine,
};
Expand Down
5 changes: 3 additions & 2 deletions rllm-cuda/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pub mod paged;
pub mod seq;

// vllm modules
Expand All @@ -8,19 +7,21 @@ mod exec;
mod expected;
pub mod iface;
mod logits;
mod scheduler;
pub mod server;
pub mod util;

use config::AiciConfig;
pub use engine::*;
pub use exec::*;
pub use logits::LogitsProcessor;
pub use scheduler::*;
use std::sync::atomic::AtomicBool;

cfg_if::cfg_if! {
if #[cfg(feature = "tch")] {
pub mod llm;
pub(crate) use paged::BlockRef;
pub(crate) use llm::paged::BlockRef;
} else {
pub mod llamacpp;
pub use llamacpp::BlockRef;
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/kernels.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#[cfg(not(feature = "cuda"))]
pub use crate::llm::refkernels::*;
pub use super::refkernels::*;
use tch::{Device, Tensor};
#[cfg(feature = "cuda")]
pub use tch_cuda::flash_attn_varlen as varlen_attn;
Expand Down
10 changes: 7 additions & 3 deletions rllm-cuda/src/llm/llama.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@

use super::{
config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig},
linear_no_bias, varlen_attn, RmsNorm, RotaryEmbedding,
linear_no_bias,
paged::BatchInfo,
varlen_attn, RmsNorm, RotaryEmbedding,
};
use crate::paged::BatchInfo;
use anyhow::Result;
use serde::Deserialize;
use std::rc::Rc;
use tch::{nn::{self, Module, Path}, Tensor};
use tch::{
nn::{self, Module, Path},
Tensor,
};

use super::tmodel::TModelInner;

Expand Down
17 changes: 9 additions & 8 deletions rllm-cuda/src/llm/loader.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
use super::{
config::ModelType,
llama,
paged::{BatchInfoBuilder, CacheEngine},
phi,
tmodel::TModel,
util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
};
use crate::{
config::{ModelMeta, RllmConfig},
llm::{
config::ModelType,
llama, phi,
tmodel::TModel,
util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
},
paged::{BatchInfoBuilder, CacheEngine, CacheSize},
HashSet, LoaderArgs, Repo, RllmEngine,
CacheSize, HashSet, LoaderArgs, Repo, RllmEngine,
};
use anyhow::{bail, Result};
use safetensors::Dtype;
Expand Down
7 changes: 3 additions & 4 deletions rllm-cuda/src/llm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@ pub mod refkernels;
pub mod seqid;
pub mod tmodel;
pub mod util;
pub mod paged;

use self::config::ModelConfig;
use crate::{
llm::util::{check_all_close, check_all_close_attn},
paged::BatchInfo,
};
use paged::BatchInfo;
use std::rc::Rc;
use tch::{
nn::{self, Module, Path},
IndexOp, Tensor,
};
use util::{check_all_close, check_all_close_attn};

// Compile-time debugging switch, disabled by default.
// NOTE(review): presumably gates the `check_all_close*` comparisons imported
// above — confirm against the use sites, which are not visible here.
// note that this doesn't work for phi-2 - it seems particularly numerically unstable
const CHECK: bool = false;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use super::{cache_engine::CacheEngine, scheduler::SchedulerOutputs};
use super::cache_engine::CacheEngine;
use crate::{
config::RllmConfig,
llm::{kernels::to_offsets, tmodel::TModel},
seq::SchedulingPhase,
util::pad_to_multiple,
HashMap,
HashMap, SchedulerOutputs,
};
use aicirt::api::Token;
use std::{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use super::cache_engine::CacheEngine;
use crate::{
config::RllmConfig,
llm::tmodel::TModel,
paged::{cache_engine::CacheEngine, scheduler::SchedulerOutputs, CacheSize},
seq::{SchedulingPhase, Sequence, SequenceGroup},
BlockLocation, HashMap, TBlockSpaceManager,
BlockLocation, CacheSize, HashMap, SchedulerOutputs, TBlockSpaceManager,
};
use std::{
sync::{Arc, Mutex},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@ use tch::{Device, Tensor};
use tch_cuda::{CudaEvent, CudaStream};

use crate::{
config::RllmConfig,
llm::{config::TchRllmConfig, kernels, tmodel::TModel},
HashMap,
config::RllmConfig, llm::{config::TchRllmConfig, kernels, tmodel::TModel}, CacheSize, HashMap
};
use std::sync::Arc;

use super::{CacheIface, CacheSize};
use super::CacheIface;

// A pair of tensors making up one cache entry.
// NOTE(review): presumably (key cache, value cache), in that order — confirm
// against the code that builds and indexes these tuples.
type KVCache = (Tensor, Tensor);

Expand Down
File renamed without changes.
10 changes: 10 additions & 0 deletions rllm-cuda/src/llm/paged/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Submodules of `llm::paged`. All of them are private; their public items are
// exposed through the glob re-exports at the bottom of this file.

// Compiled only when the `cuda` feature is disabled.
#[cfg(not(feature = "cuda"))]
mod cuda_stub;

mod batch_info;
mod blocks;
mod cache_engine;

// Flatten the submodules so callers can write `paged::Item` without naming
// the submodule that defines it.
pub use batch_info::*;
pub use blocks::*;
pub use cache_engine::*;
5 changes: 3 additions & 2 deletions rllm-cuda/src/llm/phi.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use super::{
config::{CommonModelConfig, ModelConfig, ModelType, RllmModelConfig},
layer_norm, linear, varlen_attn, RotaryEmbedding,
layer_norm, linear,
paged::BatchInfo,
varlen_attn, RotaryEmbedding,
};
use crate::paged::BatchInfo;
use serde::Deserialize;
use std::rc::Rc;
use tch::{
Expand Down
2 changes: 1 addition & 1 deletion rllm-cuda/src/llm/refkernels.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::llm::util::{check_all_close_attn, to_vec1};
use super::util::{check_all_close_attn, to_vec1};
use crate::HashMap;
use tch::{IndexOp, Kind, Tensor};

Expand Down
28 changes: 12 additions & 16 deletions rllm-cuda/src/llm/tmodel.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,20 @@
use std::{sync::Arc, time::Instant};

use aicirt::{with_timer, TimerRef};
use anyhow::Result;
use rand::distributions::Distribution as _;
use tch::{Device, IndexOp, Tensor};

use crate::{
config::RllmConfig,
llm::{loader::load_model_config, util::synchronize, DType},
paged::{
BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface, SchedulerOutputs,
},
AiciBias, LogitsProcessor, ModelExec, TensorOps,
};

use super::{
config::{self, TchRllmConfig},
loader::load_model_config,
loader::load_rllm_engine,
paged::{BatchInfo, BatchInfoBuilder, BlockSpaceManager, CacheEngine, CacheIface},
seqid::TchSeqMgr,
util::synchronize,
DType,
};
use crate::{
config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps,
};
use aicirt::{with_timer, TimerRef};
use anyhow::Result;
use rand::distributions::Distribution as _;
use std::{sync::Arc, time::Instant};
use tch::{Device, IndexOp, Tensor};

pub trait TModelInner {
fn forward(&self, batch_info: &mut BatchInfo) -> Tensor;
Expand Down
3 changes: 2 additions & 1 deletion rllm-cuda/src/llm/util.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::{llm::DType, util::get_setting};
use super::DType;
use crate::util::get_setting;
use tch::{kind::Element, Device, IndexOp as _, Tensor};

#[cfg(feature = "cuda")]
Expand Down
23 changes: 0 additions & 23 deletions rllm-cuda/src/paged/mod.rs

This file was deleted.

10 changes: 9 additions & 1 deletion rllm-cuda/src/paged/scheduler.rs → rllm-cuda/src/scheduler.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use crate::{
config::RllmConfig, paged::CacheSize, seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup}, util::limit_str, HashMap, ModelExec, SequenceManager, TBlockSpaceManager
config::RllmConfig,
seq::{FinishReason, SchedulingPhase, Sequence, SequenceGroup},
util::limit_str,
HashMap, ModelExec, SequenceManager, TBlockSpaceManager,
};
use aicirt::api::SequenceResult;
use std::{
Expand Down Expand Up @@ -484,3 +487,8 @@ impl<ME: ModelExec> Scheduler<ME> {
}
}
}

/// Cache capacity split by where the cache lives.
///
/// Plain data: both fields are `usize`, so the type is cheap to copy and
/// compare. The derives make it printable in logs and usable in assertions.
// NOTE(review): the unit of both fields is not visible here — presumably a
// number of cache blocks rather than bytes; confirm against the code that
// constructs this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CacheSize {
    /// Capacity of the GPU-side cache.
    pub gpu: usize,
    /// Capacity of the CPU-side cache.
    pub cpu: usize,
}

0 comments on commit b0254f8

Please sign in to comment.