Commit

feature-containment
mmoskal committed Feb 7, 2024
1 parent 832abbd commit f2e22d7
Showing 9 changed files with 12 additions and 33 deletions.
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

5 changes: 0 additions & 5 deletions rllm-cuda/Cargo.toml
@@ -44,11 +44,6 @@ percent-encoding = "2.3.1"
 name = "rllm-server"
 path = "src/driver.rs"
 
-[build-dependencies]
-anyhow = { version = "1", features = ["backtrace"] }
-num_cpus = "1.15.0"
-rayon = "1.7.0"
-
 [features]
 #default = ["llamacpp"]
 default = ["tch", "cuda"]
6 changes: 5 additions & 1 deletion rllm-cuda/src/driver.rs
@@ -18,6 +18,10 @@ pub struct DriverArgs {
     /// Specify which type to use in the model (bf16, f16, f32)
     #[arg(long, default_value = "", help_heading = "Model")]
     pub dtype: String,
+
+    /// Enable nvprof profiling for given engine step (if available)
+    #[arg(long, default_value_t = 0, help_heading = "Development")]
+    pub profile_step: usize,
 }
 
 #[actix_web::main]
@@ -44,6 +48,6 @@ async fn main() -> () {
         _ => panic!("invalid dtype; try one of bf16, f16, f32"),
     };
 
-    let model_args = TchLoaderArgs { device, dtype };
+    let model_args = TchLoaderArgs { device, dtype, profile_step_no: args.profile_step };
     rllm::server::server_main::<TModel>(args.args, model_args).await;
 }
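
Taken together with the config and loader changes below, the profiling step now rides along in the loader args at construction time, replacing the engine field the server used to patch after load (removed from engine.rs and server/mod.rs further down). A minimal runnable sketch of that flow, with clap parsing, `device`, and `dtype` elided and a hard-coded stand-in for `--profile-step`:

    // Hypothetical, trimmed-down mirror of the plumbing in this commit.
    struct TchLoaderArgs {
        // 0 (the clap default) effectively means "never profile",
        // since engine step numbers start at 1.
        profile_step_no: usize,
    }

    struct ModelConfig {
        profile_step_no: usize,
    }

    fn load_model_config(model_args: &TchLoaderArgs) -> ModelConfig {
        // mirrors loader.rs below: copy the requested step into the model config
        ModelConfig { profile_step_no: model_args.profile_step_no }
    }

    fn main() {
        let profile_step = 10; // stand-in for `--profile-step 10` on the CLI
        let model_args = TchLoaderArgs { profile_step_no: profile_step };
        assert_eq!(load_model_config(&model_args).profile_step_no, 10);
    }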
12 changes: 0 additions & 12 deletions rllm-cuda/src/engine.rs
@@ -127,7 +127,6 @@ pub struct RllmEngine<ME: ModelExec> {
     pub model_id: String,
     pub tmodel: ME,
     pub(crate) step_no: usize,
-    pub profile_step_no: usize,
     req_id_cnt: usize,
     #[allow(dead_code)]
     pub alt: usize,
@@ -226,7 +225,6 @@ impl<ME: ModelExec> RllmEngine<ME> {
             seq_mgr: tmodel.sequence_manager(),
             tmodel,
             step_no: 0,
-            profile_step_no: 0,
             req_id_cnt: 0,
             num_errors: 0,
             eos_token_id,
@@ -858,11 +856,6 @@ impl<ME: ModelExec> RllmEngine<ME> {
     fn step_inner(&mut self) -> Result<Vec<RequestOutput>> {
         self.step_no += 1;
 
-        #[cfg(feature = "cuda")]
-        if self.step_no == self.profile_step_no {
-            cudarc::driver::safe::profiler_start()?;
-        }
-
         let post_ops = std::mem::take(&mut self.post_ops);
         with_timer!(self.tim_aici_post, self.aici_post_pre(post_ops)?);
 
@@ -887,11 +880,6 @@ impl<ME: ModelExec> RllmEngine<ME> {
         // we run step_finished() regardless if model failed
         self.scheduler.step_finished(sched_out);
 
-        #[cfg(feature = "cuda")]
-        if self.step_no == self.profile_step_no {
-            cudarc::driver::safe::profiler_stop()?;
-        }
-
         let (outputs, post_ops) = outputs?;
         if outputs.is_empty() {
             assert!(!self.scheduler.has_unfinished_seqs());
2 changes: 0 additions & 2 deletions rllm-cuda/src/llamacpp/tmodel.rs
@@ -21,7 +21,6 @@ pub struct TModel {
     seq_id_to_idx: HashMap<usize, usize>,
     t0: Instant,
     step_no: usize,
-    pub nv_profile: bool,
 }
 
 pub struct CppLoaderArgs {
@@ -196,7 +195,6 @@ impl TModel {
         Self {
             model,
             batch,
-            nv_profile: false,
             seq_id_to_idx: HashMap::default(),
             step_no: 0,
             seq_mgr,
1 change: 1 addition & 0 deletions rllm-cuda/src/llm/config.rs
@@ -87,6 +87,7 @@ pub struct ModelConfig {
 
     pub device: Device,
     pub dtype: DType,
+    pub profile_step_no: usize,
 }
 
 impl ModelConfig {
1 change: 1 addition & 0 deletions rllm-cuda/src/llm/loader.rs
@@ -254,6 +254,7 @@ pub(super) fn load_model_config(
         Some(mut v) => {
             let tok = aicirt::bintokens::find_tokenizer(&args.tokenizer)?;
             v.meta.tok_vocab_size = tok.tokrx_info().vocab_size as usize;
+            v.profile_step_no = model_args.profile_step_no;
             Ok(v)
         }
         None => bail!("failed to load model config:\n{}", err),
5 changes: 5 additions & 0 deletions rllm-cuda/src/llm/tmodel.rs
@@ -31,6 +31,7 @@ pub struct TModel {
 }
 
 pub struct TchLoaderArgs {
+    pub profile_step_no: usize,
     pub device: Device,
     pub dtype: Option<DType>,
 }
@@ -73,6 +74,10 @@ impl ModelExec for TModel {
         step_no: usize,
         sched_out: &mut SchedulerOutputs,
     ) -> Result<()> {
+        if step_no == self.config.model.profile_step_no {
+            self.nv_profile = true;
+        }
+
         let mut info = BatchInfoBuilder::new(self.config.clone())
             .sched_out(sched_out, self.seq_mgr.get_gpu_allocator())
             .finish(step_no, self.cache_iface(sched_out));
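
The hunk above only arms `nv_profile`; where the capture actually starts and stops is not shown in this diff. A sketch under the assumption that the backend mirrors the `cudarc` profiler bracketing removed from engine.rs earlier, with `forward` as a stand-in for the real model invocation:

    use anyhow::Result;

    struct TModel {
        nv_profile: bool,
        profile_step_no: usize, // copied from ModelConfig in the real code
    }

    impl TModel {
        fn run(&mut self, step_no: usize) -> Result<()> {
            if step_no == self.profile_step_no {
                self.nv_profile = true; // arm capture for this one step
            }
            #[cfg(feature = "cuda")]
            if self.nv_profile {
                // same call the engine made before this commit
                cudarc::driver::safe::profiler_start()?;
            }
            let out = self.forward(); // stand-in for the real model step
            #[cfg(feature = "cuda")]
            if self.nv_profile {
                cudarc::driver::safe::profiler_stop()?;
                self.nv_profile = false; // profile a single step only
            }
            out
        }

        fn forward(&mut self) -> Result<()> {
            Ok(())
        }
    }

The `cudaProfilerStart`/`cudaProfilerStop` markers this emits are what tools like `nsys profile --capture-range=cudaProfilerApi` (or nvprof with `--profile-from-start off`) key off, so only the chosen step lands in the trace.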
11 changes: 0 additions & 11 deletions rllm-cuda/src/server/mod.rs
@@ -182,11 +182,6 @@ pub struct RllmCliArgs {
     #[arg(long, help_heading = "AICI settings")]
     pub shm_prefix: Option<String>,
 
-    // TODO: #[cfg(feature = "cuda")] -> causes rust-analyzer error
-    /// Enable nvprof profiling for given engine step (if available)
-    #[arg(long, default_value_t = 0, help_heading = "Development")]
-    pub profile_step: usize,
-
     /// Specify test-cases (expected/*/*.safetensors)
     #[arg(long, help_heading = "Development")]
     pub test: Vec<String>,
@@ -461,19 +456,13 @@ fn spawn_inference_loop<ME: ModelExec>(
     let handle_res = Arc::new(Mutex::new(handle));
     let handle = handle_res.clone();
 
-    // prep for move
-    #[cfg(feature = "cuda")]
-    let profile_step = args.profile_step;
-    #[cfg(not(feature = "cuda"))]
-    let profile_step = 0;
     let warmup = args.warmup.clone();
     let warmup_only = args.warmup_only.clone();
 
     std::thread::spawn(move || {
         set_max_priority();
         let mut engine =
             ME::load_rllm_engine(loader_args, model_args).expect("failed to load model");
-        engine.profile_step_no = profile_step;
         engine.set_aicirt(iface);
         let wid = "warmup".to_string();
         match warmup {
