Commit

feature-containment
mmoskal committed Feb 7, 2024
1 parent 832abbd commit f2e22d7
Showing 9 changed files with 12 additions and 33 deletions.
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

5 changes: 0 additions & 5 deletions rllm-cuda/Cargo.toml
@@ -44,11 +44,6 @@ percent-encoding = "2.3.1"
 name = "rllm-server"
 path = "src/driver.rs"
 
-[build-dependencies]
-anyhow = { version = "1", features = ["backtrace"] }
-num_cpus = "1.15.0"
-rayon = "1.7.0"
-
 [features]
 #default = ["llamacpp"]
 default = ["tch", "cuda"]
6 changes: 5 additions & 1 deletion rllm-cuda/src/driver.rs
@@ -18,6 +18,10 @@ pub struct DriverArgs {
     /// Specify which type to use in the model (bf16, f16, f32)
     #[arg(long, default_value = "", help_heading = "Model")]
     pub dtype: String,
+
+    /// Enable nvprof profiling for given engine step (if available)
+    #[arg(long, default_value_t = 0, help_heading = "Development")]
+    pub profile_step: usize,
 }
 
 #[actix_web::main]
@@ -44,6 +48,6 @@ async fn main() -> () {
         _ => panic!("invalid dtype; try one of bf16, f16, f32"),
     };
 
-    let model_args = TchLoaderArgs { device, dtype };
+    let model_args = TchLoaderArgs { device, dtype, profile_step_no: args.profile_step };
     rllm::server::server_main::<TModel>(args.args, model_args).await;
 }
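
Taken together with the config and loader changes below, the profiling step now rides along in the loader args at construction time, replacing the engine field the server used to patch after load (removed from engine.rs and server/mod.rs further down). A minimal runnable sketch of that flow, with clap parsing, `device`, and `dtype` elided and a hard-coded stand-in for `--profile-step`:

    // Hypothetical, trimmed-down mirror of the plumbing in this commit.
    struct TchLoaderArgs {
        // 0 (the clap default) effectively means "never profile",
        // since engine step numbers start at 1.
        profile_step_no: usize,
    }

    struct ModelConfig {
        profile_step_no: usize,
    }

    fn load_model_config(model_args: &TchLoaderArgs) -> ModelConfig {
        // mirrors loader.rs below: copy the requested step into the model config
        ModelConfig { profile_step_no: model_args.profile_step_no }
    }

    fn main() {
        let profile_step = 10; // stand-in for `--profile-step 10` on the CLI
        let model_args = TchLoaderArgs { profile_step_no: profile_step };
        assert_eq!(load_model_config(&model_args).profile_step_no, 10);
    }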
12 changes: 0 additions & 12 deletions rllm-cuda/src/engine.rs
@@ -127,7 +127,6 @@ pub struct RllmEngine<ME: ModelExec> {
     pub model_id: String,
     pub tmodel: ME,
     pub(crate) step_no: usize,
-    pub profile_step_no: usize,
     req_id_cnt: usize,
     #[allow(dead_code)]
     pub alt: usize,
@@ -226,7 +225,6 @@ impl<ME: ModelExec> RllmEngine<ME> {
             seq_mgr: tmodel.sequence_manager(),
             tmodel,
             step_no: 0,
-            profile_step_no: 0,
             req_id_cnt: 0,
             num_errors: 0,
             eos_token_id,
@@ -858,11 +856,6 @@ impl<ME: ModelExec> RllmEngine<ME> {
     fn step_inner(&mut self) -> Result<Vec<RequestOutput>> {
         self.step_no += 1;
 
-        #[cfg(feature = "cuda")]
-        if self.step_no == self.profile_step_no {
-            cudarc::driver::safe::profiler_start()?;
-        }
-
         let post_ops = std::mem::take(&mut self.post_ops);
         with_timer!(self.tim_aici_post, self.aici_post_pre(post_ops)?);
 
@@ -887,11 +880,6 @@ impl<ME: ModelExec> RllmEngine<ME> {
         // we run step_finished() regardless if model failed
         self.scheduler.step_finished(sched_out);
 
-        #[cfg(feature = "cuda")]
-        if self.step_no == self.profile_step_no {
-            cudarc::driver::safe::profiler_stop()?;
-        }
-
         let (outputs, post_ops) = outputs?;
         if outputs.is_empty() {
             assert!(!self.scheduler.has_unfinished_seqs());
2 changes: 0 additions & 2 deletions rllm-cuda/src/llamacpp/tmodel.rs
@@ -21,7 +21,6 @@ pub struct TModel {
     seq_id_to_idx: HashMap<usize, usize>,
     t0: Instant,
     step_no: usize,
-    pub nv_profile: bool,
 }
 
 pub struct CppLoaderArgs {
@@ -196,7 +195,6 @@ impl TModel {
         Self {
             model,
             batch,
-            nv_profile: false,
             seq_id_to_idx: HashMap::default(),
             step_no: 0,
             seq_mgr,
1 change: 1 addition & 0 deletions rllm-cuda/src/llm/config.rs
@@ -87,6 +87,7 @@ pub struct ModelConfig {
 
     pub device: Device,
     pub dtype: DType,
+    pub profile_step_no: usize,
 }
 
 impl ModelConfig {
1 change: 1 addition & 0 deletions rllm-cuda/src/llm/loader.rs
@@ -254,6 +254,7 @@ pub(super) fn load_model_config(
         Some(mut v) => {
             let tok = aicirt::bintokens::find_tokenizer(&args.tokenizer)?;
             v.meta.tok_vocab_size = tok.tokrx_info().vocab_size as usize;
+            v.profile_step_no = model_args.profile_step_no;
             Ok(v)
         }
         None => bail!("failed to load model config:\n{}", err),
5 changes: 5 additions & 0 deletions rllm-cuda/src/llm/tmodel.rs
@@ -31,6 +31,7 @@ pub struct TModel {
 }
 
 pub struct TchLoaderArgs {
+    pub profile_step_no: usize,
     pub device: Device,
     pub dtype: Option<DType>,
 }
@@ -73,6 +74,10 @@ impl ModelExec for TModel {
         step_no: usize,
         sched_out: &mut SchedulerOutputs,
     ) -> Result<()> {
+        if step_no == self.config.model.profile_step_no {
+            self.nv_profile = true;
+        }
+
         let mut info = BatchInfoBuilder::new(self.config.clone())
             .sched_out(sched_out, self.seq_mgr.get_gpu_allocator())
             .finish(step_no, self.cache_iface(sched_out));
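
The hunk above only arms `nv_profile`; where the capture actually starts and stops is not shown in this diff. A sketch under the assumption that the backend mirrors the `cudarc` profiler bracketing removed from engine.rs earlier, with `forward` as a stand-in for the real model invocation:

    use anyhow::Result;

    struct TModel {
        nv_profile: bool,
        profile_step_no: usize, // copied from ModelConfig in the real code
    }

    impl TModel {
        fn run(&mut self, step_no: usize) -> Result<()> {
            if step_no == self.profile_step_no {
                self.nv_profile = true; // arm capture for this one step
            }
            #[cfg(feature = "cuda")]
            if self.nv_profile {
                // same call the engine made before this commit
                cudarc::driver::safe::profiler_start()?;
            }
            let out = self.forward(); // stand-in for the real model step
            #[cfg(feature = "cuda")]
            if self.nv_profile {
                cudarc::driver::safe::profiler_stop()?;
                self.nv_profile = false; // profile a single step only
            }
            out
        }

        fn forward(&mut self) -> Result<()> {
            Ok(())
        }
    }

The `cudaProfilerStart`/`cudaProfilerStop` markers this emits are what tools like `nsys profile --capture-range=cudaProfilerApi` (or nvprof with `--profile-from-start off`) key off, so only the chosen step lands in the trace.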
11 changes: 0 additions & 11 deletions rllm-cuda/src/server/mod.rs
@@ -182,11 +182,6 @@ pub struct RllmCliArgs {
     #[arg(long, help_heading = "AICI settings")]
     pub shm_prefix: Option<String>,
 
-    // TODO: #[cfg(feature = "cuda")] -> causes rust-analyzer error
-    /// Enable nvprof profiling for given engine step (if available)
-    #[arg(long, default_value_t = 0, help_heading = "Development")]
-    pub profile_step: usize,
-
     /// Specify test-cases (expected/*/*.safetensors)
     #[arg(long, help_heading = "Development")]
     pub test: Vec<String>,
@@ -461,19 +456,13 @@ fn spawn_inference_loop<ME: ModelExec>(
     let handle_res = Arc::new(Mutex::new(handle));
     let handle = handle_res.clone();
 
-    // prep for move
-    #[cfg(feature = "cuda")]
-    let profile_step = args.profile_step;
-    #[cfg(not(feature = "cuda"))]
-    let profile_step = 0;
     let warmup = args.warmup.clone();
     let warmup_only = args.warmup_only.clone();
 
     std::thread::spawn(move || {
         set_max_priority();
         let mut engine =
             ME::load_rllm_engine(loader_args, model_args).expect("failed to load model");
-        engine.profile_step_no = profile_step;
         engine.set_aicirt(iface);
         let wid = "warmup".to_string();
         match warmup {
