From 469cf646cb140f589dda932ae2690221c4ccee45 Mon Sep 17 00:00:00 2001
From: Michal Moskal <michal@moskal.me>
Date: Wed, 7 Feb 2024 21:14:11 +0000
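Subject: [PATCH] split cpp/cuda code into folders

Move the backend-independent code (config, engine, scheduler, seq, server,
util, ...) out of rllm-cuda into a new rllm-lib crate, which keeps the
package name `rllm`. The llama.cpp backend (src/llamacpp) moves into
rllm-cpp, while the tch backend (src/llm) stays in rllm-cuda, whose package
is renamed to `rllm-cuda`. Both backend crates now depend on rllm-lib and
declare their backend module in their own binary, replacing the
`tch`/`llamacpp` feature switch that previously selected the module in
lib.rs. `Sequence::num_kv_computed` becomes `pub` as part of the move.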
---
 Cargo.lock                                    | 42 ++++++++++++++++---
 Cargo.toml                                    |  1 +
 rllm-cpp/Cargo.toml                           |  3 +-
 .../src/llamacpp/blocks.rs                    |  2 +-
 .../src/llamacpp/loader.rs                    |  2 +-
 {rllm-cuda => rllm-cpp}/src/llamacpp/mod.rs   |  2 +-
 {rllm-cuda => rllm-cpp}/src/llamacpp/seqid.rs |  2 +-
 .../src/llamacpp/tmodel.rs                    |  4 +-
 rllm-cpp/src/rllm-cpp.rs                      |  7 ++--
 rllm-cuda/Cargo.toml                          | 18 ++++----
 rllm-cuda/SHIP_TODO.md                        |  6 ---
 rllm-cuda/src/llm/config.rs                   |  2 +-
 rllm-cuda/src/llm/loader.rs                   |  2 +-
 rllm-cuda/src/llm/paged/batch_info.rs         |  2 +-
 rllm-cuda/src/llm/paged/blocks.rs             |  2 +-
 rllm-cuda/src/llm/paged/cache_engine.rs       |  2 +-
 rllm-cuda/src/llm/refkernels.rs               |  2 +-
 rllm-cuda/src/llm/tmodel.rs                   | 10 ++---
 rllm-cuda/src/llm/util.rs                     |  2 +-
 rllm-cuda/src/{driver.rs => rllm-cuda.rs}     | 18 ++++----
 rllm-lib/Cargo.toml                           | 32 ++++++++++++++
 {rllm-cuda => rllm-lib}/src/config.rs         |  0
 {rllm-cuda => rllm-lib}/src/engine.rs         |  0
 {rllm-cuda => rllm-lib}/src/exec.rs           |  0
 {rllm-cuda => rllm-lib}/src/expected.rs       |  0
 {rllm-cuda => rllm-lib}/src/iface.rs          |  0
 {rllm-cuda => rllm-lib}/src/lib.rs            |  8 ----
 {rllm-cuda => rllm-lib}/src/logits.rs         |  0
 {rllm-cuda => rllm-lib}/src/scheduler.rs      |  0
 {rllm-cuda => rllm-lib}/src/seq.rs            |  2 +-
 {rllm-cuda => rllm-lib}/src/server/api.rs     |  0
 .../src/server/completion.rs                  |  0
 {rllm-cuda => rllm-lib}/src/server/mod.rs     |  0
 .../src/server/openai/LICENSE                 |  0
 .../src/server/openai/mod.rs                  |  0
 .../src/server/openai/requests.rs             |  0
 .../src/server/openai/responses.rs            |  0
 {rllm-cuda => rllm-lib}/src/util.rs           |  0
 38 files changed, 112 insertions(+), 61 deletions(-)
 rename {rllm-cuda => rllm-cpp}/src/llamacpp/blocks.rs (98%)
 rename {rllm-cuda => rllm-cpp}/src/llamacpp/loader.rs (97%)
 rename {rllm-cuda => rllm-cpp}/src/llamacpp/mod.rs (96%)
 rename {rllm-cuda => rllm-cpp}/src/llamacpp/seqid.rs (96%)
 rename {rllm-cuda => rllm-cpp}/src/llamacpp/tmodel.rs (99%)
 delete mode 100644 rllm-cuda/SHIP_TODO.md
 rename rllm-cuda/src/{driver.rs => rllm-cuda.rs} (85%)
 create mode 100644 rllm-lib/Cargo.toml
 rename {rllm-cuda => rllm-lib}/src/config.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/engine.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/exec.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/expected.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/iface.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/lib.rs (92%)
 rename {rllm-cuda => rllm-lib}/src/logits.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/scheduler.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/seq.rs (99%)
 rename {rllm-cuda => rllm-lib}/src/server/api.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/server/completion.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/server/mod.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/server/openai/LICENSE (100%)
 rename {rllm-cuda => rllm-lib}/src/server/openai/mod.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/server/openai/requests.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/server/openai/responses.rs (100%)
 rename {rllm-cuda => rllm-lib}/src/util.rs (100%)

diff --git a/Cargo.lock b/Cargo.lock
index cbd71781..22360fd1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2945,7 +2945,6 @@ dependencies = [
  "base64 0.21.5",
  "cfg-if",
  "clap",
- "cudarc",
  "derive_more",
  "futures",
  "fxhash",
@@ -2954,7 +2953,6 @@ dependencies = [
  "indicatif",
  "lazy_static",
  "libc",
- "llama_cpp_low",
  "log",
  "memmap2",
  "percent-encoding",
@@ -2962,11 +2960,8 @@ dependencies = [
  "safetensors 0.4.1",
  "serde",
  "serde_json",
- "tch",
- "tch-cuda",
  "tokenizers",
  "tokio",
- "torch-sys",
  "uuid",
 ]
 
@@ -2975,11 +2970,48 @@ name = "rllm-cpp"
 version = "0.1.0"
 dependencies = [
  "actix-web",
+ "anyhow",
  "clap",
  "llama_cpp_low",
  "rllm",
 ]
 
+[[package]]
+name = "rllm-cuda"
+version = "0.1.0"
+dependencies = [
+ "actix-web",
+ "aici_abi",
+ "aicirt",
+ "anyhow",
+ "base64 0.21.5",
+ "cfg-if",
+ "clap",
+ "cudarc",
+ "derive_more",
+ "futures",
+ "fxhash",
+ "half 2.3.1",
+ "hf-hub",
+ "indicatif",
+ "lazy_static",
+ "libc",
+ "log",
+ "memmap2",
+ "percent-encoding",
+ "rand",
+ "rllm",
+ "safetensors 0.4.1",
+ "serde",
+ "serde_json",
+ "tch",
+ "tch-cuda",
+ "tokenizers",
+ "tokio",
+ "torch-sys",
+ "uuid",
+]
+
 [[package]]
 name = "rquickjs"
 version = "0.4.0"
diff --git a/Cargo.toml b/Cargo.toml
index 8617a353..41fc0335 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,7 @@ members = [
     "pyctrl",
     "jsctrl",
     "uppercase",
+    "rllm-lib",
     "rllm-cuda",
     "rllm-cpp",
     "tch-cuda",
diff --git a/rllm-cpp/Cargo.toml b/rllm-cpp/Cargo.toml
index 61fbe6a9..4fb7bcb4 100644
--- a/rllm-cpp/Cargo.toml
+++ b/rllm-cpp/Cargo.toml
@@ -6,9 +6,10 @@ rust-version = "1.75.0"
 
 [dependencies]
 actix-web = "4.4.0"
+anyhow = "1.0.79"
 clap = { version = "4.4.18", features = ["derive"] }
 llama_cpp_low = { path = "../llama-cpp-low" }
-rllm = { path = "../rllm-cuda", default-features = false, features = ["llamacpp"] }
+rllm = { path = "../rllm-lib" }
 
 [[bin]]
 name = "rllm-cpp"
diff --git a/rllm-cuda/src/llamacpp/blocks.rs b/rllm-cpp/src/llamacpp/blocks.rs
similarity index 98%
rename from rllm-cuda/src/llamacpp/blocks.rs
rename to rllm-cpp/src/llamacpp/blocks.rs
index 4e4fdc09..47d7885c 100644
--- a/rllm-cuda/src/llamacpp/blocks.rs
+++ b/rllm-cpp/src/llamacpp/blocks.rs
@@ -1,4 +1,4 @@
-use crate::{
+use rllm::{
     seq::{Sequence, SequenceGroup},
     SchedulerOutputs, TBlockSpaceManager,
 };
diff --git a/rllm-cuda/src/llamacpp/loader.rs b/rllm-cpp/src/llamacpp/loader.rs
similarity index 97%
rename from rllm-cuda/src/llamacpp/loader.rs
rename to rllm-cpp/src/llamacpp/loader.rs
index 93e9e8bf..9d6d9c86 100644
--- a/rllm-cuda/src/llamacpp/loader.rs
+++ b/rllm-cpp/src/llamacpp/loader.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use crate::{config::ModelMeta, LoaderArgs, Repo, RllmEngine};
+use rllm::{config::ModelMeta, LoaderArgs, Repo, RllmEngine};
 use anyhow::{bail, Result};
 
 use llama_cpp_low as cpp;
diff --git a/rllm-cuda/src/llamacpp/mod.rs b/rllm-cpp/src/llamacpp/mod.rs
similarity index 96%
rename from rllm-cuda/src/llamacpp/mod.rs
rename to rllm-cpp/src/llamacpp/mod.rs
index 2360fc1f..692922a6 100644
--- a/rllm-cuda/src/llamacpp/mod.rs
+++ b/rllm-cpp/src/llamacpp/mod.rs
@@ -1,4 +1,4 @@
-use crate::TensorOps;
+use rllm::TensorOps;
 
 pub mod blocks;
 pub mod loader;
diff --git a/rllm-cuda/src/llamacpp/seqid.rs b/rllm-cpp/src/llamacpp/seqid.rs
similarity index 96%
rename from rllm-cuda/src/llamacpp/seqid.rs
rename to rllm-cpp/src/llamacpp/seqid.rs
index e0f16f86..6fd23984 100644
--- a/rllm-cuda/src/llamacpp/seqid.rs
+++ b/rllm-cpp/src/llamacpp/seqid.rs
@@ -1,6 +1,6 @@
 use std::sync::Mutex;
 
-use crate::{HashMap, SeqId, SequenceManager};
+use rllm::{HashMap, SeqId, SequenceManager};
 use llama_cpp_low as cpp;
 
 pub struct CppSequenceManager {
diff --git a/rllm-cuda/src/llamacpp/tmodel.rs b/rllm-cpp/src/llamacpp/tmodel.rs
similarity index 99%
rename from rllm-cuda/src/llamacpp/tmodel.rs
rename to rllm-cpp/src/llamacpp/tmodel.rs
index 73d78a50..9173ab85 100644
--- a/rllm-cuda/src/llamacpp/tmodel.rs
+++ b/rllm-cpp/src/llamacpp/tmodel.rs
@@ -1,4 +1,4 @@
-use crate::{
+use rllm::{
     config::{ModelMeta, RllmConfig}, seq::SchedulingPhase, AiciBias, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps
 };
 use aicirt::{with_timer, TimerRef};
@@ -179,7 +179,7 @@ impl ModelExec for TModel {
     fn load_rllm_engine(
         args: LoaderArgs,
         model_args: Self::ModelLoaderArgs,
-    ) -> Result<crate::RllmEngine<Self>> {
+    ) -> Result<rllm::RllmEngine<Self>> {
         load_rllm_engine(args, model_args)
     }
 
diff --git a/rllm-cpp/src/rllm-cpp.rs b/rllm-cpp/src/rllm-cpp.rs
index c28fc8ff..caeded80 100644
--- a/rllm-cpp/src/rllm-cpp.rs
+++ b/rllm-cpp/src/rllm-cpp.rs
@@ -1,8 +1,7 @@
+mod llamacpp;
 use clap::Parser;
-use rllm::{
-    llamacpp::tmodel::{CppLoaderArgs, TModel},
-    util::parse_with_settings,
-};
+use llamacpp::tmodel::{CppLoaderArgs, TModel};
+use rllm::util::parse_with_settings;
 
 /// Serve LLMs with AICI over HTTP with llama.cpp backend.
 #[derive(Parser, Debug)]
diff --git a/rllm-cuda/Cargo.toml b/rllm-cuda/Cargo.toml
index ffb4a5b1..aa1d5808 100644
--- a/rllm-cuda/Cargo.toml
+++ b/rllm-cuda/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "rllm"
+name = "rllm-cuda"
 version = "0.1.0"
 edition = "2021"
 default-run = "rllm-server"
@@ -20,14 +20,13 @@ futures = "0.3.29"
 uuid = { version = "1.6.1", features = ["v4"] }
 derive_more = "0.99.17"
 
-tch = { version = "0.14.0", optional = true }
-torch-sys = { version = "0.14.0", optional = true }
+tch = { version = "0.14.0" }
+torch-sys = { version = "0.14.0" }
 
 cudarc = { version = "0.10.0", features = ["f16"], optional = true }
 tch-cuda = { path = "../tch-cuda", optional = true }
 
-llama_cpp_low = { path = "../llama-cpp-low", optional = true }
-
+rllm = { path = "../rllm-lib" }
 aicirt = { path = "../aicirt" }
 aici_abi = { path = "../aici_abi" }
 libc = "0.2.150"
@@ -42,11 +41,8 @@ percent-encoding = "2.3.1"
 
 [[bin]]
 name = "rllm-server"
-path = "src/driver.rs"
+path = "src/rllm-cuda.rs"
 
 [features]
-#default = ["llamacpp"]
-default = ["tch", "cuda"]
-tch = ["dep:tch", "dep:torch-sys"]
-cuda = ["tch", "dep:tch-cuda", "dep:cudarc", "llama_cpp_low?/cuda"]
-llamacpp = ["dep:llama_cpp_low"]
+default = ["cuda"]
+cuda = ["dep:tch-cuda", "dep:cudarc"]
diff --git a/rllm-cuda/SHIP_TODO.md b/rllm-cuda/SHIP_TODO.md
deleted file mode 100644
index 3673f047..00000000
--- a/rllm-cuda/SHIP_TODO.md
+++ /dev/null
@@ -1,6 +0,0 @@
-## AICIrt
-
-## rLLM
-* [x] load-test
-* [ ] swap to CPU
-* [x] auto-detect GPU cache size
diff --git a/rllm-cuda/src/llm/config.rs b/rllm-cuda/src/llm/config.rs
index b7612293..1903ef01 100644
--- a/rllm-cuda/src/llm/config.rs
+++ b/rllm-cuda/src/llm/config.rs
@@ -1,4 +1,4 @@
-use crate::config::{ModelMeta, RllmConfig};
+use rllm::config::{ModelMeta, RllmConfig};
 use aicirt::bail_user;
 use anyhow::Result;
 use tch::Device;
diff --git a/rllm-cuda/src/llm/loader.rs b/rllm-cuda/src/llm/loader.rs
index 9f7e7c8c..e0439951 100644
--- a/rllm-cuda/src/llm/loader.rs
+++ b/rllm-cuda/src/llm/loader.rs
@@ -6,7 +6,7 @@ use super::{
     tmodel::TModel,
     util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats},
 };
-use crate::{
+use rllm::{
     config::{ModelMeta, RllmConfig},
     CacheSize, HashSet, LoaderArgs, Repo, RllmEngine,
 };
diff --git a/rllm-cuda/src/llm/paged/batch_info.rs b/rllm-cuda/src/llm/paged/batch_info.rs
index 4d4449c8..1bd5a700 100644
--- a/rllm-cuda/src/llm/paged/batch_info.rs
+++ b/rllm-cuda/src/llm/paged/batch_info.rs
@@ -1,7 +1,7 @@
 use super::super::{kernels::to_offsets, tmodel::TModel};
 use super::cache_engine::CacheEngine;
 use super::BlockAllocator;
-use crate::{
+use rllm::{
     config::RllmConfig, seq::SchedulingPhase, util::pad_to_multiple, HashMap, SchedulerOutputs,
 };
 use aicirt::api::Token;
diff --git a/rllm-cuda/src/llm/paged/blocks.rs b/rllm-cuda/src/llm/paged/blocks.rs
index f913bff0..ef84492a 100644
--- a/rllm-cuda/src/llm/paged/blocks.rs
+++ b/rllm-cuda/src/llm/paged/blocks.rs
@@ -1,6 +1,6 @@
 use super::super::tmodel::TModel;
 use super::cache_engine::CacheEngine;
-use crate::{
+use rllm::{
     config::RllmConfig,
     seq::{SchedulingPhase, Sequence, SequenceGroup},
     BlockLocation, CacheSize, HashMap, SchedulerOutputs, SeqId, SequenceManager,
diff --git a/rllm-cuda/src/llm/paged/cache_engine.rs b/rllm-cuda/src/llm/paged/cache_engine.rs
index a4996d49..33744d81 100644
--- a/rllm-cuda/src/llm/paged/cache_engine.rs
+++ b/rllm-cuda/src/llm/paged/cache_engine.rs
@@ -2,7 +2,7 @@
 
 use super::super::{config::TchRllmConfig, kernels, tmodel::TModel};
 use super::CacheIface;
-use crate::{config::RllmConfig, CacheSize, HashMap};
+use rllm::{config::RllmConfig, CacheSize, HashMap};
 use std::sync::Arc;
 use tch::{Device, Tensor};
 
diff --git a/rllm-cuda/src/llm/refkernels.rs b/rllm-cuda/src/llm/refkernels.rs
index 2286f69f..2eac7829 100644
--- a/rllm-cuda/src/llm/refkernels.rs
+++ b/rllm-cuda/src/llm/refkernels.rs
@@ -1,5 +1,5 @@
 use super::util::{check_all_close_attn, to_vec1};
-use crate::HashMap;
+use rllm::HashMap;
 use tch::{IndexOp, Kind, Tensor};
 
 pub fn reshape_and_cache(
diff --git a/rllm-cuda/src/llm/tmodel.rs b/rllm-cuda/src/llm/tmodel.rs
index 97f80857..2e94e1c5 100644
--- a/rllm-cuda/src/llm/tmodel.rs
+++ b/rllm-cuda/src/llm/tmodel.rs
@@ -5,7 +5,7 @@ use super::{
     util::synchronize,
     DType,
 };
-use crate::{
+use rllm::{
     config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps,
 };
 use aicirt::{with_timer, TimerRef};
@@ -45,9 +45,9 @@ impl ModelExec for TModel {
     type SequenceManager = TchSeqMgr;
 
     fn load_model_config(
-        args: &crate::LoaderArgs,
+        args: &rllm::LoaderArgs,
         model_args: &mut Self::ModelLoaderArgs,
-    ) -> Result<(crate::config::ModelMeta, Self::ModelConfig)> {
+    ) -> Result<(rllm::config::ModelMeta, Self::ModelConfig)> {
         let m = load_model_config(args, model_args)?;
         Ok((m.meta.clone(), m))
     }
@@ -57,9 +57,9 @@ impl ModelExec for TModel {
     }
 
     fn load_rllm_engine(
-        args: crate::LoaderArgs,
+        args: rllm::LoaderArgs,
         model_args: Self::ModelLoaderArgs,
-    ) -> Result<crate::RllmEngine<Self>> {
+    ) -> Result<rllm::RllmEngine<Self>> {
         load_rllm_engine(args, model_args)
     }
 
diff --git a/rllm-cuda/src/llm/util.rs b/rllm-cuda/src/llm/util.rs
index fe5ed830..0b295a4a 100644
--- a/rllm-cuda/src/llm/util.rs
+++ b/rllm-cuda/src/llm/util.rs
@@ -1,5 +1,5 @@
 use super::DType;
-use crate::util::get_setting;
+use rllm::util::get_setting;
 use tch::{kind::Element, Device, IndexOp as _, Tensor};
 
 #[cfg(feature = "cuda")]
diff --git a/rllm-cuda/src/driver.rs b/rllm-cuda/src/rllm-cuda.rs
similarity index 85%
rename from rllm-cuda/src/driver.rs
rename to rllm-cuda/src/rllm-cuda.rs
index 8628afac..5a1f952e 100644
--- a/rllm-cuda/src/driver.rs
+++ b/rllm-cuda/src/rllm-cuda.rs
@@ -1,11 +1,11 @@
+mod llm;
+
 use clap::Parser;
-use rllm::{
-    llm::{
-        tmodel::{TModel, TchLoaderArgs},
-        DType,
-    },
-    util::parse_with_settings,
+use llm::{
+    tmodel::{TModel, TchLoaderArgs},
+    DType,
 };
+use rllm::util::parse_with_settings;
 use tch::Device;
 
 /// Serve LLMs with AICI over HTTP with tch (torch) backend.
@@ -48,6 +48,10 @@ async fn main() -> () {
         _ => panic!("invalid dtype; try one of bf16, f16, f32"),
     };
 
-    let model_args = TchLoaderArgs { device, dtype, profile_step_no: args.profile_step };
+    let model_args = TchLoaderArgs {
+        device,
+        dtype,
+        profile_step_no: args.profile_step,
+    };
     rllm::server::server_main::<TModel>(args.args, model_args).await;
 }
diff --git a/rllm-lib/Cargo.toml b/rllm-lib/Cargo.toml
new file mode 100644
index 00000000..6ed863c0
--- /dev/null
+++ b/rllm-lib/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "rllm"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = "1.0.75"
+clap = "4.4.8"
+hf-hub = "0.3.2"
+tokenizers = { version = "0.15.0", features = ["hf-hub"] }
+serde_json = "1.0.108"
+serde = { version = "1.0.193", features = ["derive"] }
+rand = "0.8.5"
+half = "2.3.1"
+log = "0.4.20"
+actix-web = "4.4.0"
+tokio = { version = "1.34.0", features = ["sync"] }
+futures = "0.3.29"
+uuid = { version = "1.6.1", features = ["v4"] }
+derive_more = "0.99.17"
+
+aicirt = { path = "../aicirt" }
+aici_abi = { path = "../aici_abi" }
+libc = "0.2.150"
+base64 = "0.21.5"
+indicatif = "0.17.7"
+memmap2 = "0.9.0"
+safetensors = "0.4.1"
+lazy_static = "1.4.0"
+fxhash = "0.2.1"
+cfg-if = "1.0.0"
+percent-encoding = "2.3.1"
diff --git a/rllm-cuda/src/config.rs b/rllm-lib/src/config.rs
similarity index 100%
rename from rllm-cuda/src/config.rs
rename to rllm-lib/src/config.rs
diff --git a/rllm-cuda/src/engine.rs b/rllm-lib/src/engine.rs
similarity index 100%
rename from rllm-cuda/src/engine.rs
rename to rllm-lib/src/engine.rs
diff --git a/rllm-cuda/src/exec.rs b/rllm-lib/src/exec.rs
similarity index 100%
rename from rllm-cuda/src/exec.rs
rename to rllm-lib/src/exec.rs
diff --git a/rllm-cuda/src/expected.rs b/rllm-lib/src/expected.rs
similarity index 100%
rename from rllm-cuda/src/expected.rs
rename to rllm-lib/src/expected.rs
diff --git a/rllm-cuda/src/iface.rs b/rllm-lib/src/iface.rs
similarity index 100%
rename from rllm-cuda/src/iface.rs
rename to rllm-lib/src/iface.rs
diff --git a/rllm-cuda/src/lib.rs b/rllm-lib/src/lib.rs
similarity index 92%
rename from rllm-cuda/src/lib.rs
rename to rllm-lib/src/lib.rs
index 179b39aa..3816e4f7 100644
--- a/rllm-cuda/src/lib.rs
+++ b/rllm-lib/src/lib.rs
@@ -18,14 +18,6 @@ pub use logits::LogitsProcessor;
 pub use scheduler::*;
 use std::sync::atomic::AtomicBool;
 
-cfg_if::cfg_if! {
-    if #[cfg(feature = "tch")] {
-        pub mod llm;
-    } else {
-        pub mod llamacpp;
-    }
-}
-
 pub use fxhash::FxHashMap as HashMap;
 pub use fxhash::FxHashSet as HashSet;
 
diff --git a/rllm-cuda/src/logits.rs b/rllm-lib/src/logits.rs
similarity index 100%
rename from rllm-cuda/src/logits.rs
rename to rllm-lib/src/logits.rs
diff --git a/rllm-cuda/src/scheduler.rs b/rllm-lib/src/scheduler.rs
similarity index 100%
rename from rllm-cuda/src/scheduler.rs
rename to rllm-lib/src/scheduler.rs
diff --git a/rllm-cuda/src/seq.rs b/rllm-lib/src/seq.rs
similarity index 99%
rename from rllm-cuda/src/seq.rs
rename to rllm-lib/src/seq.rs
index 1eacdd4e..9d60d5c4 100644
--- a/rllm-cuda/src/seq.rs
+++ b/rllm-lib/src/seq.rs
@@ -75,7 +75,7 @@ pub struct Sequence {
     pub prompt_len: usize,
     pub(crate) output_ptr: usize,
     pub(crate) output_pending: Vec<u8>,
-    pub(crate) num_kv_computed: usize,
+    pub num_kv_computed: usize,
     pub(crate) has_aici: bool,
     pub(crate) aici_sampling: AiciSampling,
     pub aici_logs: Vec<SequenceResult>,
diff --git a/rllm-cuda/src/server/api.rs b/rllm-lib/src/server/api.rs
similarity index 100%
rename from rllm-cuda/src/server/api.rs
rename to rllm-lib/src/server/api.rs
diff --git a/rllm-cuda/src/server/completion.rs b/rllm-lib/src/server/completion.rs
similarity index 100%
rename from rllm-cuda/src/server/completion.rs
rename to rllm-lib/src/server/completion.rs
diff --git a/rllm-cuda/src/server/mod.rs b/rllm-lib/src/server/mod.rs
similarity index 100%
rename from rllm-cuda/src/server/mod.rs
rename to rllm-lib/src/server/mod.rs
diff --git a/rllm-cuda/src/server/openai/LICENSE b/rllm-lib/src/server/openai/LICENSE
similarity index 100%
rename from rllm-cuda/src/server/openai/LICENSE
rename to rllm-lib/src/server/openai/LICENSE
diff --git a/rllm-cuda/src/server/openai/mod.rs b/rllm-lib/src/server/openai/mod.rs
similarity index 100%
rename from rllm-cuda/src/server/openai/mod.rs
rename to rllm-lib/src/server/openai/mod.rs
diff --git a/rllm-cuda/src/server/openai/requests.rs b/rllm-lib/src/server/openai/requests.rs
similarity index 100%
rename from rllm-cuda/src/server/openai/requests.rs
rename to rllm-lib/src/server/openai/requests.rs
diff --git a/rllm-cuda/src/server/openai/responses.rs b/rllm-lib/src/server/openai/responses.rs
similarity index 100%
rename from rllm-cuda/src/server/openai/responses.rs
rename to rllm-lib/src/server/openai/responses.rs
diff --git a/rllm-cuda/src/util.rs b/rllm-lib/src/util.rs
similarity index 100%
rename from rllm-cuda/src/util.rs
rename to rllm-lib/src/util.rs