From 469cf646cb140f589dda932ae2690221c4ccee45 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Wed, 7 Feb 2024 21:14:11 +0000 Subject: [PATCH] split cpp/cuda code into folders --- Cargo.lock | 42 ++++++++++++++++--- Cargo.toml | 1 + rllm-cpp/Cargo.toml | 3 +- .../src/llamacpp/blocks.rs | 2 +- .../src/llamacpp/loader.rs | 2 +- {rllm-cuda => rllm-cpp}/src/llamacpp/mod.rs | 2 +- {rllm-cuda => rllm-cpp}/src/llamacpp/seqid.rs | 2 +- .../src/llamacpp/tmodel.rs | 4 +- rllm-cpp/src/rllm-cpp.rs | 7 ++-- rllm-cuda/Cargo.toml | 18 ++++---- rllm-cuda/SHIP_TODO.md | 6 --- rllm-cuda/src/llm/config.rs | 2 +- rllm-cuda/src/llm/loader.rs | 2 +- rllm-cuda/src/llm/paged/batch_info.rs | 2 +- rllm-cuda/src/llm/paged/blocks.rs | 2 +- rllm-cuda/src/llm/paged/cache_engine.rs | 2 +- rllm-cuda/src/llm/refkernels.rs | 2 +- rllm-cuda/src/llm/tmodel.rs | 10 ++--- rllm-cuda/src/llm/util.rs | 2 +- rllm-cuda/src/{driver.rs => rllm-cuda.rs} | 18 ++++---- rllm-lib/Cargo.toml | 32 ++++++++++++++ {rllm-cuda => rllm-lib}/src/config.rs | 0 {rllm-cuda => rllm-lib}/src/engine.rs | 0 {rllm-cuda => rllm-lib}/src/exec.rs | 0 {rllm-cuda => rllm-lib}/src/expected.rs | 0 {rllm-cuda => rllm-lib}/src/iface.rs | 0 {rllm-cuda => rllm-lib}/src/lib.rs | 8 ---- {rllm-cuda => rllm-lib}/src/logits.rs | 0 {rllm-cuda => rllm-lib}/src/scheduler.rs | 0 {rllm-cuda => rllm-lib}/src/seq.rs | 2 +- {rllm-cuda => rllm-lib}/src/server/api.rs | 0 .../src/server/completion.rs | 0 {rllm-cuda => rllm-lib}/src/server/mod.rs | 0 .../src/server/openai/LICENSE | 0 .../src/server/openai/mod.rs | 0 .../src/server/openai/requests.rs | 0 .../src/server/openai/responses.rs | 0 {rllm-cuda => rllm-lib}/src/util.rs | 0 38 files changed, 112 insertions(+), 61 deletions(-) rename {rllm-cuda => rllm-cpp}/src/llamacpp/blocks.rs (98%) rename {rllm-cuda => rllm-cpp}/src/llamacpp/loader.rs (97%) rename {rllm-cuda => rllm-cpp}/src/llamacpp/mod.rs (96%) rename {rllm-cuda => rllm-cpp}/src/llamacpp/seqid.rs (96%) rename {rllm-cuda => rllm-cpp}/src/llamacpp/tmodel.rs (99%) delete mode 100644 rllm-cuda/SHIP_TODO.md rename rllm-cuda/src/{driver.rs => rllm-cuda.rs} (85%) create mode 100644 rllm-lib/Cargo.toml rename {rllm-cuda => rllm-lib}/src/config.rs (100%) rename {rllm-cuda => rllm-lib}/src/engine.rs (100%) rename {rllm-cuda => rllm-lib}/src/exec.rs (100%) rename {rllm-cuda => rllm-lib}/src/expected.rs (100%) rename {rllm-cuda => rllm-lib}/src/iface.rs (100%) rename {rllm-cuda => rllm-lib}/src/lib.rs (92%) rename {rllm-cuda => rllm-lib}/src/logits.rs (100%) rename {rllm-cuda => rllm-lib}/src/scheduler.rs (100%) rename {rllm-cuda => rllm-lib}/src/seq.rs (99%) rename {rllm-cuda => rllm-lib}/src/server/api.rs (100%) rename {rllm-cuda => rllm-lib}/src/server/completion.rs (100%) rename {rllm-cuda => rllm-lib}/src/server/mod.rs (100%) rename {rllm-cuda => rllm-lib}/src/server/openai/LICENSE (100%) rename {rllm-cuda => rllm-lib}/src/server/openai/mod.rs (100%) rename {rllm-cuda => rllm-lib}/src/server/openai/requests.rs (100%) rename {rllm-cuda => rllm-lib}/src/server/openai/responses.rs (100%) rename {rllm-cuda => rllm-lib}/src/util.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index cbd71781..22360fd1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2945,7 +2945,6 @@ dependencies = [ "base64 0.21.5", "cfg-if", "clap", - "cudarc", "derive_more", "futures", "fxhash", @@ -2954,7 +2953,6 @@ dependencies = [ "indicatif", "lazy_static", "libc", - "llama_cpp_low", "log", "memmap2", "percent-encoding", @@ -2962,11 +2960,8 @@ dependencies = [ "safetensors 0.4.1", "serde", "serde_json", - "tch", - "tch-cuda", "tokenizers", "tokio", - "torch-sys", "uuid", ] @@ -2975,11 +2970,48 @@ name = "rllm-cpp" version = "0.1.0" dependencies = [ "actix-web", + "anyhow", "clap", "llama_cpp_low", "rllm", ] +[[package]] +name = "rllm-cuda" +version = "0.1.0" +dependencies = [ + "actix-web", + "aici_abi", + "aicirt", + "anyhow", + "base64 0.21.5", + "cfg-if", + "clap", + "cudarc", + "derive_more", + "futures", + "fxhash", + "half 2.3.1", + "hf-hub", + "indicatif", + "lazy_static", + "libc", + "log", + "memmap2", + "percent-encoding", + "rand", + "rllm", + "safetensors 0.4.1", + "serde", + "serde_json", + "tch", + "tch-cuda", + "tokenizers", + "tokio", + "torch-sys", + "uuid", +] + [[package]] name = "rquickjs" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index 8617a353..41fc0335 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "pyctrl", "jsctrl", "uppercase", + "rllm-lib", "rllm-cuda", "rllm-cpp", "tch-cuda", diff --git a/rllm-cpp/Cargo.toml b/rllm-cpp/Cargo.toml index 61fbe6a9..4fb7bcb4 100644 --- a/rllm-cpp/Cargo.toml +++ b/rllm-cpp/Cargo.toml @@ -6,9 +6,10 @@ rust-version = "1.75.0" [dependencies] actix-web = "4.4.0" +anyhow = "1.0.79" clap = { version = "4.4.18", features = ["derive"] } llama_cpp_low = { path = "../llama-cpp-low" } -rllm = { path = "../rllm-cuda", default-features = false, features = ["llamacpp"] } +rllm = { path = "../rllm-lib" } [[bin]] name = "rllm-cpp" diff --git a/rllm-cuda/src/llamacpp/blocks.rs b/rllm-cpp/src/llamacpp/blocks.rs similarity index 98% rename from rllm-cuda/src/llamacpp/blocks.rs rename to rllm-cpp/src/llamacpp/blocks.rs index 4e4fdc09..47d7885c 100644 --- a/rllm-cuda/src/llamacpp/blocks.rs +++ b/rllm-cpp/src/llamacpp/blocks.rs @@ -1,4 +1,4 @@ -use crate::{ +use rllm::{ seq::{Sequence, SequenceGroup}, SchedulerOutputs, TBlockSpaceManager, }; diff --git a/rllm-cuda/src/llamacpp/loader.rs b/rllm-cpp/src/llamacpp/loader.rs similarity index 97% rename from rllm-cuda/src/llamacpp/loader.rs rename to rllm-cpp/src/llamacpp/loader.rs index 93e9e8bf..9d6d9c86 100644 --- a/rllm-cuda/src/llamacpp/loader.rs +++ b/rllm-cpp/src/llamacpp/loader.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::{config::ModelMeta, LoaderArgs, Repo, RllmEngine}; +use rllm::{config::ModelMeta, LoaderArgs, Repo, RllmEngine}; use anyhow::{bail, Result}; use llama_cpp_low as cpp; diff --git a/rllm-cuda/src/llamacpp/mod.rs b/rllm-cpp/src/llamacpp/mod.rs similarity index 96% rename from rllm-cuda/src/llamacpp/mod.rs rename to rllm-cpp/src/llamacpp/mod.rs index 2360fc1f..692922a6 100644 --- a/rllm-cuda/src/llamacpp/mod.rs +++ b/rllm-cpp/src/llamacpp/mod.rs @@ -1,4 +1,4 @@ -use crate::TensorOps; +use rllm::TensorOps; pub mod blocks; pub mod loader; diff --git a/rllm-cuda/src/llamacpp/seqid.rs b/rllm-cpp/src/llamacpp/seqid.rs similarity index 96% rename from rllm-cuda/src/llamacpp/seqid.rs rename to rllm-cpp/src/llamacpp/seqid.rs index e0f16f86..6fd23984 100644 --- a/rllm-cuda/src/llamacpp/seqid.rs +++ b/rllm-cpp/src/llamacpp/seqid.rs @@ -1,6 +1,6 @@ use std::sync::Mutex; -use crate::{HashMap, SeqId, SequenceManager}; +use rllm::{HashMap, SeqId, SequenceManager}; use llama_cpp_low as cpp; pub struct CppSequenceManager { diff --git a/rllm-cuda/src/llamacpp/tmodel.rs b/rllm-cpp/src/llamacpp/tmodel.rs similarity index 99% rename from rllm-cuda/src/llamacpp/tmodel.rs rename to rllm-cpp/src/llamacpp/tmodel.rs index 73d78a50..9173ab85 100644 --- a/rllm-cuda/src/llamacpp/tmodel.rs +++ b/rllm-cpp/src/llamacpp/tmodel.rs @@ -1,4 +1,4 @@ -use crate::{ +use rllm::{ config::{ModelMeta, RllmConfig}, seq::SchedulingPhase, AiciBias, HashMap, LoaderArgs, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps }; use aicirt::{with_timer, TimerRef}; @@ -179,7 +179,7 @@ impl ModelExec for TModel { fn load_rllm_engine( args: LoaderArgs, model_args: Self::ModelLoaderArgs, - ) -> Result> { + ) -> Result> { load_rllm_engine(args, model_args) } diff --git a/rllm-cpp/src/rllm-cpp.rs b/rllm-cpp/src/rllm-cpp.rs index c28fc8ff..caeded80 100644 --- a/rllm-cpp/src/rllm-cpp.rs +++ b/rllm-cpp/src/rllm-cpp.rs @@ -1,8 +1,7 @@ +mod llamacpp; use clap::Parser; -use rllm::{ - llamacpp::tmodel::{CppLoaderArgs, TModel}, - util::parse_with_settings, -}; +use llamacpp::tmodel::{CppLoaderArgs, TModel}; +use rllm::util::parse_with_settings; /// Serve LLMs with AICI over HTTP with llama.cpp backend. #[derive(Parser, Debug)] diff --git a/rllm-cuda/Cargo.toml b/rllm-cuda/Cargo.toml index ffb4a5b1..aa1d5808 100644 --- a/rllm-cuda/Cargo.toml +++ b/rllm-cuda/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "rllm" +name = "rllm-cuda" version = "0.1.0" edition = "2021" default-run = "rllm-server" @@ -20,14 +20,13 @@ futures = "0.3.29" uuid = { version = "1.6.1", features = ["v4"] } derive_more = "0.99.17" -tch = { version = "0.14.0", optional = true } -torch-sys = { version = "0.14.0", optional = true } +tch = { version = "0.14.0" } +torch-sys = { version = "0.14.0" } cudarc = { version = "0.10.0", features = ["f16"], optional = true } tch-cuda = { path = "../tch-cuda", optional = true } -llama_cpp_low = { path = "../llama-cpp-low", optional = true } - +rllm = { path = "../rllm-lib" } aicirt = { path = "../aicirt" } aici_abi = { path = "../aici_abi" } libc = "0.2.150" @@ -42,11 +41,8 @@ percent-encoding = "2.3.1" [[bin]] name = "rllm-server" -path = "src/driver.rs" +path = "src/rllm-cuda.rs" [features] -#default = ["llamacpp"] -default = ["tch", "cuda"] -tch = ["dep:tch", "dep:torch-sys"] -cuda = ["tch", "dep:tch-cuda", "dep:cudarc", "llama_cpp_low?/cuda"] -llamacpp = ["dep:llama_cpp_low"] +default = ["cuda"] +cuda = ["dep:tch-cuda", "dep:cudarc"] diff --git a/rllm-cuda/SHIP_TODO.md b/rllm-cuda/SHIP_TODO.md deleted file mode 100644 index 3673f047..00000000 --- a/rllm-cuda/SHIP_TODO.md +++ /dev/null @@ -1,6 +0,0 @@ -## AICIrt - -## rLLM -* [x] load-test -* [ ] swap to CPU -* [x] auto-detect GPU cache size diff --git a/rllm-cuda/src/llm/config.rs b/rllm-cuda/src/llm/config.rs index b7612293..1903ef01 100644 --- a/rllm-cuda/src/llm/config.rs +++ b/rllm-cuda/src/llm/config.rs @@ -1,4 +1,4 @@ -use crate::config::{ModelMeta, RllmConfig}; +use rllm::config::{ModelMeta, RllmConfig}; use aicirt::bail_user; use anyhow::Result; use tch::Device; diff --git a/rllm-cuda/src/llm/loader.rs b/rllm-cuda/src/llm/loader.rs index 9f7e7c8c..e0439951 100644 --- a/rllm-cuda/src/llm/loader.rs +++ b/rllm-cuda/src/llm/loader.rs @@ -6,7 +6,7 @@ use super::{ tmodel::TModel, util::{gpu_memory_size, gpu_peak_allocated_bytes, log_mem_stats, reset_mem_stats}, }; -use crate::{ +use rllm::{ config::{ModelMeta, RllmConfig}, CacheSize, HashSet, LoaderArgs, Repo, RllmEngine, }; diff --git a/rllm-cuda/src/llm/paged/batch_info.rs b/rllm-cuda/src/llm/paged/batch_info.rs index 4d4449c8..1bd5a700 100644 --- a/rllm-cuda/src/llm/paged/batch_info.rs +++ b/rllm-cuda/src/llm/paged/batch_info.rs @@ -1,7 +1,7 @@ use super::super::{kernels::to_offsets, tmodel::TModel}; use super::cache_engine::CacheEngine; use super::BlockAllocator; -use crate::{ +use rllm::{ config::RllmConfig, seq::SchedulingPhase, util::pad_to_multiple, HashMap, SchedulerOutputs, }; use aicirt::api::Token; diff --git a/rllm-cuda/src/llm/paged/blocks.rs b/rllm-cuda/src/llm/paged/blocks.rs index f913bff0..ef84492a 100644 --- a/rllm-cuda/src/llm/paged/blocks.rs +++ b/rllm-cuda/src/llm/paged/blocks.rs @@ -1,6 +1,6 @@ use super::super::tmodel::TModel; use super::cache_engine::CacheEngine; -use crate::{ +use rllm::{ config::RllmConfig, seq::{SchedulingPhase, Sequence, SequenceGroup}, BlockLocation, CacheSize, HashMap, SchedulerOutputs, SeqId, SequenceManager, diff --git a/rllm-cuda/src/llm/paged/cache_engine.rs b/rllm-cuda/src/llm/paged/cache_engine.rs index a4996d49..33744d81 100644 --- a/rllm-cuda/src/llm/paged/cache_engine.rs +++ b/rllm-cuda/src/llm/paged/cache_engine.rs @@ -2,7 +2,7 @@ use super::super::{config::TchRllmConfig, kernels, tmodel::TModel}; use super::CacheIface; -use crate::{config::RllmConfig, CacheSize, HashMap}; +use rllm::{config::RllmConfig, CacheSize, HashMap}; use std::sync::Arc; use tch::{Device, Tensor}; diff --git a/rllm-cuda/src/llm/refkernels.rs b/rllm-cuda/src/llm/refkernels.rs index 2286f69f..2eac7829 100644 --- a/rllm-cuda/src/llm/refkernels.rs +++ b/rllm-cuda/src/llm/refkernels.rs @@ -1,5 +1,5 @@ use super::util::{check_all_close_attn, to_vec1}; -use crate::HashMap; +use rllm::HashMap; use tch::{IndexOp, Kind, Tensor}; pub fn reshape_and_cache( diff --git a/rllm-cuda/src/llm/tmodel.rs b/rllm-cuda/src/llm/tmodel.rs index 97f80857..2e94e1c5 100644 --- a/rllm-cuda/src/llm/tmodel.rs +++ b/rllm-cuda/src/llm/tmodel.rs @@ -5,7 +5,7 @@ use super::{ util::synchronize, DType, }; -use crate::{ +use rllm::{ config::RllmConfig, AiciBias, LogitsProcessor, ModelExec, SchedulerOutputs, TensorOps, }; use aicirt::{with_timer, TimerRef}; @@ -45,9 +45,9 @@ impl ModelExec for TModel { type SequenceManager = TchSeqMgr; fn load_model_config( - args: &crate::LoaderArgs, + args: &rllm::LoaderArgs, model_args: &mut Self::ModelLoaderArgs, - ) -> Result<(crate::config::ModelMeta, Self::ModelConfig)> { + ) -> Result<(rllm::config::ModelMeta, Self::ModelConfig)> { let m = load_model_config(args, model_args)?; Ok((m.meta.clone(), m)) } @@ -57,9 +57,9 @@ impl ModelExec for TModel { } fn load_rllm_engine( - args: crate::LoaderArgs, + args: rllm::LoaderArgs, model_args: Self::ModelLoaderArgs, - ) -> Result> { + ) -> Result> { load_rllm_engine(args, model_args) } diff --git a/rllm-cuda/src/llm/util.rs b/rllm-cuda/src/llm/util.rs index fe5ed830..0b295a4a 100644 --- a/rllm-cuda/src/llm/util.rs +++ b/rllm-cuda/src/llm/util.rs @@ -1,5 +1,5 @@ use super::DType; -use crate::util::get_setting; +use rllm::util::get_setting; use tch::{kind::Element, Device, IndexOp as _, Tensor}; #[cfg(feature = "cuda")] diff --git a/rllm-cuda/src/driver.rs b/rllm-cuda/src/rllm-cuda.rs similarity index 85% rename from rllm-cuda/src/driver.rs rename to rllm-cuda/src/rllm-cuda.rs index 8628afac..5a1f952e 100644 --- a/rllm-cuda/src/driver.rs +++ b/rllm-cuda/src/rllm-cuda.rs @@ -1,11 +1,11 @@ +mod llm; + use clap::Parser; -use rllm::{ - llm::{ - tmodel::{TModel, TchLoaderArgs}, - DType, - }, - util::parse_with_settings, +use llm::{ + tmodel::{TModel, TchLoaderArgs}, + DType, }; +use rllm::util::parse_with_settings; use tch::Device; /// Serve LLMs with AICI over HTTP with tch (torch) backend. @@ -48,6 +48,10 @@ async fn main() -> () { _ => panic!("invalid dtype; try one of bf16, f16, f32"), }; - let model_args = TchLoaderArgs { device, dtype, profile_step_no: args.profile_step }; + let model_args = TchLoaderArgs { + device, + dtype, + profile_step_no: args.profile_step, + }; rllm::server::server_main::(args.args, model_args).await; } diff --git a/rllm-lib/Cargo.toml b/rllm-lib/Cargo.toml new file mode 100644 index 00000000..6ed863c0 --- /dev/null +++ b/rllm-lib/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "rllm" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.75" +clap = "4.4.8" +hf-hub = "0.3.2" +tokenizers = { version = "0.15.0", features = ["hf-hub"] } +serde_json = "1.0.108" +serde = { version = "1.0.193", features = ["derive"] } +rand = "0.8.5" +half = "2.3.1" +log = "0.4.20" +actix-web = "4.4.0" +tokio = { version = "1.34.0", features = ["sync"] } +futures = "0.3.29" +uuid = { version = "1.6.1", features = ["v4"] } +derive_more = "0.99.17" + +aicirt = { path = "../aicirt" } +aici_abi = { path = "../aici_abi" } +libc = "0.2.150" +base64 = "0.21.5" +indicatif = "0.17.7" +memmap2 = "0.9.0" +safetensors = "0.4.1" +lazy_static = "1.4.0" +fxhash = "0.2.1" +cfg-if = "1.0.0" +percent-encoding = "2.3.1" diff --git a/rllm-cuda/src/config.rs b/rllm-lib/src/config.rs similarity index 100% rename from rllm-cuda/src/config.rs rename to rllm-lib/src/config.rs diff --git a/rllm-cuda/src/engine.rs b/rllm-lib/src/engine.rs similarity index 100% rename from rllm-cuda/src/engine.rs rename to rllm-lib/src/engine.rs diff --git a/rllm-cuda/src/exec.rs b/rllm-lib/src/exec.rs similarity index 100% rename from rllm-cuda/src/exec.rs rename to rllm-lib/src/exec.rs diff --git a/rllm-cuda/src/expected.rs b/rllm-lib/src/expected.rs similarity index 100% rename from rllm-cuda/src/expected.rs rename to rllm-lib/src/expected.rs diff --git a/rllm-cuda/src/iface.rs b/rllm-lib/src/iface.rs similarity index 100% rename from rllm-cuda/src/iface.rs rename to rllm-lib/src/iface.rs diff --git a/rllm-cuda/src/lib.rs b/rllm-lib/src/lib.rs similarity index 92% rename from rllm-cuda/src/lib.rs rename to rllm-lib/src/lib.rs index 179b39aa..3816e4f7 100644 --- a/rllm-cuda/src/lib.rs +++ b/rllm-lib/src/lib.rs @@ -18,14 +18,6 @@ pub use logits::LogitsProcessor; pub use scheduler::*; use std::sync::atomic::AtomicBool; -cfg_if::cfg_if! { - if #[cfg(feature = "tch")] { - pub mod llm; - } else { - pub mod llamacpp; - } -} - pub use fxhash::FxHashMap as HashMap; pub use fxhash::FxHashSet as HashSet; diff --git a/rllm-cuda/src/logits.rs b/rllm-lib/src/logits.rs similarity index 100% rename from rllm-cuda/src/logits.rs rename to rllm-lib/src/logits.rs diff --git a/rllm-cuda/src/scheduler.rs b/rllm-lib/src/scheduler.rs similarity index 100% rename from rllm-cuda/src/scheduler.rs rename to rllm-lib/src/scheduler.rs diff --git a/rllm-cuda/src/seq.rs b/rllm-lib/src/seq.rs similarity index 99% rename from rllm-cuda/src/seq.rs rename to rllm-lib/src/seq.rs index 1eacdd4e..9d60d5c4 100644 --- a/rllm-cuda/src/seq.rs +++ b/rllm-lib/src/seq.rs @@ -75,7 +75,7 @@ pub struct Sequence { pub prompt_len: usize, pub(crate) output_ptr: usize, pub(crate) output_pending: Vec, - pub(crate) num_kv_computed: usize, + pub num_kv_computed: usize, pub(crate) has_aici: bool, pub(crate) aici_sampling: AiciSampling, pub aici_logs: Vec, diff --git a/rllm-cuda/src/server/api.rs b/rllm-lib/src/server/api.rs similarity index 100% rename from rllm-cuda/src/server/api.rs rename to rllm-lib/src/server/api.rs diff --git a/rllm-cuda/src/server/completion.rs b/rllm-lib/src/server/completion.rs similarity index 100% rename from rllm-cuda/src/server/completion.rs rename to rllm-lib/src/server/completion.rs diff --git a/rllm-cuda/src/server/mod.rs b/rllm-lib/src/server/mod.rs similarity index 100% rename from rllm-cuda/src/server/mod.rs rename to rllm-lib/src/server/mod.rs diff --git a/rllm-cuda/src/server/openai/LICENSE b/rllm-lib/src/server/openai/LICENSE similarity index 100% rename from rllm-cuda/src/server/openai/LICENSE rename to rllm-lib/src/server/openai/LICENSE diff --git a/rllm-cuda/src/server/openai/mod.rs b/rllm-lib/src/server/openai/mod.rs similarity index 100% rename from rllm-cuda/src/server/openai/mod.rs rename to rllm-lib/src/server/openai/mod.rs diff --git a/rllm-cuda/src/server/openai/requests.rs b/rllm-lib/src/server/openai/requests.rs similarity index 100% rename from rllm-cuda/src/server/openai/requests.rs rename to rllm-lib/src/server/openai/requests.rs diff --git a/rllm-cuda/src/server/openai/responses.rs b/rllm-lib/src/server/openai/responses.rs similarity index 100% rename from rllm-cuda/src/server/openai/responses.rs rename to rllm-lib/src/server/openai/responses.rs diff --git a/rllm-cuda/src/util.rs b/rllm-lib/src/util.rs similarity index 100% rename from rllm-cuda/src/util.rs rename to rllm-lib/src/util.rs