diff --git a/Cargo.lock b/Cargo.lock
index 9332c414..8f1a3622 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2967,6 +2967,7 @@ dependencies = [
  "llama_cpp_low",
  "log",
  "memmap2",
+ "num-traits",
  "num_cpus",
  "percent-encoding",
  "rand",
diff --git a/cpp-rllm/cpp-server.sh b/cpp-rllm/cpp-server.sh
index 463e7d87..2f4be372 100755
--- a/cpp-rllm/cpp-server.sh
+++ b/cpp-rllm/cpp-server.sh
@@ -36,10 +36,10 @@ fi
 
 case "$1" in
   phi2 )
-    ARGS="-m https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf -t phi"
+    ARGS="-m https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf -t phi -w ../rllm/expected/phi-2/cats.safetensors -s test_maxtol=0.8 -s test_avgtol=0.3"
     ;;
   orca )
-    ARGS="-m https://huggingface.co/TheBloke/Orca-2-13B-GGUF/blob/main/orca-2-13b.Q8_0.gguf -t orca"
+    ARGS="-m https://huggingface.co/TheBloke/Orca-2-13B-GGUF/blob/main/orca-2-13b.Q8_0.gguf -t orca -w ../rllm/expected/orca/cats.safetensors"
     ;;
   build )
     BUILD=1
diff --git a/rllm/src/expected.rs b/rllm/src/expected.rs
new file mode 100644
index 00000000..c38f80bf
--- /dev/null
+++ b/rllm/src/expected.rs
@@ -0,0 +1,158 @@
+//! Loading of reference ("expected") generations stored as safetensors files.
+
+use aici_abi::bytes::vec_from_bytes;
+use aicirt::api::Token;
+use anyhow::{anyhow, bail, ensure, Result};
+use safetensors::Dtype;
+use std::path::PathBuf;
+
+use crate::{ExpectedGeneration, ExpectedToken};
+
+/// Decodes an integer or boolean tensor into a flat `Vec<i32>`.
+///
+/// # Errors
+/// Fails when the dtype is not one of the integer/bool types handled below, or
+/// when an `i64` element does not fit in `i32`.
+fn read_flat_i32_vec(view: &impl safetensors::View) -> Result<Vec<i32>> {
+    Ok(match view.dtype() {
+        Dtype::I32 => vec_from_bytes::<i32>(&view.data()),
+        Dtype::I16 => vec_from_bytes::<i16>(&view.data())
+            .into_iter()
+            .map(i32::from)
+            .collect(),
+        Dtype::I8 => vec_from_bytes::<i8>(&view.data())
+            .into_iter()
+            .map(i32::from)
+            .collect(),
+        Dtype::U8 => vec_from_bytes::<u8>(&view.data())
+            .into_iter()
+            .map(i32::from)
+            .collect(),
+        Dtype::I64 => vec_from_bytes::<i64>(&view.data())
+            .into_iter()
+            .map(i32::try_from)
+            .collect::<Result<Vec<i32>, _>>()
+            .map_err(|e| anyhow!("i64->i32 failed: {e}"))?,
+        // Any non-zero byte is treated as `true` (1).
+        Dtype::BOOL => vec_from_bytes::<u8>(&view.data())
+            .into_iter()
+            .map(|x| i32::from(x != 0))
+            .collect(),
+        other => bail!("expected int type, got {other:?}"),
+    })
+}
+
+/// Decodes a tensor into a flat `Vec<f32>`.
+///
+/// `F32`, `F16`, `BF16` and `F64` are converted directly; any other dtype is
+/// read through [`read_flat_i32_vec`] and then cast to `f32`.
+fn read_flat_f32_vec(view: &impl safetensors::View) -> Result<Vec<f32>> {
+    Ok(match view.dtype() {
+        Dtype::F32 => vec_from_bytes::<f32>(&view.data()),
+        Dtype::F16 => vec_from_bytes::<u16>(&view.data())
+            .into_iter()
+            .map(|bits| half::f16::from_bits(bits).to_f32())
+            .collect(),
+        Dtype::BF16 => vec_from_bytes::<u16>(&view.data())
+            .into_iter()
+            .map(|bits| half::bf16::from_bits(bits).to_f32())
+            .collect(),
+        Dtype::F64 => vec_from_bytes::<f64>(&view.data())
+            .into_iter()
+            .map(|x| x as f32)
+            .collect(),
+        _ => read_flat_i32_vec(view)?
+            .into_iter()
+            .map(|x| x as f32)
+            .collect(),
+    })
+}
+
+/// Splits the flat element list `v` into rows, using the 2-D shape of `view`.
+///
+/// # Errors
+/// Fails when `view` is not 2-dimensional or when `v` does not hold exactly
+/// `rows * cols` elements.
+fn to_2d<T: Clone>(v: Vec<T>, view: &impl safetensors::View) -> Result<Vec<Vec<T>>> {
+    let (rows, cols) = match view.shape() {
+        [rows, cols] => (*rows, *cols),
+        _ => bail!("expected 2d tensor"),
+    };
+    ensure!(
+        v.len() == rows * cols,
+        "tensor data has {} elements, expected {rows}x{cols}",
+        v.len()
+    );
+    if cols == 0 {
+        // `chunks_exact(0)` panics; a zero-width tensor is just `rows` empty rows.
+        return Ok(vec![Vec::new(); rows]);
+    }
+    Ok(v.chunks_exact(cols).map(|row| row.to_vec()).collect())
+}
+
+impl ExpectedGeneration {
+    /// Loads an expected generation from the safetensors file at `f`.
+    ///
+    /// Reads the tensors `prompt`, `output` and `prob_mass`, plus the 2-D
+    /// tensors `tokens` and `logits`, whose rows are paired up element-wise.
+    ///
+    /// # Errors
+    /// Fails when the file cannot be opened, mapped or parsed, when a tensor is
+    /// missing or has an unsupported dtype/shape, or when `tokens`, `logits`
+    /// and `prob_mass` do not each have one entry per `output` token.
+    pub fn load(f: &PathBuf) -> Result<Self> {
+        let fp = std::fs::File::open(f)?;
+        // SAFETY: the mapping is only read, and only within this call. This
+        // assumes nothing truncates or rewrites the file while it is mapped.
+        let content = unsafe { memmap2::MmapOptions::new().map(&fp)? };
+        let s = safetensors::SafeTensors::deserialize(&content)?;
+
+        let prompt = read_flat_i32_vec(&s.tensor("prompt")?)?;
+        let output = read_flat_i32_vec(&s.tensor("output")?)?;
+        let prob_mass = read_flat_f32_vec(&s.tensor("prob_mass")?)?;
+
+        let view = s.tensor("tokens")?;
+        let tokens = to_2d(read_flat_i32_vec(&view)?, &view)?;
+        let view = s.tensor("logits")?;
+        let logits = to_2d(read_flat_f32_vec(&view)?, &view)?;
+
+        // Malformed files are reported as errors rather than panics.
+        let num_tokens = output.len();
+        ensure!(
+            tokens.len() == num_tokens,
+            "`tokens` has {} rows, expected {num_tokens}",
+            tokens.len()
+        );
+        ensure!(
+            logits.len() == num_tokens,
+            "`logits` has {} rows, expected {num_tokens}",
+            logits.len()
+        );
+        ensure!(
+            prob_mass.len() == num_tokens,
+            "`prob_mass` has {} entries, expected {num_tokens}",
+            prob_mass.len()
+        );
+
+        let output = output
+            .into_iter()
+            .zip(prob_mass)
+            .zip(tokens.into_iter().zip(logits))
+            .map(|((sampled, prob_mass), (ids, scores))| ExpectedToken {
+                sampled: sampled as Token,
+                ff_section_len: 1,
+                prob_mass,
+                logits: ids
+                    .into_iter()
+                    .map(|id| id as Token)
+                    .zip(scores)
+                    .collect(),
+            })
+            .collect();
+
+        Ok(ExpectedGeneration {
+            prompt: prompt.into_iter().map(|x| x as Token).collect(),
+            output,
+        })
+    }
+}
diff --git a/rllm/src/lib.rs b/rllm/src/lib.rs
index cd76b97c..4deebf00 100644
--- a/rllm/src/lib.rs
+++ b/rllm/src/lib.rs
@@ -6,6 +6,7 @@ pub mod config; mod engine; pub mod iface; mod logits; +mod expected; pub mod util; pub mod server; diff --git a/rllm/src/llamacpp/loader.rs b/rllm/src/llamacpp/loader.rs index 9c7f322f..599315d1 100644 --- a/rllm/src/llamacpp/loader.rs +++ b/rllm/src/llamacpp/loader.rs @@ -46,10 +46,12 @@ fn do_load(args: &mut LoaderArgs) -> Result { let mut mparams = cpp::ModelParams::default(); // TODO: make this configurable mparams.set_split_mode(cpp::SplitMode::None); + mparams.n_gpu_layers = 1000; // don't GPU offload on Intel macs - it just fails there - #[cfg(not(all(target_os = "macos", target_arch = "x86_64")))] + #[cfg(all(target_os = "macos", target_arch = "x86_64"))] { - mparams.n_gpu_layers = 1000; + log::warn!("disabling GPU (Intel macOS)"); + mparams.n_gpu_layers = 0; } let m = cpp::Model::from_file(file.to_str().unwrap(), mparams)?; diff --git a/rllm/src/llm/loader.rs b/rllm/src/llm/loader.rs index 5f22c337..595a6526 100644 --- a/rllm/src/llm/loader.rs +++ b/rllm/src/llm/loader.rs @@ -10,7 +10,7 @@ use crate::{ }, }, paged::{BatchInfoBuilder, CacheEngine, CacheSize}, - DType, ExpectedGeneration, ExpectedToken, HashSet, LoaderArgs, Repo, RllmEngine, + DType, ExpectedGeneration, HashSet, LoaderArgs, Repo, RllmEngine, RllmModelConfig, }; use aicirt::api::Token; @@ -30,57 +30,6 @@ fn read_tensor(s: &safetensors::SafeTensors, name: &str) -> Result { Ok(tensor) } -fn kind_from_dt(dtype: Dtype) -> Kind { - match dtype { - Dtype::BOOL => Kind::Bool, - Dtype::U8 => Kind::Uint8, - Dtype::I8 => Kind::Int8, - Dtype::I16 => Kind::Int16, - Dtype::I32 => Kind::Int, - Dtype::I64 => Kind::Int64, - Dtype::BF16 => Kind::BFloat16, - Dtype::F16 => Kind::Half, - Dtype::F32 => Kind::Float, - Dtype::F64 => Kind::Double, - dtype => panic!("unsupported dtype {dtype:?}"), - } -} - -impl ExpectedGeneration { - pub fn load(f: &PathBuf) -> Result { - let fp = std::fs::File::open(f)?; - let content = unsafe { memmap2::MmapOptions::new().map(&fp)? 
}; - let s = safetensors::SafeTensors::deserialize(&content)?; - - let prompt = to_vec1::(&read_tensor(&s, "prompt")?.to_kind(Kind::Int)); - let output = to_vec1::(&read_tensor(&s, "output")?.to_kind(Kind::Int)); - let prob_mass = to_vec1::(&read_tensor(&s, "prob_mass")?.to_kind(Kind::Float)); - let tokens = to_vec2::(&read_tensor(&s, "tokens")?.to_kind(Kind::Int)); - let logits = to_vec2::(&read_tensor(&s, "logits")?.to_kind(Kind::Float)); - - let num_tokens = output.len(); - assert!(tokens.len() == num_tokens); - assert!(logits.len() == num_tokens); - assert!(prob_mass.len() == num_tokens); - - Ok(ExpectedGeneration { - prompt: prompt.into_iter().map(|x| x as Token).collect(), - output: (0..num_tokens) - .map(|i| ExpectedToken { - sampled: output[i] as Token, - ff_section_len: 1, - prob_mass: prob_mass[i], - logits: tokens[i] - .iter() - .zip(logits[i].iter()) - .map(|(t, p)| (*t as Token, *p)) - .collect(), - }) - .collect(), - }) - } -} - fn load_model(rllm_config: &RllmConfig, filenames: Vec) -> Result> { let mut vs = VarStore::new(rllm_config.device.clone()); diff --git a/rllm/src/server/mod.rs b/rllm/src/server/mod.rs index 2d452b61..8256fbc8 100644 --- a/rllm/src/server/mod.rs +++ b/rllm/src/server/mod.rs @@ -348,12 +348,6 @@ fn inference_loop( } } -#[cfg(not(feature = "tch"))] -fn run_tests(_args: &RllmCliArgs, _loader_args: LoaderArgs) { - panic!("tests not supported without tch feature") -} - -#[cfg(feature = "tch")] fn run_tests(args: &RllmCliArgs, loader_args: LoaderArgs) { let mut engine = RllmEngine::load(loader_args).expect("failed to load model"); let mut tests = args.test.clone(); @@ -414,7 +408,6 @@ fn spawn_inference_loop( let wid = "warmup".to_string(); match warmup { Some(w) if w == "off" => {} - #[cfg(feature = "tch")] Some(w) => { let exp = crate::ExpectedGeneration::load(&std::path::PathBuf::from(&w)) .expect("can't load warmup"); diff --git a/rllm/src/util.rs b/rllm/src/util.rs index 1bb06a7b..a5c94f14 100644 --- a/rllm/src/util.rs +++ 
b/rllm/src/util.rs @@ -7,8 +7,8 @@ use std::time::Instant; const SETTINGS: [(&'static str, &'static str, f64); 4] = [ ("attn_rtol", "relative tolerance for flash attn check", 0.1), ("attn_atol", "absolute tolerance for flash attn check", 0.1), - ("test_maxtol", "max allowed error", 0.5), - ("test_avgtol", "avg allowed error", 0.2), + ("test_maxtol", "max allowed error for --test and --warmup", 0.5), + ("test_avgtol", "avg allowed error for --test and --warmup", 0.2), ]; lazy_static::lazy_static! {