allow --test with llama-cpp

mmoskal committed Jan 25, 2024
1 parent d144ee4 commit 3c5ad91
Showing 8 changed files with 112 additions and 65 deletions.
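This commit makes the --test and --warmup options work with the llama.cpp backend. Previously, ExpectedGeneration::load lived in rllm/src/llm/loader.rs and went through tch (libtorch) tensors, and run_tests was compiled only with the tch feature. The loader is reimplemented in a new backend-neutral module, rllm/src/expected.rs, which reads the .safetensors file directly; the feature gates in server/mod.rs are removed; cpp-server.sh gains warmup files and relaxed tolerances for the phi2 and orca targets; and the llama.cpp loader's GPU-offload cfg is reworked to log a warning when offload is disabled on Intel macOS.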
1 change: 1 addition & 0 deletions Cargo.lock

(Cargo.lock is generated; its diff is not rendered.)

4 changes: 2 additions & 2 deletions cpp-rllm/cpp-server.sh
@@ -36,10 +36,10 @@ fi
 
 case "$1" in
   phi2 )
-    ARGS="-m https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf -t phi"
+    ARGS="-m https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf -t phi -w ../rllm/expected/phi-2/cats.safetensors -s test_maxtol=0.8 -s test_avgtol=0.3"
     ;;
   orca )
-    ARGS="-m https://huggingface.co/TheBloke/Orca-2-13B-GGUF/blob/main/orca-2-13b.Q8_0.gguf -t orca"
+    ARGS="-m https://huggingface.co/TheBloke/Orca-2-13B-GGUF/blob/main/orca-2-13b.Q8_0.gguf -t orca -w ../rllm/expected/orca/cats.safetensors"
     ;;
   build )
     BUILD=1
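The new -w arguments appear to supply the warmup/expected-generation files loaded via ExpectedGeneration::load (see server/mod.rs below); phi2 additionally relaxes test_maxtol/test_avgtol from their 0.5/0.2 defaults (see rllm/src/util.rs), presumably to absorb the extra numerical error of Q8_0-quantized weights.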
101 changes: 101 additions & 0 deletions rllm/src/expected.rs
@@ -0,0 +1,101 @@
use aici_abi::bytes::vec_from_bytes;
use aicirt::api::Token;
use anyhow::Result;
use safetensors::Dtype;
use std::path::PathBuf;

use crate::{ExpectedGeneration, ExpectedToken};

/// Read an integer (or bool) tensor of any supported width as a flat Vec<i32>.
fn read_flat_i32_vec(view: &impl safetensors::View) -> Vec<i32> {
    match view.dtype() {
        Dtype::I32 => vec_from_bytes(&view.data()),
        Dtype::I16 => vec_from_bytes::<i16>(&view.data())
            .iter()
            .map(|x| *x as i32)
            .collect(),
        Dtype::I8 => vec_from_bytes::<i8>(&view.data())
            .iter()
            .map(|x| *x as i32)
            .collect(),
        Dtype::U8 => vec_from_bytes::<u8>(&view.data())
            .iter()
            .map(|x| *x as i32)
            .collect(),
        Dtype::I64 => vec_from_bytes::<i64>(&view.data())
            .iter()
            .map(|x| (*x).try_into().expect("i64->i32 failed"))
            .collect(),
        Dtype::BOOL => vec_from_bytes::<u8>(&view.data())
            .iter()
            .map(|x| if *x != 0 { 1 } else { 0 })
            .collect(),
        _ => panic!("expected int type"),
    }
}

/// Read a float tensor as a flat Vec<f32>, falling back to integer dtypes.
fn read_flat_f32_vec(view: &impl safetensors::View) -> Vec<f32> {
    match view.dtype() {
        Dtype::F32 => vec_from_bytes(&view.data()),
        Dtype::F16 => vec_from_bytes::<u16>(&view.data())
            .iter()
            .map(|x| half::f16::from_bits(*x).to_f32())
            .collect(),
        Dtype::BF16 => vec_from_bytes::<u16>(&view.data())
            .iter()
            .map(|x| half::bf16::from_bits(*x).to_f32())
            .collect(),
        Dtype::F64 => vec_from_bytes::<f64>(&view.data())
            .iter()
            .map(|x| *x as f32)
            .collect(),
        _ => read_flat_i32_vec(view).iter().map(|x| *x as f32).collect(),
    }
}

/// Split a flat vector into rows according to the view's 2-d shape.
fn to_2d<T: Clone>(v: Vec<T>, view: &impl safetensors::View) -> Result<Vec<Vec<T>>> {
    let size = view.shape();
    if size.len() != 2 {
        anyhow::bail!("expected 2d tensor");
    }
    Ok((0..size[0])
        .map(|i| v[i * size[1]..(i + 1) * size[1]].to_vec())
        .collect())
}

impl ExpectedGeneration {
    /// Load an expected generation (prompt, sampled outputs, and per-step
    /// top-k token/logit pairs) from a .safetensors file.
    pub fn load(f: &PathBuf) -> Result<Self> {
        let fp = std::fs::File::open(f)?;
        let content = unsafe { memmap2::MmapOptions::new().map(&fp)? };
        let s = safetensors::SafeTensors::deserialize(&content)?;

        let prompt = read_flat_i32_vec(&s.tensor("prompt")?);
        let output = read_flat_i32_vec(&s.tensor("output")?);
        let prob_mass = read_flat_f32_vec(&s.tensor("prob_mass")?);

        let view = s.tensor("tokens")?;
        let tokens = to_2d(read_flat_i32_vec(&view), &view)?;
        let view = s.tensor("logits")?;
        let logits = to_2d(read_flat_f32_vec(&view), &view)?;

        let num_tokens = output.len();
        assert!(tokens.len() == num_tokens);
        assert!(logits.len() == num_tokens);
        assert!(prob_mass.len() == num_tokens);

        Ok(ExpectedGeneration {
            prompt: prompt.into_iter().map(|x| x as Token).collect(),
            output: (0..num_tokens)
                .map(|i| ExpectedToken {
                    sampled: output[i] as Token,
                    ff_section_len: 1,
                    prob_mass: prob_mass[i],
                    logits: tokens[i]
                        .iter()
                        .zip(logits[i].iter())
                        .map(|(t, p)| (*t as Token, *p))
                        .collect(),
                })
                .collect(),
        })
    }
}
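To illustrate the data this module exposes, here is a minimal sketch of a caller, assuming the types are reachable from the crate root as they are within the crate; the function name and file path are illustrative only:

use std::path::PathBuf;

use crate::ExpectedGeneration;

// Hypothetical smoke test: load a recorded generation and print its shape.
// The path is just an example; real files live under rllm/expected/.
fn dump_expected() -> anyhow::Result<()> {
    let exp = ExpectedGeneration::load(&PathBuf::from("expected/phi-2/cats.safetensors"))?;
    println!("prompt: {} tokens", exp.prompt.len());
    for (i, tok) in exp.output.iter().enumerate() {
        // Each step records the sampled token, the probability mass it
        // covers, and the top-k (token, logit) pairs to compare against.
        println!(
            "step {i}: sampled={} prob_mass={} logits={} entries",
            tok.sampled,
            tok.prob_mass,
            tok.logits.len()
        );
    }
    Ok(())
}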
1 change: 1 addition & 0 deletions rllm/src/lib.rs
@@ -6,6 +6,7 @@ pub mod config;
 mod engine;
 pub mod iface;
 mod logits;
+mod expected;
 pub mod util;
 pub mod server;
6 changes: 4 additions & 2 deletions rllm/src/llamacpp/loader.rs
@@ -46,10 +46,12 @@ fn do_load(args: &mut LoaderArgs) -> Result<cpp::Model> {
     let mut mparams = cpp::ModelParams::default();
     // TODO: make this configurable
     mparams.set_split_mode(cpp::SplitMode::None);
+    mparams.n_gpu_layers = 1000;
     // don't GPU offload on Intel macs - it just fails there
-    #[cfg(not(all(target_os = "macos", target_arch = "x86_64")))]
+    #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
     {
-        mparams.n_gpu_layers = 1000;
+        log::warn!("disabling GPU (Intel macOS)");
+        mparams.n_gpu_layers = 0;
     }
 
     let m = cpp::Model::from_file(file.to_str().unwrap(), mparams)?;
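Note that the n_gpu_layers = 1000 default now applies on all platforms, and the cfg(all(target_os = "macos", target_arch = "x86_64")) block overrides it to 0 (with a logged warning) only in builds targeting Intel macOS; behavior elsewhere is unchanged.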
53 changes: 1 addition & 52 deletions rllm/src/llm/loader.rs
@@ -10,7 +10,7 @@ use crate::{
         },
     },
     paged::{BatchInfoBuilder, CacheEngine, CacheSize},
-    DType, ExpectedGeneration, ExpectedToken, HashSet, LoaderArgs, Repo, RllmEngine,
+    DType, ExpectedGeneration, HashSet, LoaderArgs, Repo, RllmEngine,
     RllmModelConfig,
 };
 use aicirt::api::Token;
@@ -30,57 +30,6 @@ fn read_tensor(s: &safetensors::SafeTensors, name: &str) -> Result<Tensor> {
     Ok(tensor)
 }
 
-fn kind_from_dt(dtype: Dtype) -> Kind {
-    match dtype {
-        Dtype::BOOL => Kind::Bool,
-        Dtype::U8 => Kind::Uint8,
-        Dtype::I8 => Kind::Int8,
-        Dtype::I16 => Kind::Int16,
-        Dtype::I32 => Kind::Int,
-        Dtype::I64 => Kind::Int64,
-        Dtype::BF16 => Kind::BFloat16,
-        Dtype::F16 => Kind::Half,
-        Dtype::F32 => Kind::Float,
-        Dtype::F64 => Kind::Double,
-        dtype => panic!("unsupported dtype {dtype:?}"),
-    }
-}
-
-impl ExpectedGeneration {
-    pub fn load(f: &PathBuf) -> Result<Self> {
-        let fp = std::fs::File::open(f)?;
-        let content = unsafe { memmap2::MmapOptions::new().map(&fp)? };
-        let s = safetensors::SafeTensors::deserialize(&content)?;
-
-        let prompt = to_vec1::<i32>(&read_tensor(&s, "prompt")?.to_kind(Kind::Int));
-        let output = to_vec1::<i32>(&read_tensor(&s, "output")?.to_kind(Kind::Int));
-        let prob_mass = to_vec1::<f32>(&read_tensor(&s, "prob_mass")?.to_kind(Kind::Float));
-        let tokens = to_vec2::<i32>(&read_tensor(&s, "tokens")?.to_kind(Kind::Int));
-        let logits = to_vec2::<f32>(&read_tensor(&s, "logits")?.to_kind(Kind::Float));
-
-        let num_tokens = output.len();
-        assert!(tokens.len() == num_tokens);
-        assert!(logits.len() == num_tokens);
-        assert!(prob_mass.len() == num_tokens);
-
-        Ok(ExpectedGeneration {
-            prompt: prompt.into_iter().map(|x| x as Token).collect(),
-            output: (0..num_tokens)
-                .map(|i| ExpectedToken {
-                    sampled: output[i] as Token,
-                    ff_section_len: 1,
-                    prob_mass: prob_mass[i],
-                    logits: tokens[i]
-                        .iter()
-                        .zip(logits[i].iter())
-                        .map(|(t, p)| (*t as Token, *p))
-                        .collect(),
-                })
-                .collect(),
-        })
-    }
-}
-
 fn load_model(rllm_config: &RllmConfig, filenames: Vec<PathBuf>) -> Result<Box<dyn TModelInner>> {
     let mut vs = VarStore::new(rllm_config.device.clone());
7 changes: 0 additions & 7 deletions rllm/src/server/mod.rs
@@ -348,12 +348,6 @@ fn inference_loop(
     }
 }
 
-#[cfg(not(feature = "tch"))]
-fn run_tests(_args: &RllmCliArgs, _loader_args: LoaderArgs) {
-    panic!("tests not supported without tch feature")
-}
-
-#[cfg(feature = "tch")]
 fn run_tests(args: &RllmCliArgs, loader_args: LoaderArgs) {
     let mut engine = RllmEngine::load(loader_args).expect("failed to load model");
     let mut tests = args.test.clone();
@@ -414,7 +408,6 @@ fn spawn_inference_loop(
     let wid = "warmup".to_string();
     match warmup {
         Some(w) if w == "off" => {}
-        #[cfg(feature = "tch")]
         Some(w) => {
             let exp = crate::ExpectedGeneration::load(&std::path::PathBuf::from(&w))
                 .expect("can't load warmup");
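How the comparison applies the tolerances is not shown in this diff, but the test_maxtol/test_avgtol settings (see util.rs below) suggest a check along these lines. This is a sketch only; the helper name and exact formula are assumptions, not the crate's actual code:

// Hypothetical tolerance check: compare the recorded logits for one
// step against freshly computed ones. Illustrative only.
fn logits_close(expected: &[f32], actual: &[f32], maxtol: f32, avgtol: f32) -> bool {
    assert_eq!(expected.len(), actual.len());
    let mut max_err = 0.0f32;
    let mut sum_err = 0.0f32;
    for (e, a) in expected.iter().zip(actual) {
        let err = (e - a).abs();
        max_err = max_err.max(err);
        sum_err += err;
    }
    // Both the worst-case and the average error must stay in bounds.
    max_err <= maxtol && sum_err / expected.len() as f32 <= avgtol
}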
4 changes: 2 additions & 2 deletions rllm/src/util.rs
@@ -7,8 +7,8 @@ use std::time::Instant;
 const SETTINGS: [(&'static str, &'static str, f64); 4] = [
     ("attn_rtol", "relative tolerance for flash attn check", 0.1),
     ("attn_atol", "absolute tolerance for flash attn check", 0.1),
-    ("test_maxtol", "max allowed error", 0.5),
-    ("test_avgtol", "avg allowed error", 0.2),
+    ("test_maxtol", "max allowed error for --test and --warmup", 0.5),
+    ("test_avgtol", "avg allowed error for --test and --warmup", 0.2),
 ];
 
 lazy_static::lazy_static! {
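These defaults are what the -s test_maxtol=0.8 -s test_avgtol=0.3 flags in cpp-server.sh above appear to override. A sketch of how such a name=value pair could be validated against the table; the actual CLI plumbing lives elsewhere in rllm:

// Hypothetical "-s name=value" parser; illustrative only.
fn parse_setting(arg: &str) -> Option<(&str, f64)> {
    let (name, value) = arg.split_once('=')?;
    let value: f64 = value.parse().ok()?;
    // Accept only names declared in the SETTINGS table above.
    SETTINGS
        .iter()
        .find(|(n, _, _)| *n == name)
        .map(|_| (name, value))
}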
