diff --git a/README.md b/README.md
index 0ed39f83..211145ed 100644
--- a/README.md
+++ b/README.md
@@ -156,7 +156,7 @@ you can also run tests with `pytest` for the DeclCtrl, or with `./scripts/test-p
 
 To run rLLM server, go to `rllm/` and run `./server.sh orca`.
 This will run the inference server with Orca-2 13B model (which is expected by testcases).
-If you don't have CUDA, go to `cpp-rllm/` and run `./cpp-server.sh cpu phi2`.
+If you don't have CUDA, go to `cpp-rllm/` and run `./cpp-server.sh phi2`.
 You can also try other models, see [rllm/README.md](rllm/README.md) and
 [cpp-rllm/README.md](cpp-rllm/README.md) for details.
 
diff --git a/cpp-rllm/README.md b/cpp-rllm/README.md
index 16014ff2..4963fc8c 100644
--- a/cpp-rllm/README.md
+++ b/cpp-rllm/README.md
@@ -11,8 +11,7 @@ If you're not using the supplied docker container follow the
 To compile and run first aicirt and then the rllm server, run:
 
 ```bash
-./cpp-server.sh cpu phi2
+./cpp-server.sh phi2
 ```
 
-You can also try `gpu` instead of `gpu` which will try to use CUDA.
-
+You can also pass `--cuda` before `phi2` to build with CUDA support.
diff --git a/cpp-rllm/cpp-server.sh b/cpp-rllm/cpp-server.sh
index f95452e6..68547b14 100755
--- a/cpp-rllm/cpp-server.sh
+++ b/cpp-rllm/cpp-server.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 set -e
 
-REL=
+REL=--release
 LOOP=
 BUILD=
 ADD_ARGS=
@@ -24,19 +24,14 @@ fi
 
 VER="--no-default-features"
 
-if [ "$1" = gpu ] ; then
-    REL=--release
+if [ "$1" = "--cuda" ] ; then
     VER="$VER --features cuda"
     shift
-elif [ "$1" = cpu ] ; then
-    REL=--release
-    shift
-elif [ "$1" = debug ] ; then
+fi
+
+if [ "$1" = "--debug" ] ; then
     REL=
     shift
-else
-    echo "usage: $0 [gpu|cpu|debug] [phi2|orca|build]"
-    exit 1
 fi
 
 case "$1" in
@@ -48,10 +43,9 @@ case "$1" in
         ;;
     build )
         BUILD=1
-        REL=--release
         ;;
     * )
-        echo "try one of models: phi2, orca"
+        echo "usage: $0 [--cuda] [--debug] [phi2|orca|build] [rllm_args...]"
         exit 1
         ;;
 esac
diff --git a/rllm/src/llamacpp/loader.rs b/rllm/src/llamacpp/loader.rs
index dd4f5b02..2656518f 100644
--- a/rllm/src/llamacpp/loader.rs
+++ b/rllm/src/llamacpp/loader.rs
@@ -46,6 +46,8 @@ fn do_load(args: &mut LoaderArgs) -> Result<cpp::Model> {
     let mut mparams = cpp::ModelParams::default();
     // TODO: make this configurable
     mparams.set_split_mode(cpp::SplitMode::None);
+    // don't GPU offload on Intel Macs - it just fails there
+    #[cfg(not(all(target_os = "macos", target_arch = "x86_64")))]
     mparams.n_gpu_layers = 1000;
 
     let m = cpp::Model::from_file(file.to_str().unwrap(), mparams)?;
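
For reviewers trying this out locally: a quick sketch of the command-line surface after this patch. The flags, targets, and defaults below are read off the revised `cpp-server.sh` above; the `cd` path follows the READMEs.

```bash
cd cpp-rllm

./cpp-server.sh phi2                  # default: --release build, CPU-only (--no-default-features)
./cpp-server.sh --cuda phi2           # release build with the cuda feature enabled
./cpp-server.sh --debug phi2          # drop --release for a debug build
./cpp-server.sh --cuda --debug phi2   # flags compose, in this order only
./cpp-server.sh build                 # build only (BUILD=1), now also in release mode by default
```

Note the parsing order: the script consumes `--cuda` before `--debug`, so `./cpp-server.sh --debug --cuda phi2` falls through to the usage error rather than enabling CUDA.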