diff --git a/.github/workflows/binaries.yml b/.github/workflows/binaries.yml index 210508c..bdaddc6 100644 --- a/.github/workflows/binaries.yml +++ b/.github/workflows/binaries.yml @@ -14,9 +14,15 @@ on: - ".github/workflows/binaries.yml" workflow_dispatch: +env: + PROJECT_NAME: "candlex" + PROJECT_DIR: "native/candlex" + PROJECT_VERSION: "0.1.2" + NIF_VERSION: "2.16" + jobs: - build_binary: - name: ${{ matrix.target }} / ${{ matrix.os }} + build_cpu: + name: cpu / ${{ matrix.target }} / ${{ matrix.os }} runs-on: ${{ matrix.os }} permissions: contents: write @@ -40,12 +46,53 @@ jobs: - uses: philss/rustler-precompiled-action@main id: precompile with: - project-dir: "native/candlex" - project-name: candlex - project-version: "0.1.2" + project-dir: ${{ env.PROJECT_DIR }} + project-name: ${{ env.PROJECT_NAME }} + project-version: ${{ env.PROJECT_VERSION }} target: ${{ matrix.target }} use-cross: ${{ matrix.use-cross }} - nif-version: "2.16" + nif-version: ${{ env.NIF_VERSION }} + + - uses: softprops/action-gh-release@v1 + with: + draft: true + files: ${{ steps.precompile.outputs.file-path }} + if: startsWith(github.ref, 'refs/tags/') + + build_cuda: + name: cuda / ${{ matrix.target }} / ${{ matrix.os }} + runs-on: ubuntu-22.04 + permissions: + contents: write + strategy: + fail-fast: false + matrix: + include: + - target: x86_64-unknown-linux-gnu + os: ubuntu-22.04 + + container: + image: nvidia/cuda:12.2.2-devel-ubuntu22.04 + + steps: + - run: apt update && apt install -y curl git + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - run: rustup target add ${{ matrix.target }} + + - uses: philss/rustler-precompiled-action@main + id: precompile + env: + CUDA_COMPUTE_CAP: "70" + with: + project-dir: ${{ env.PROJECT_DIR }} + project-name: ${{ env.PROJECT_NAME }} + project-version: ${{ env.PROJECT_VERSION }} + target: ${{ matrix.target }} + use-cross: null + nif-version: ${{ env.NIF_VERSION }} + variant: cuda + cargo-args: "--features cuda" - uses: 
softprops/action-gh-release@v1 with: diff --git a/README.md b/README.md index 4805cf5..710c6cb 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,19 @@ if no precompiled binary is available for your target environment. Once set, you must run `mix deps.clean candlex --build` explicitly to force to recompile. Building has a number of dependencies, see *Building from source* below. +#### `CANDLEX_NIF_TARGET` + +The default value is `cpu`, which implies the final binary supports targeting +only the host CPU. + +| Value | Target environment | +| --- | --- | +| cpu | | +| cuda | CUDA 12.x | + +To use Candlex with an NVIDIA GPU you need [CUDA](https://developer.nvidia.com/cuda-downloads) compatible with your +GPU drivers. + ## Building from source To build the native binary locally you need to set `CANDLEX_NIF_BUILD=true`. @@ -58,11 +71,16 @@ You will need the following installed in your system for the compilation: * [Git](https://git-scm.com) for fetching candle-core source * [Rust](https://www.rust-lang.org) with cargo to compile rustler NIFs +### GPU support + +To build the native binary with GPU support, you need to run in an environment that has CUDA installed, +then you can build with `CANDLEX_NIF_TARGET=cuda`. See the `CANDLEX_NIF_TARGET` section for more details. + ## Releasing To publish a new version of this package: -1. Update `@version` in `mix.exs` and `project-version` in `.github/workflows/binaries.yml`. +1. Update `@version` in `mix.exs` and `PROJECT_VERSION` in `.github/workflows/binaries.yml`. 1. `git tag -s <tag>` to create new signed tag. 1. `git push origin <tag>` to push the tag. 1. Wait for the `binaries.yml` GitHub workflow to build all the NIF binaries. 
diff --git a/config/config.exs b/config/config.exs index aecd8ea..aff71b9 100644 --- a/config/config.exs +++ b/config/config.exs @@ -1,17 +1,3 @@ import Config -enable_cuda = - case System.get_env("CUDA") do - nil -> System.find_executable("nvcc") && System.find_executable("nvidia-smi") - "false" -> false - _ -> true - end - -crate_features = - if enable_cuda do - [:cuda] - else - [] - end - -config :candlex, crate_features: crate_features +config :candlex, use_cuda: System.get_env("CANDLEX_NIF_TARGET") == "cuda" diff --git a/lib/candlex/native.ex b/lib/candlex/native.ex index 902bc89..b85f0b6 100644 --- a/lib/candlex/native.ex +++ b/lib/candlex/native.ex @@ -8,7 +8,7 @@ defmodule Candlex.Native do use RustlerPrecompiled, otp_app: :candlex, - features: Application.compile_env(:candlex, :crate_features, []), + features: if(Application.compile_env(:candlex, :use_cuda), do: [:cuda], else: []), base_url: "#{source_url}/releases/download/v#{version}", force_build: System.get_env("CANDLEX_NIF_BUILD") in ["1", "true"], mode: mode, @@ -19,7 +19,10 @@ defmodule Candlex.Native do "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu" - ] + ], + variants: %{ + "x86_64-unknown-linux-gnu" => [cuda: fn -> Application.compile_env(:candlex, :use_cuda) end] + } # Rustler will override all the below stub functions with real NIFs def from_binary(_binary, _dtype, _shape, _device), do: error() diff --git a/native/candlex/build.rs b/native/candlex/build.rs index 86a9c61..33c4b9d 100644 --- a/native/candlex/build.rs +++ b/native/candlex/build.rs @@ -180,13 +180,21 @@ fn set_cuda_include_dir() -> Result<()> { #[allow(unused)] fn compute_cap() -> Result<usize> { - // Grab compute code from nvidia-smi - let mut compute_cap = { + println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP"); + + // Try to parse compute caps from env + let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") { + 
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}"); + compute_cap_str + .parse::<usize>() + .context("Could not parse code")? + } else { + // Use nvidia-smi to get the current compute cap let out = std::process::Command::new("nvidia-smi") - .arg("--query-gpu=compute_cap") - .arg("--format=csv") - .output() - .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?; + .arg("--query-gpu=compute_cap") + .arg("--format=csv") + .output() + .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?; let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?; let mut lines = out.lines(); assert_eq!( @@ -197,16 +205,19 @@ fn compute_cap() -> Result<usize> { .next() .context("missing line in stdout")? .replace('.', ""); - cap.parse::<usize>() - .with_context(|| format!("cannot parse as int {cap}"))? + let cap = cap + .parse::<usize>() + .with_context(|| format!("cannot parse as int {cap}"))?; + println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}"); + cap }; // Grab available GPU codes from nvcc and select the highest one - let max_nvcc_code = { + let (supported_nvcc_codes, max_nvcc_code) = { let out = std::process::Command::new("nvcc") - .arg("--list-gpu-code") - .output() - .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH."); + .arg("--list-gpu-code") + .output() + .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH."); let out = std::str::from_utf8(&out.stdout).unwrap(); let out = out.lines().collect::<Vec<&str>>(); @@ -220,31 +231,21 @@ fn compute_cap() -> Result<usize> { } } codes.sort(); - if !codes.contains(&compute_cap) { - anyhow::bail!( - "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}." 
- ); - } - *codes.last().unwrap() + let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?; + (codes, max_nvcc_code) }; - // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc, - // then choose the highest gpu code in nvcc + // Check that nvcc supports the asked compute caps + if !supported_nvcc_codes.contains(&compute_cap) { + anyhow::bail!( + "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}." + ); + } if compute_cap > max_nvcc_code { - println!( - "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}." - ); - compute_cap = max_nvcc_code; + anyhow::bail!( + "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}" + ); } - println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP"); - - if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") { - compute_cap = compute_cap_str - .parse::<usize>() - .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?; - println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP"); - } - println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}"); Ok(compute_cap) }