From 7df12c0dd913a4257934fe9a7233ade30382bcd3 Mon Sep 17 00:00:00 2001 From: Corey Lowman Date: Wed, 25 Oct 2023 12:52:01 -0400 Subject: [PATCH] [Breaking] Combining separate device errors into single `dfdx::tensor::Error` enum (#875) * [Breaking] Adding single Error enum * Fixing example * Fixing cuda kernels * Fixing no-std --- dfdx-core/src/nn_traits/mod.rs | 56 ++++--------- dfdx-core/src/nn_traits/tuples.rs | 21 +++-- dfdx-core/src/nn_traits/vecs.rs | 22 +++-- dfdx-core/src/tensor/cpu/allocate.rs | 39 ++++----- dfdx-core/src/tensor/cpu/device.rs | 36 ++------ dfdx-core/src/tensor/cpu/mod.rs | 2 +- dfdx-core/src/tensor/cuda/allocate.rs | 26 +++--- dfdx-core/src/tensor/cuda/device.rs | 61 ++++---------- dfdx-core/src/tensor/cuda/mod.rs | 2 +- dfdx-core/src/tensor/error.rs | 28 +++++++ dfdx-core/src/tensor/ghost.rs | 6 +- dfdx-core/src/tensor/gradients.rs | 20 ++--- dfdx-core/src/tensor/mod.rs | 8 +- dfdx-core/src/tensor/storage_traits.rs | 82 +++++++++---------- dfdx-core/src/tensor/tensor_impls.rs | 10 +-- dfdx-core/src/tensor/tensorlike.rs | 8 +- dfdx-core/src/tensor_ops/abs/mod.rs | 2 +- dfdx-core/src/tensor_ops/accurate_gelu/mod.rs | 2 +- dfdx-core/src/tensor_ops/adam/cpu_kernel.rs | 6 +- dfdx-core/src/tensor_ops/adam/cuda_kernel.rs | 4 +- dfdx-core/src/tensor_ops/adam/mod.rs | 6 +- dfdx-core/src/tensor_ops/add/mod.rs | 10 +-- .../attention_reshape/cpu_kernel.rs | 2 +- .../attention_reshape/cuda_kernel.rs | 2 +- .../src/tensor_ops/attention_reshape/mod.rs | 6 +- dfdx-core/src/tensor_ops/axpy/cpu_kernel.rs | 13 ++- dfdx-core/src/tensor_ops/axpy/cuda_kernel.rs | 10 +-- dfdx-core/src/tensor_ops/axpy/mod.rs | 7 +- dfdx-core/src/tensor_ops/bce/mod.rs | 5 +- .../src/tensor_ops/boolean/cpu_kernels.rs | 15 ++-- .../src/tensor_ops/boolean/cuda_kernels.rs | 15 ++-- dfdx-core/src/tensor_ops/boolean/mod.rs | 19 ++--- dfdx-core/src/tensor_ops/broadcast_to.rs | 8 +- dfdx-core/src/tensor_ops/choose/cpu_kernel.rs | 6 +- .../src/tensor_ops/choose/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/choose/mod.rs | 12 +-- dfdx-core/src/tensor_ops/clamp/mod.rs | 6 +- dfdx-core/src/tensor_ops/cmp/cpu_kernels.rs | 6 +- dfdx-core/src/tensor_ops/cmp/cuda_kernels.rs | 6 +- dfdx-core/src/tensor_ops/cmp/mod.rs | 24 +++--- dfdx-core/src/tensor_ops/concat/cpu_kernel.rs | 6 +- .../src/tensor_ops/concat/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/concat/mod.rs | 10 +-- .../src/tensor_ops/concat_along/cpu_kernel.rs | 4 +- .../tensor_ops/concat_along/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/concat_along/mod.rs | 16 ++-- dfdx-core/src/tensor_ops/conv1d/cpu_kernel.rs | 10 +-- .../src/tensor_ops/conv1d/cuda_kernel.rs | 8 +- dfdx-core/src/tensor_ops/conv1d/mod.rs | 23 ++---- dfdx-core/src/tensor_ops/conv2d/cpu_kernel.rs | 10 +-- .../src/tensor_ops/conv2d/cuda_kernel.rs | 8 +- .../src/tensor_ops/conv2d/cudnn_kernel.rs | 8 +- dfdx-core/src/tensor_ops/conv2d/mod.rs | 21 ++--- .../src/tensor_ops/convtrans2d/cpu_kernel.rs | 12 +-- .../src/tensor_ops/convtrans2d/cuda_kernel.rs | 8 +- dfdx-core/src/tensor_ops/convtrans2d/mod.rs | 21 ++--- dfdx-core/src/tensor_ops/cos/mod.rs | 2 +- dfdx-core/src/tensor_ops/div/mod.rs | 8 +- .../src/tensor_ops/dropout/cpu_kernel.rs | 6 +- .../src/tensor_ops/dropout/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/dropout/mod.rs | 11 +-- dfdx-core/src/tensor_ops/exp/mod.rs | 2 +- dfdx-core/src/tensor_ops/fast_gelu/mod.rs | 4 +- dfdx-core/src/tensor_ops/huber_error/mod.rs | 2 +- dfdx-core/src/tensor_ops/ln/mod.rs | 2 +- dfdx-core/src/tensor_ops/log_softmax.rs | 2 +- 
dfdx-core/src/tensor_ops/logsumexp_to.rs | 6 +- dfdx-core/src/tensor_ops/matmul/cpu_kernel.rs | 18 ++-- .../src/tensor_ops/matmul/cuda_kernel.rs | 18 ++-- dfdx-core/src/tensor_ops/matmul/mod.rs | 42 +++++----- dfdx-core/src/tensor_ops/max_to/cpu_kernel.rs | 6 +- .../src/tensor_ops/max_to/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/max_to/mod.rs | 10 +-- dfdx-core/src/tensor_ops/maximum/mod.rs | 5 +- dfdx-core/src/tensor_ops/mean_to.rs | 6 +- dfdx-core/src/tensor_ops/min_to/cpu_kernel.rs | 6 +- .../src/tensor_ops/min_to/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/min_to/mod.rs | 10 +-- dfdx-core/src/tensor_ops/minimum/mod.rs | 5 +- dfdx-core/src/tensor_ops/mul/mod.rs | 8 +- dfdx-core/src/tensor_ops/nans_to/mod.rs | 2 +- dfdx-core/src/tensor_ops/negate/mod.rs | 2 +- dfdx-core/src/tensor_ops/normalize.rs | 7 +- dfdx-core/src/tensor_ops/permute_to.rs | 6 +- dfdx-core/src/tensor_ops/pool2d/cpu_kernel.rs | 6 +- .../src/tensor_ops/pool2d/cuda_kernel.rs | 8 +- dfdx-core/src/tensor_ops/pool2d/mod.rs | 21 ++--- dfdx-core/src/tensor_ops/pow/cuda_kernel.rs | 4 +- dfdx-core/src/tensor_ops/pow/mod.rs | 4 +- dfdx-core/src/tensor_ops/prelu.rs | 8 +- dfdx-core/src/tensor_ops/realize_to.rs | 2 +- dfdx-core/src/tensor_ops/recip/mod.rs | 2 +- dfdx-core/src/tensor_ops/relu/mod.rs | 2 +- .../src/tensor_ops/reshape_to/cpu_kernel.rs | 6 +- .../src/tensor_ops/reshape_to/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/reshape_to/mod.rs | 14 ++-- .../src/tensor_ops/rmsprop/cpu_kernel.rs | 6 +- .../src/tensor_ops/rmsprop/cuda_kernel.rs | 4 +- dfdx-core/src/tensor_ops/rmsprop/mod.rs | 4 +- dfdx-core/src/tensor_ops/roll/cpu_kernel.rs | 4 +- dfdx-core/src/tensor_ops/roll/cuda_kernel.rs | 4 +- dfdx-core/src/tensor_ops/roll/mod.rs | 13 +-- .../select_and_gather/cpu_kernel.rs | 10 +-- .../select_and_gather/cuda_kernel.rs | 10 +-- .../src/tensor_ops/select_and_gather/mod.rs | 20 ++--- dfdx-core/src/tensor_ops/sgd/cpu_kernel.rs | 6 +- dfdx-core/src/tensor_ops/sgd/cuda_kernel.rs | 4 +- dfdx-core/src/tensor_ops/sgd/mod.rs | 6 +- dfdx-core/src/tensor_ops/sigmoid/mod.rs | 2 +- dfdx-core/src/tensor_ops/sin/mod.rs | 2 +- dfdx-core/src/tensor_ops/slice/cpu_kernel.rs | 4 +- dfdx-core/src/tensor_ops/slice/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/slice/mod.rs | 9 +- dfdx-core/src/tensor_ops/softmax.rs | 2 +- dfdx-core/src/tensor_ops/sqrt/mod.rs | 2 +- dfdx-core/src/tensor_ops/square/mod.rs | 2 +- dfdx-core/src/tensor_ops/stack/cpu_kernel.rs | 6 +- dfdx-core/src/tensor_ops/stack/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/stack/mod.rs | 24 ++---- dfdx-core/src/tensor_ops/stddev_to.rs | 6 +- dfdx-core/src/tensor_ops/sub/mod.rs | 8 +- dfdx-core/src/tensor_ops/sum_to/cpu_kernel.rs | 10 +-- .../src/tensor_ops/sum_to/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/sum_to/mod.rs | 10 +-- dfdx-core/src/tensor_ops/tanh/mod.rs | 2 +- .../src/tensor_ops/to_dtype/cpu_kernel.rs | 4 +- .../src/tensor_ops/to_dtype/cuda_kernel.rs | 4 +- dfdx-core/src/tensor_ops/to_dtype/mod.rs | 6 +- dfdx-core/src/tensor_ops/tri.rs | 18 ++-- .../src/tensor_ops/upscale2d/cpu_kernel.rs | 10 +-- .../src/tensor_ops/upscale2d/cuda_kernel.rs | 6 +- dfdx-core/src/tensor_ops/upscale2d/mod.rs | 20 ++--- .../src/tensor_ops/utilities/backward.rs | 8 +- .../src/tensor_ops/utilities/cpu_kernels.rs | 10 +-- .../src/tensor_ops/utilities/cuda_kernels.rs | 8 +- dfdx-core/src/tensor_ops/utilities/ops.rs | 14 ++-- dfdx-core/src/tensor_ops/var_to.rs | 6 +- dfdx-derives/src/lib.rs | 42 ++++------ dfdx/examples/advanced-train-loop.rs | 6 +- dfdx/src/nn/layers/abs.rs | 3 +- 
dfdx/src/nn/layers/add_into.rs | 11 ++- dfdx/src/nn/layers/batch_norm1d.rs | 21 ++--- dfdx/src/nn/layers/batch_norm2d.rs | 24 ++---- dfdx/src/nn/layers/bias1d.rs | 16 ++-- dfdx/src/nn/layers/bias2d.rs | 10 +-- dfdx/src/nn/layers/conv1d.rs | 13 +-- dfdx/src/nn/layers/conv2d.rs | 11 +-- dfdx/src/nn/layers/conv_trans2d.rs | 11 +-- dfdx/src/nn/layers/cos.rs | 3 +- dfdx/src/nn/layers/dropout.rs | 10 +-- dfdx/src/nn/layers/embedding.rs | 13 +-- dfdx/src/nn/layers/exp.rs | 3 +- dfdx/src/nn/layers/flatten2d.rs | 9 +- dfdx/src/nn/layers/gelu.rs | 6 +- dfdx/src/nn/layers/generalized_add.rs | 12 ++- dfdx/src/nn/layers/generalized_mul.rs | 12 ++- dfdx/src/nn/layers/layer_norm1d.rs | 16 ++-- dfdx/src/nn/layers/leaky_relu.rs | 3 +- dfdx/src/nn/layers/linear.rs | 12 ++- dfdx/src/nn/layers/ln.rs | 3 +- dfdx/src/nn/layers/log_softmax.rs | 3 +- dfdx/src/nn/layers/matmul.rs | 9 +- dfdx/src/nn/layers/multi_head_attention.rs | 11 +-- dfdx/src/nn/layers/pool_2d_avg.rs | 3 +- dfdx/src/nn/layers/pool_2d_max.rs | 4 +- dfdx/src/nn/layers/pool_2d_min.rs | 4 +- dfdx/src/nn/layers/pool_global_avg.rs | 14 ++-- dfdx/src/nn/layers/pool_global_max.rs | 14 ++-- dfdx/src/nn/layers/pool_global_min.rs | 14 ++-- dfdx/src/nn/layers/prelu.rs | 8 +- dfdx/src/nn/layers/prelu1d.rs | 20 ++--- dfdx/src/nn/layers/relu.rs | 3 +- dfdx/src/nn/layers/reshape.rs | 3 +- dfdx/src/nn/layers/residual_add.rs | 10 +-- dfdx/src/nn/layers/residual_mul.rs | 10 +-- dfdx/src/nn/layers/sigmoid.rs | 3 +- dfdx/src/nn/layers/sin.rs | 3 +- dfdx/src/nn/layers/softmax.rs | 3 +- dfdx/src/nn/layers/split_into.rs | 11 +-- dfdx/src/nn/layers/sqrt.rs | 3 +- dfdx/src/nn/layers/square.rs | 3 +- dfdx/src/nn/layers/tanh.rs | 3 +- dfdx/src/nn/layers/transformer.rs | 21 ++--- dfdx/src/nn/layers/upscale2d.rs | 12 +-- dfdx/src/nn/optim/adam.rs | 18 ++-- dfdx/src/nn/optim/mod.rs | 2 +- dfdx/src/nn/optim/rmsprop.rs | 18 ++-- dfdx/src/nn/optim/sgd.rs | 2 +- 188 files changed, 844 insertions(+), 1069 deletions(-) create mode 100644 dfdx-core/src/tensor/error.rs diff --git a/dfdx-core/src/nn_traits/mod.rs b/dfdx-core/src/nn_traits/mod.rs index 204d82dff..20c55da23 100644 --- a/dfdx-core/src/nn_traits/mod.rs +++ b/dfdx-core/src/nn_traits/mod.rs @@ -3,17 +3,16 @@ mod vecs; use std::vec::Vec; -use crate::prelude::{Device, Dtype, Gradients, Shape, Tensor, UniqueId}; +use crate::prelude::{Device, Dtype, Error, Gradients, Shape, Tensor, UniqueId}; /// Mutable & Immutable forward of `Input` that produces [Module::Output]. pub trait Module { /// The type that this unit produces given `Input`. type Output; - type Error: std::fmt::Debug; - fn try_forward(&self, x: X) -> Result; + fn try_forward(&self, x: X) -> Result; - fn try_forward_mut(&mut self, x: X) -> Result { + fn try_forward_mut(&mut self, x: X) -> Result { self.try_forward(x) } @@ -26,27 +25,6 @@ pub trait Module { } } -/// An error indicating that a parameter was not used in gradient -/// computation, and was therefore not present in [Gradients] -/// during an update. -#[derive(Debug)] -pub enum OptimizerUpdateError { - UnusedTensors(Vec), - DeviceError(Err), -} - -impl std::fmt::Display for OptimizerUpdateError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::UnusedTensors(unused) => write!(f, "Unused tensors: {unused:?}"), - Self::DeviceError(err) => write!(f, "{err}"), - } - } -} - -#[cfg(feature = "std")] -impl std::error::Error for OptimizerUpdateError {} - /// Something that can update both tensors and a [UpdateParams]. 
At minimum [Optimizer::update_tensor()] must be implemented. pub trait Optimizer>: Sized { fn update_tensor( @@ -54,24 +32,18 @@ pub trait Optimizer>: Sized { t: &mut Tensor, gradients: &Gradients, missing_tensors: &mut Vec, - ) -> Result<(), D::Err>; + ) -> Result<(), Error>; - fn update( - &mut self, - module: &mut M, - gradients: &Gradients, - ) -> Result<(), OptimizerUpdateError> + fn update(&mut self, module: &mut M, gradients: &Gradients) -> Result<(), Error> where M: UpdateParams, { let mut missing_tensors = Vec::new(); - module - .try_update_params(self, gradients, &mut missing_tensors) - .map_err(OptimizerUpdateError::DeviceError)?; + module.try_update_params(self, gradients, &mut missing_tensors)?; if missing_tensors.is_empty() { Ok(()) } else { - Err(OptimizerUpdateError::UnusedTensors(missing_tensors)) + Err(Error::UnusedTensors(missing_tensors)) } } } @@ -82,7 +54,7 @@ pub trait BuildOnDevice>: Clone { fn build_on_device(&self, device: &D) -> Self::Built { self.try_build_on_device(device).unwrap() } - fn try_build_on_device(&self, device: &D) -> Result; + fn try_build_on_device(&self, device: &D) -> Result; } /// Something that can have all of its parameters reset to a specific state (may be random or not random). @@ -90,7 +62,7 @@ pub trait ResetParams> { fn reset_params(&mut self) { self.try_reset_params().unwrap() } - fn try_reset_params(&mut self) -> Result<(), D::Err>; + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error>; } /// Something that can have it's params updated with an [Optimizer] and a set of [Gradients]. @@ -109,7 +81,7 @@ pub trait UpdateParams> { optimizer: &mut Optim, gradients: &Gradients, missing_tensors: &mut Vec, - ) -> Result<(), D::Err>; + ) -> Result<(), crate::tensor::Error>; } impl> UpdateParams for Tensor { @@ -118,7 +90,7 @@ impl> UpdateParams for Tensor { optimizer: &mut Optim, gradients: &Gradients, missing_tensors: &mut Vec, - ) -> Result<(), ::Err> { + ) -> Result<(), crate::tensor::Error> { optimizer.update_tensor(self, gradients, missing_tensors) } } @@ -128,12 +100,12 @@ pub trait ZeroGrads> { fn zero_grads(&self, grads: &mut Gradients) { self.try_zero_grads(grads).unwrap() } - fn try_zero_grads(&self, grads: &mut Gradients) -> Result<(), D::Err>; + fn try_zero_grads(&self, grads: &mut Gradients) -> Result<(), crate::tensor::Error>; fn alloc_grads(&self) -> Gradients { self.try_alloc_grads().unwrap() } - fn try_alloc_grads(&self) -> Result, D::Err> { + fn try_alloc_grads(&self) -> Result, crate::tensor::Error> { let mut grads = Gradients::leaky(); self.try_zero_grads(&mut grads)?; grads.retain_current_grads_as_leafs(); @@ -275,7 +247,7 @@ pub trait BuildModuleExt: Sized { self.try_build_module(m).unwrap() } - fn try_build_module(&self, m: M) -> Result + fn try_build_module(&self, m: M) -> Result where M: BuildOnDevice, M::Built: ResetParams, diff --git a/dfdx-core/src/nn_traits/tuples.rs b/dfdx-core/src/nn_traits/tuples.rs index 7cb587768..97e8c7deb 100644 --- a/dfdx-core/src/nn_traits/tuples.rs +++ b/dfdx-core/src/nn_traits/tuples.rs @@ -1,4 +1,8 @@ -use crate::{dtypes::Dtype, tensor::UniqueId, tensor_ops::Device}; +use crate::{ + dtypes::Dtype, + tensor::{Error, UniqueId}, + tensor_ops::Device, +}; use std::vec::Vec; @@ -7,7 +11,7 @@ macro_rules! 
tuple_impls { impl, Elem: Dtype, $($name: crate::nn_traits::BuildOnDevice),+> crate::nn_traits::BuildOnDevice for ($($name,)+) { type Built = ($($name::Built, )+); - fn try_build_on_device(&self, device: &Dev) -> Result { + fn try_build_on_device(&self, device: &Dev) -> Result { Ok(($( self.$idx.try_build_on_device(device)?, )+)) @@ -38,7 +42,7 @@ macro_rules! tuple_impls { } impl, Elem: Dtype, $($name: crate::nn_traits::ResetParams),+> crate::nn_traits::ResetParams for ($($name,)+) { - fn try_reset_params(&mut self) -> Result<(), Dev::Err> { + fn try_reset_params(&mut self) -> Result<(), Error> { $(self.$idx.try_reset_params()?;)+ Ok(()) } @@ -50,14 +54,14 @@ macro_rules! tuple_impls { optimizer: &mut Optim, gradients: &crate::prelude::Gradients, missing_tensors: &mut Vec, - ) -> Result<(), Dev::Err> { + ) -> Result<(), Error> { $(self.$idx.try_update_params(optimizer, gradients, missing_tensors)?;)+ Ok(()) } } impl, Elem: Dtype, $($name: crate::nn_traits::ZeroGrads),+> crate::nn_traits::ZeroGrads for ($($name,)+) { - fn try_zero_grads(&self, grads: &mut crate::prelude::Gradients) -> Result<(), Dev::Err> { + fn try_zero_grads(&self, grads: &mut crate::prelude::Gradients) -> Result<(), Error> { $(self.$idx.try_zero_grads(grads)?;)+ Ok(()) } @@ -91,20 +95,19 @@ macro_rules! tuple_impls { impl< Input, $last: - $(crate::nn_traits::Module::<$rev_tail ::Output, Error=$rev_tail::Error>, $rev_tail: )* + $(crate::nn_traits::Module::<$rev_tail ::Output>, $rev_tail: )* crate::nn_traits::Module > crate::nn_traits::Module for ($($name,)+) { type Output = $last ::Output; - type Error = $last ::Error; /// Calls forward sequentially on each module in the tuple. - fn try_forward(&self, x: Input) -> Result { + fn try_forward(&self, x: Input) -> Result { $(let x = self.$idx.try_forward(x)?;)+ Ok(x) } /// Calls forward sequentially on each module in the tuple. 
- fn try_forward_mut(&mut self, x: Input) -> Result { + fn try_forward_mut(&mut self, x: Input) -> Result { $(let x = self.$idx.try_forward_mut(x)?;)+ Ok(x) } diff --git a/dfdx-core/src/nn_traits/vecs.rs b/dfdx-core/src/nn_traits/vecs.rs index e413bc47c..803a07d8a 100644 --- a/dfdx-core/src/nn_traits/vecs.rs +++ b/dfdx-core/src/nn_traits/vecs.rs @@ -1,4 +1,8 @@ -use crate::{dtypes::Dtype, tensor::UniqueId, tensor_ops::Device}; +use crate::{ + dtypes::Dtype, + tensor::{Error, UniqueId}, + tensor_ops::Device, +}; use std::vec::Vec; @@ -6,7 +10,7 @@ impl, T: crate::nn_traits::BuildOnDevice> crate::nn_traits::BuildOnDevice for Vec { type Built = Vec; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { self.iter() .map(|m_i| m_i.try_build_on_device(device)) .collect() @@ -16,7 +20,7 @@ impl, T: crate::nn_traits::BuildOnDevice> impl, T: crate::nn_traits::ResetParams> crate::nn_traits::ResetParams for Vec { - fn try_reset_params(&mut self) -> Result<(), ::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { for m_i in self.iter_mut() { m_i.try_reset_params()?; } @@ -32,7 +36,7 @@ impl, T: crate::nn_traits::UpdateParams> optimizer: &mut Optim, gradients: &crate::tensor::Gradients, missing_tensors: &mut Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { for m_i in self.iter_mut() { m_i.try_update_params(optimizer, gradients, missing_tensors)?; } @@ -43,7 +47,10 @@ impl, T: crate::nn_traits::UpdateParams> impl, T: crate::nn_traits::ZeroGrads> crate::nn_traits::ZeroGrads for Vec { - fn try_zero_grads(&self, grads: &mut crate::tensor::Gradients) -> Result<(), ::Err> { + fn try_zero_grads( + &self, + grads: &mut crate::tensor::Gradients, + ) -> Result<(), crate::tensor::Error> { for m_i in self.iter() { m_i.try_zero_grads(grads)?; } @@ -82,15 +89,14 @@ impl> crate::nn_traits for Vec { type Output = T::Output; - type Error = T::Error; - fn try_forward(&self, mut x: Input) -> Result { + fn try_forward(&self, mut x: Input) -> Result { for m_i in self.iter() { x = m_i.try_forward(x)?; } Ok(x) } - fn try_forward_mut(&mut self, mut x: Input) -> Result { + fn try_forward_mut(&mut self, mut x: Input) -> Result { for m_i in self.iter_mut() { x = m_i.try_forward_mut(x)?; } diff --git a/dfdx-core/src/tensor/cpu/allocate.rs b/dfdx-core/src/tensor/cpu/allocate.rs index be134f127..cf93623b7 100644 --- a/dfdx-core/src/tensor/cpu/allocate.rs +++ b/dfdx-core/src/tensor/cpu/allocate.rs @@ -2,20 +2,17 @@ use crate::{ shapes::*, - tensor::{masks::triangle_mask, storage_traits::*, unique_id, Tensor}, + tensor::{masks::triangle_mask, storage_traits::*, unique_id, Error, Tensor}, }; -use super::{CachableVec, Cpu, CpuError, LendingIterator}; +use super::{CachableVec, Cpu, LendingIterator}; use rand::{distributions::Distribution, Rng}; use std::{sync::Arc, vec::Vec}; impl Cpu { #[inline] - pub(crate) fn try_alloc_zeros( - &self, - numel: usize, - ) -> Result, CpuError> { + pub(crate) fn try_alloc_zeros(&self, numel: usize) -> Result, Error> { self.try_alloc_elem::(numel, Default::default()) } @@ -24,14 +21,14 @@ impl Cpu { &self, numel: usize, elem: E, - ) -> Result, CpuError> { - let data = self.cache.try_pop::(numel).map_or_else( + ) -> Result, Error> { + let data: Result, Error> = self.cache.try_pop::(numel).map_or_else( #[cfg(feature = "fast-alloc")] || Ok(std::vec![elem; numel]), #[cfg(not(feature = "fast-alloc"))] || { let mut data: Vec = Vec::new(); - data.try_reserve(numel).map_err(|_| 
CpuError::OutOfMemory)?; + data.try_reserve(numel).map_err(|_| Error::OutOfMemory)?; data.resize(numel, elem); Ok(data) }, @@ -48,17 +45,17 @@ impl Cpu { data.fill(elem); Ok(data) }, - )?; + ); Ok(CachableVec { - data, + data: data?, cache: self.cache.clone(), }) } } impl ZerosTensor for Cpu { - fn try_zeros_like(&self, src: &S) -> Result, Self::Err> { + fn try_zeros_like(&self, src: &S) -> Result, Error> { let shape = *src.shape(); let strides = shape.strides(); let data = self.try_alloc_zeros::(shape.num_elements())?; @@ -75,14 +72,14 @@ impl ZerosTensor for Cpu { } impl ZeroFillStorage for Cpu { - fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Self::Err> { + fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Error> { storage.fill(Default::default()); Ok(()) } } impl OnesTensor for Cpu { - fn try_ones_like(&self, src: &S) -> Result, Self::Err> { + fn try_ones_like(&self, src: &S) -> Result, Error> { let shape = *src.shape(); let strides = shape.strides(); let data = self.try_alloc_elem::(shape.num_elements(), E::ONE)?; @@ -104,7 +101,7 @@ impl TriangleTensor for Cpu { src: &S, val: E, diagonal: impl Into>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let shape = *src.shape(); let strides = shape.strides(); let mut data = self.try_alloc_elem::(shape.num_elements(), val)?; @@ -126,7 +123,7 @@ impl TriangleTensor for Cpu { src: &S, val: E, diagonal: impl Into>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let shape = *src.shape(); let strides = shape.strides(); let mut data = self.try_alloc_elem::(shape.num_elements(), val)?; @@ -145,7 +142,7 @@ impl TriangleTensor for Cpu { } impl OneFillStorage for Cpu { - fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Self::Err> { + fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Error> { storage.fill(E::ONE); Ok(()) } @@ -156,7 +153,7 @@ impl SampleTensor for Cpu { &self, src: &S, distr: D, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut tensor = self.try_zeros_like(src)?; { #[cfg(not(feature = "no-std"))] @@ -173,7 +170,7 @@ impl SampleTensor for Cpu { &self, storage: &mut Self::Vec, distr: D, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { { #[cfg(not(feature = "no-std"))] let mut rng = self.rng.lock().unwrap(); @@ -201,11 +198,11 @@ impl TensorFromVec for Cpu { &self, src: Vec, shape: S, - ) -> Result, Self::Err> { + ) -> Result, Error> { let num_elements = shape.num_elements(); if src.len() != num_elements { - Err(CpuError::WrongNumElements) + Err(Error::WrongNumElements) } else { let src = CachableVec { data: src, diff --git a/dfdx-core/src/tensor/cpu/device.rs b/dfdx-core/src/tensor/cpu/device.rs index ebc380dcd..d3ce936f1 100644 --- a/dfdx-core/src/tensor/cpu/device.rs +++ b/dfdx-core/src/tensor/cpu/device.rs @@ -1,5 +1,5 @@ use crate::shapes::{Shape, Unit}; -use crate::tensor::{cache::TensorCache, cpu::LendingIterator, storage_traits::*, Tensor}; +use crate::tensor::{cache::TensorCache, cpu::LendingIterator, storage_traits::*, Error, Tensor}; use rand::{rngs::StdRng, Rng, SeedableRng}; use std::{sync::Arc, vec::Vec}; @@ -47,30 +47,6 @@ impl Cpu { } } -#[derive(Debug, Clone, Copy)] -pub enum CpuError { - /// Device is out of memory - OutOfMemory, - /// Not enough elements were provided when creating a tensor - WrongNumElements, -} - -impl std::fmt::Display for CpuError { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - match self { - Self::OutOfMemory => f.write_str("CpuError::OutOfMemory"), - 
Self::WrongNumElements => f.write_str("CpuError::WrongNumElements"), - } - } -} - -#[cfg(feature = "std")] -impl std::error::Error for CpuError {} - -impl HasErr for Cpu { - type Err = CpuError; -} - /// A [Vec] that can be cloned without allocating new memory. /// When [Drop]ed it will insert it's data into the cache. #[derive(Debug)] @@ -154,7 +130,7 @@ impl RandomU64 for Cpu { impl Storage for Cpu { type Vec = CachableVec; - fn try_alloc_len(&self, len: usize) -> Result { + fn try_alloc_len(&self, len: usize) -> Result { self.try_alloc_zeros(len) } @@ -173,23 +149,23 @@ impl Storage for Cpu { } impl Synchronize for Cpu { - fn try_synchronize(&self) -> Result<(), Self::Err> { + fn try_synchronize(&self) -> Result<(), Error> { Ok(()) } } impl Cache for Cpu { - fn try_enable_cache(&self) -> Result<(), Self::Err> { + fn try_enable_cache(&self) -> Result<(), Error> { self.cache.enable(); Ok(()) } - fn try_disable_cache(&self) -> Result<(), Self::Err> { + fn try_disable_cache(&self) -> Result<(), Error> { self.cache.disable(); self.try_empty_cache() } - fn try_empty_cache(&self) -> Result<(), Self::Err> { + fn try_empty_cache(&self) -> Result<(), Error> { #[cfg(not(feature = "no-std"))] let mut cache = self.cache.allocations.write().unwrap(); #[cfg(feature = "no-std")] diff --git a/dfdx-core/src/tensor/cpu/mod.rs b/dfdx-core/src/tensor/cpu/mod.rs index fda5306ea..f69ef5dd0 100644 --- a/dfdx-core/src/tensor/cpu/mod.rs +++ b/dfdx-core/src/tensor/cpu/mod.rs @@ -7,7 +7,7 @@ pub(crate) use index::index_to_i; pub(crate) use iterate::{LendingIterator, NdIndex}; pub(crate) use device::CachableVec; -pub use device::{Cpu, CpuError}; +pub use device::Cpu; #[cfg(test)] mod tests { diff --git a/dfdx-core/src/tensor/cuda/allocate.rs b/dfdx-core/src/tensor/cuda/allocate.rs index fb688ae37..aa489f9ad 100644 --- a/dfdx-core/src/tensor/cuda/allocate.rs +++ b/dfdx-core/src/tensor/cuda/allocate.rs @@ -2,10 +2,10 @@ use crate::{ shapes::*, - tensor::{masks::triangle_mask, storage_traits::*, unique_id, Cpu, CpuError, NoneTape, Tensor}, + tensor::{masks::triangle_mask, storage_traits::*, unique_id, Cpu, Error, NoneTape, Tensor}, }; -use super::{device::CachableCudaSlice, Cuda, CudaError}; +use super::{device::CachableCudaSlice, Cuda}; use cudarc::driver::{CudaSlice, DeviceSlice}; use rand::Rng; @@ -16,7 +16,7 @@ impl Cuda { &self, shape: S, buf: Vec, - ) -> Result, CudaError> { + ) -> Result, Error> { let mut slice = unsafe { self.alloc_empty(buf.len()) }?; self.dev.htod_copy_into(buf, &mut slice)?; Ok(self.build_tensor(shape, shape.strides(), slice)) @@ -44,7 +44,7 @@ impl Cuda { } impl ZerosTensor for Cuda { - fn try_zeros_like(&self, src: &S) -> Result, Self::Err> { + fn try_zeros_like(&self, src: &S) -> Result, Error> { let shape = *src.shape(); let strides = shape.strides(); let mut data = unsafe { self.alloc_empty(shape.num_elements()) }?; @@ -54,7 +54,7 @@ impl ZerosTensor for Cuda { } impl ZeroFillStorage for Cuda { - fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Self::Err> { + fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Error> { self.dev.memset_zeros(&mut storage.data)?; Ok(()) } @@ -64,7 +64,7 @@ impl OnesTensor for Cuda where Cpu: OnesTensor, { - fn try_ones_like(&self, src: &S) -> Result, Self::Err> { + fn try_ones_like(&self, src: &S) -> Result, Error> { let shape = *src.shape(); let buf = std::vec![E::ONE; shape.num_elements()]; self.tensor_from_host_buf(shape, buf) @@ -80,7 +80,7 @@ where src: &S, val: E, diagonal: impl Into>, - ) -> Result, Self::Err> 
{ + ) -> Result, Error> { let shape = *src.shape(); let mut data = std::vec![val; shape.num_elements()]; let offset = diagonal.into().unwrap_or(0); @@ -93,7 +93,7 @@ where src: &S, val: E, diagonal: impl Into>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let shape = *src.shape(); let mut data = std::vec![val; shape.num_elements()]; let offset = diagonal.into().unwrap_or(0); @@ -103,7 +103,7 @@ where } impl OneFillStorage for Cuda { - fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Self::Err> { + fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Error> { self.dev .htod_copy_into(std::vec![E::ONE; storage.len()], &mut storage.data)?; Ok(()) @@ -118,7 +118,7 @@ where &self, src: &S, distr: D, - ) -> Result, Self::Err> { + ) -> Result, Error> { let shape = *src.shape(); let mut buf = Vec::with_capacity(shape.num_elements()); { @@ -134,7 +134,7 @@ where &self, storage: &mut Self::Vec, distr: D, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut buf = Vec::with_capacity(storage.len()); { #[cfg(not(feature = "no-std"))] @@ -180,11 +180,11 @@ impl TensorFromVec for Cuda { &self, src: Vec, shape: S, - ) -> Result, Self::Err> { + ) -> Result, Error> { let num_elements = shape.num_elements(); if src.len() != num_elements { - Err(CudaError::Cpu(CpuError::WrongNumElements)) + Err(Error::WrongNumElements) } else { self.tensor_from_host_buf(shape, src) } diff --git a/dfdx-core/src/tensor/cuda/device.rs b/dfdx-core/src/tensor/cuda/device.rs index 23a2887a4..156bdddd7 100644 --- a/dfdx-core/src/tensor/cuda/device.rs +++ b/dfdx-core/src/tensor/cuda/device.rs @@ -1,7 +1,7 @@ use crate::shapes::{Shape, Unit}; -use crate::tensor::cpu::{Cpu, CpuError}; +use crate::tensor::cpu::Cpu; use crate::tensor::{ - cache::TensorCache, Cache, HasErr, NoneTape, RandomU64, Storage, Synchronize, Tensor, + cache::TensorCache, Cache, Error, NoneTape, RandomU64, Storage, Synchronize, Tensor, }; use cudarc::driver::{DevicePtr, DevicePtrMut, DeviceRepr}; @@ -32,37 +32,22 @@ pub struct Cuda { pub(crate) cache: Arc>, } -#[derive(Debug)] -pub enum CudaError { - Blas(CublasError), - #[cfg(feature = "cudnn")] - Cudnn(cudarc::cudnn::CudnnError), - Driver(DriverError), - Cpu(CpuError), -} - -impl From for CudaError { - fn from(value: CpuError) -> Self { - Self::Cpu(value) - } -} - -impl From for CudaError { +impl From for Error { fn from(value: CublasError) -> Self { - Self::Blas(value) + Self::CublasError(value) } } -impl From for CudaError { +impl From for Error { fn from(value: DriverError) -> Self { - Self::Driver(value) + Self::CudaDriverError(value) } } #[cfg(feature = "cudnn")] -impl From for CudaError { +impl From for Error { fn from(value: cudarc::cudnn::CudnnError) -> Self { - Self::Cudnn(value) + Self::CudnnError(value) } } @@ -79,12 +64,12 @@ impl Cuda { } /// Constructs rng with the given seed. 
- pub fn try_seed_from_u64(seed: u64) -> Result { + pub fn try_seed_from_u64(seed: u64) -> Result { Self::try_build(0, seed) } /// Constructs with the given seed & device ordinal - pub fn try_build(ordinal: usize, seed: u64) -> Result { + pub fn try_build(ordinal: usize, seed: u64) -> Result { let cpu = Cpu::seed_from_u64(seed); let dev = CudaDevice::new(ordinal)?; let blas = Arc::new(CudaBlas::new(dev.clone())?); @@ -112,7 +97,7 @@ impl Cuda { pub(crate) unsafe fn alloc_empty( &self, len: usize, - ) -> Result, CudaError> { + ) -> Result, Error> { let data = self.cache.try_pop::(len).map_or_else( || self.dev.alloc::(len), |ptr| Ok(self.dev.upgrade_device_ptr(ptr, len)), @@ -123,7 +108,7 @@ impl Cuda { pub(crate) unsafe fn get_workspace( &self, len: usize, - ) -> Result>, CudaError> { + ) -> Result>, Error> { let num_bytes_required = len * std::mem::size_of::(); let mut workspace = self.workspace.as_ref().lock().unwrap(); @@ -137,16 +122,6 @@ impl Cuda { } } -impl std::fmt::Display for CudaError { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - write!(f, "{self:?}") - } -} - -impl HasErr for Cuda { - type Err = CudaError; -} - /// A [CudaSlice] that can be cloned without allocating new memory. /// When [Drop]ed it will insert it's data into the cache. #[derive(Debug)] @@ -246,17 +221,17 @@ impl RandomU64 for Cuda { } impl Cache for Cuda { - fn try_enable_cache(&self) -> Result<(), Self::Err> { + fn try_enable_cache(&self) -> Result<(), Error> { self.cache.enable(); Ok(()) } - fn try_disable_cache(&self) -> Result<(), Self::Err> { + fn try_disable_cache(&self) -> Result<(), Error> { self.cache.disable(); self.try_empty_cache() } - fn try_empty_cache(&self) -> Result<(), Self::Err> { + fn try_empty_cache(&self) -> Result<(), Error> { #[cfg(not(feature = "no-std"))] let mut cache = self.cache.allocations.write().unwrap(); #[cfg(feature = "no-std")] @@ -273,15 +248,15 @@ impl Cache for Cuda { } impl Synchronize for Cuda { - fn try_synchronize(&self) -> Result<(), CudaError> { - self.dev.synchronize().map_err(CudaError::from) + fn try_synchronize(&self) -> Result<(), Error> { + self.dev.synchronize().map_err(Error::from) } } impl Storage for Cuda { type Vec = CachableCudaSlice; - fn try_alloc_len(&self, len: usize) -> Result { + fn try_alloc_len(&self, len: usize) -> Result { let mut data = unsafe { self.alloc_empty(len) }?; self.dev.memset_zeros(&mut data)?; Ok(CachableCudaSlice { diff --git a/dfdx-core/src/tensor/cuda/mod.rs b/dfdx-core/src/tensor/cuda/mod.rs index 8b91d2ab3..a38149cfc 100644 --- a/dfdx-core/src/tensor/cuda/mod.rs +++ b/dfdx-core/src/tensor/cuda/mod.rs @@ -1,7 +1,7 @@ mod allocate; mod device; -pub use device::{Cuda, CudaError}; +pub use device::Cuda; pub(crate) fn launch_cfg(n: u32) -> cudarc::driver::LaunchConfig { let num_blocks = (n + NUM_THREADS - 1) / NUM_THREADS; diff --git a/dfdx-core/src/tensor/error.rs b/dfdx-core/src/tensor/error.rs new file mode 100644 index 000000000..f6b43c322 --- /dev/null +++ b/dfdx-core/src/tensor/error.rs @@ -0,0 +1,28 @@ +/// Represents a number of different errors that can occur from creating tensors +/// or launching tensor operations. This encompasses both Cpu and CUDA errors. +#[non_exhaustive] +#[derive(Debug)] +pub enum Error { + /// Device is out of memory + OutOfMemory, + /// Not enough elements were provided when creating a tensor + WrongNumElements, + /// Some tensors were unused by an optimizer in a graph. 
+ UnusedTensors(std::vec::Vec), + #[cfg(feature = "cuda")] + CublasError(cudarc::cublas::result::CublasError), + #[cfg(feature = "cuda")] + CudaDriverError(cudarc::driver::DriverError), + + #[cfg(feature = "cudnn")] + CudnnError(cudarc::cudnn::CudnnError), +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{self:?}") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} diff --git a/dfdx-core/src/tensor/ghost.rs b/dfdx-core/src/tensor/ghost.rs index c05854ab6..8c349a8fe 100644 --- a/dfdx-core/src/tensor/ghost.rs +++ b/dfdx-core/src/tensor/ghost.rs @@ -42,10 +42,6 @@ impl> Clone for GhostTensor { } } -impl> super::storage_traits::HasErr for GhostTensor { - type Err = D::Err; -} - impl> HasShape for GhostTensor { type WithShape = GhostTensor; type Shape = S; @@ -56,7 +52,7 @@ impl> HasShape for GhostTensor { impl> super::storage_traits::AllocGrad for GhostTensor { type Gradient = D::Vec; - fn try_alloc_grad(&self) -> Result { + fn try_alloc_grad(&self) -> Result { self.dev.try_alloc_len(self.len) } } diff --git a/dfdx-core/src/tensor/gradients.rs b/dfdx-core/src/tensor/gradients.rs index 99dc7a163..86974ec61 100644 --- a/dfdx-core/src/tensor/gradients.rs +++ b/dfdx-core/src/tensor/gradients.rs @@ -5,7 +5,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::{boxed::Box, vec::Vec}; use super::tensorlike::Tensorlike; -use super::{storage_traits::Storage, unique_id, Tensor, UniqueId}; +use super::{storage_traits::Storage, unique_id, Error, Tensor, UniqueId}; use crate::shapes::Shape; /// A generic container for keeping gradients of tensors keyed by the @@ -44,13 +44,13 @@ impl> Gradients { pub fn get_or_alloc_mut( &mut self, t: &impl Tensorlike, - ) -> Result<&mut D::Vec, D::Err> { + ) -> Result<&mut D::Vec, Error> { self.try_alloc_for(t)?; Ok(self.get_mut(t)) } /// Inserts a gradient for `t` - pub fn try_alloc_for(&mut self, t: &impl Tensorlike) -> Result<(), D::Err> { + pub fn try_alloc_for(&mut self, t: &impl Tensorlike) -> Result<(), Error> { if let std::collections::btree_map::Entry::Vacant(e) = self.gradient_by_id.entry(t.id()) { e.insert(t.try_alloc_grad()?); } @@ -179,7 +179,7 @@ impl> Gradients { pub struct OwnedTape> { /// A list of (Time, BackwardOp) pairs. The Time is used to ensure operations /// from merged tapes are executed in the correct order. - pub(crate) operations: Vec<(UniqueId, BackwardOp)>, + pub(crate) operations: Vec<(UniqueId, BackwardOp)>, pub(crate) gradients: Gradients, } @@ -214,7 +214,7 @@ impl> OwnedTape { /// Compute the [Gradients]! This just runs all the operations on a new [Gradients] struct. /// /// Note that this method takes ownership of self, so it can't be called twice! - pub(crate) fn execute(&mut self) -> Result, D::Err> { + pub(crate) fn execute(&mut self) -> Result, Error> { // We must ensure that the operations are sorted in execution time order. // Otherwise an backward operation may not be executed in the right order // if multiple tapes were merged together. @@ -228,7 +228,7 @@ impl> OwnedTape { } } -type BackwardOp = Box) -> Result<(), Err>>; +type BackwardOp = Box) -> Result<(), Error>>; /// Contains nothing. When [Tape::add_backward_op] is called, this struct does nothing. 
#[derive(Default, Debug, Clone, Copy)] @@ -240,14 +240,14 @@ pub trait Tape>: Default + Merge + Merge { const OWNS_TAPE: bool; fn add_backward_op(&mut self, operation: F) where - F: 'static + FnOnce(&mut Gradients) -> Result<(), D::Err>; + F: 'static + FnOnce(&mut Gradients) -> Result<(), Error>; } impl> Tape for OwnedTape { const OWNS_TAPE: bool = true; fn add_backward_op(&mut self, operation: F) where - F: 'static + FnOnce(&mut Gradients) -> Result<(), D::Err>, + F: 'static + FnOnce(&mut Gradients) -> Result<(), Error>, { self.operations.push((unique_id(), Box::new(operation))); } @@ -257,7 +257,7 @@ impl> Tape for NoneTape { const OWNS_TAPE: bool = false; fn add_backward_op(&mut self, _: F) where - F: 'static + FnOnce(&mut Gradients) -> Result<(), D::Err>, + F: 'static + FnOnce(&mut Gradients) -> Result<(), Error>, { } } @@ -329,7 +329,7 @@ impl> Tape for std::sync::Arc(&mut self, operation: F) where - F: 'static + FnOnce(&mut Gradients) -> Result<(), D::Err>, + F: 'static + FnOnce(&mut Gradients) -> Result<(), Error>, { let mut tape = self.lock().unwrap(); tape.add_backward_op(operation); diff --git a/dfdx-core/src/tensor/mod.rs b/dfdx-core/src/tensor/mod.rs index 1b3a5b06d..2a7f9db3a 100644 --- a/dfdx-core/src/tensor/mod.rs +++ b/dfdx-core/src/tensor/mod.rs @@ -147,6 +147,7 @@ mod masks; pub(crate) mod numpy; #[cfg(feature = "numpy")] pub use numpy::NumpyDtype; +mod error; #[cfg(feature = "safetensors")] pub mod safetensors; mod tensorlike; @@ -155,23 +156,24 @@ mod unique_id; pub(crate) mod storage_traits; mod tensor_impls; +pub use error::Error; pub(crate) use ghost::GhostTensor; pub(crate) use storage_traits::{OneFillStorage, ZeroFillStorage}; pub use tensorlike::Tensorlike; -pub use cpu::{Cpu, CpuError}; +pub use cpu::Cpu; #[cfg(not(feature = "cuda"))] pub type AutoDevice = Cpu; #[cfg(feature = "cuda")] pub(crate) use cuda::launch_cfg; #[cfg(feature = "cuda")] -pub use cuda::{Cuda, CudaError}; +pub use cuda::Cuda; #[cfg(feature = "cuda")] pub type AutoDevice = Cuda; pub use storage_traits::{AsArray, CopySlice, TensorFrom, TensorFromVec, TensorToArray}; -pub use storage_traits::{Cache, HasErr, RandomU64, Storage, Synchronize}; +pub use storage_traits::{Cache, RandomU64, Storage, Synchronize}; pub use storage_traits::{OnesTensor, SampleTensor, TriangleTensor, ZerosTensor}; pub use tensor_impls::{PutTape, SplitTape, Tensor, Trace, WithEmptyTape}; diff --git a/dfdx-core/src/tensor/storage_traits.rs b/dfdx-core/src/tensor/storage_traits.rs index 4f98c3e41..9578947d9 100644 --- a/dfdx-core/src/tensor/storage_traits.rs +++ b/dfdx-core/src/tensor/storage_traits.rs @@ -4,12 +4,7 @@ use std::vec::Vec; use crate::shapes::*; -use super::Tensor; - -/// Represents something that has an error associated type -pub trait HasErr: Sized { - type Err: std::fmt::Debug + std::fmt::Display; -} +use super::{Error, Tensor}; pub trait RandomU64 { /// Generates a random u64 number @@ -17,40 +12,40 @@ pub trait RandomU64 { } /// Something that can store nd arrays for a given [Shape] and [Dtype] -pub trait Storage: 'static + std::fmt::Debug + Default + Clone + HasErr { +pub trait Storage: 'static + std::fmt::Debug + Default + Clone { /// Generic Storage type type Vec: 'static + std::fmt::Debug + Clone + Send + Sync; /// Allocates a gradient for the given nd array - fn try_alloc_grad(&self, storage: &Self::Vec) -> Result { + fn try_alloc_grad(&self, storage: &Self::Vec) -> Result { self.try_alloc_len(self.len(storage)) } - fn try_alloc_len(&self, len: usize) -> Result; + fn try_alloc_len(&self, len: usize) -> 
Result; fn tensor_to_vec(&self, tensor: &Tensor) -> Vec; fn len(&self, v: &Self::Vec) -> usize; } -pub trait Synchronize: HasErr { +pub trait Synchronize { /// Blocks until all work on device to complete. Useful for benchmarking. fn synchronize(&self) { self.try_synchronize().unwrap() } /// Blocks until all work on device to complete. Useful for benchmarking. - fn try_synchronize(&self) -> Result<(), Self::Err>; + fn try_synchronize(&self) -> Result<(), Error>; } -pub trait Cache: HasErr { +pub trait Cache { /// Enables the cache of the device. fn enable_cache(&self) { self.try_enable_cache().unwrap() } /// Tries to enable the cache of the device. - fn try_enable_cache(&self) -> Result<(), Self::Err>; + fn try_enable_cache(&self) -> Result<(), Error>; /// Disables the cache of the device. This will also empty the cache /// if there are things in it. See [Cache::empty_cache] for @@ -61,7 +56,7 @@ pub trait Cache: HasErr { /// Tries to disable the cache of the device. See [Cache::disable_cache] for /// details of when this is useful. - fn try_disable_cache(&self) -> Result<(), Self::Err>; + fn try_disable_cache(&self) -> Result<(), Error>; /// Empties the cache of the device. /// @@ -78,18 +73,18 @@ pub trait Cache: HasErr { /// Tries to empty the cache of the device. See [Cache::empty_cache] for /// details of when this is useful. - fn try_empty_cache(&self) -> Result<(), Self::Err>; + fn try_empty_cache(&self) -> Result<(), Error>; } /// Internal trait - Represents something that can allocate its own gradient. -pub trait AllocGrad: HasErr { +pub trait AllocGrad { type Gradient: 'static; - fn try_alloc_grad(&self) -> Result; + fn try_alloc_grad(&self) -> Result; } impl, T> AllocGrad for Tensor { type Gradient = D::Vec; - fn try_alloc_grad(&self) -> Result { + fn try_alloc_grad(&self) -> Result { self.device.try_alloc_grad(self.data.as_ref()) } } @@ -143,7 +138,7 @@ pub trait ZerosTensor: Storage { } /// Fallible version of [ZerosTensor::zeros] - fn try_zeros(&self) -> Result, Self::Err> { + fn try_zeros(&self) -> Result, Error> { self.try_zeros_like::(&Default::default()) } @@ -168,11 +163,11 @@ pub trait ZerosTensor: Storage { } /// Fallible version of [ZerosTensor::zeros_like] - fn try_zeros_like(&self, src: &S) -> Result, Self::Err>; + fn try_zeros_like(&self, src: &S) -> Result, Error>; } pub trait ZeroFillStorage: Storage { - fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Self::Err>; + fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Error>; } /// Construct tensors filled with ones. @@ -188,7 +183,7 @@ pub trait OnesTensor: Storage { } /// Fallible version of [OnesTensor::ones] - fn try_ones(&self) -> Result, Self::Err> { + fn try_ones(&self) -> Result, Error> { self.try_ones_like::(&Default::default()) } @@ -213,11 +208,11 @@ pub trait OnesTensor: Storage { } /// Fallible version of [OnesTensor::ones_like] - fn try_ones_like(&self, src: &S) -> Result, Self::Err>; + fn try_ones_like(&self, src: &S) -> Result, Error>; } pub trait OneFillStorage: Storage { - fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Self::Err>; + fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Error>; } /// Build upper & lower triangle tensors. 
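One consequence of these storage-trait changes: allocation failures from any device now surface as the same `Error::OutOfMemory` value, so fallback logic can match on it directly instead of being generic over a device-specific error type. A minimal sketch under that assumption (the fallback policy and function name are illustrative, not part of this patch):

```rust
use dfdx::prelude::*;
use dfdx::tensor::Error;

// Hypothetical fallback: retry a smaller allocation when the device reports
// that it is out of memory. `try_zeros_like` now returns the unified `Error`
// on every device, so the same match works for Cpu and Cuda alike.
fn zeros_or_shrink<D: Device<f32>>(
    dev: &D,
    len: usize,
    fallback_len: usize,
) -> Result<Tensor<(usize,), f32, D>, Error> {
    match dev.try_zeros_like(&(len,)) {
        Err(Error::OutOfMemory) => dev.try_zeros_like(&(fallback_len,)),
        other => other,
    }
}
```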
@@ -266,7 +261,7 @@ pub trait TriangleTensor: Storage { &self, val: E, diagonal: impl Into>, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.try_upper_tri_like::(&Default::default(), val, diagonal) } @@ -286,7 +281,7 @@ pub trait TriangleTensor: Storage { src: &S, val: E, diagonal: impl Into>, - ) -> Result, Self::Err>; + ) -> Result, Error>; /// Build a tensor containing the lower triangle part of each lowest 2D matrix /// set to the given value, along the given diagonal. The other values will be `E::default()`. @@ -332,7 +327,7 @@ pub trait TriangleTensor: Storage { &self, val: E, diagonal: impl Into>, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.try_lower_tri_like::(&Default::default(), val, diagonal) } @@ -352,7 +347,7 @@ pub trait TriangleTensor: Storage { src: &S, val: E, diagonal: impl Into>, - ) -> Result, Self::Err>; + ) -> Result, Error>; } /// Constructs tensors filled with random values from a given distribution. @@ -396,7 +391,7 @@ pub trait SampleTensor: Storage { fn try_sample>( &self, distr: D, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.try_sample_like::(&Default::default(), distr) } @@ -413,14 +408,14 @@ pub trait SampleTensor: Storage { &self, src: &S, distr: D, - ) -> Result, Self::Err>; + ) -> Result, Error>; /// Fills tensor `Storage` with data from a given distribution fn try_fill_with_distr>( &self, storage: &mut Self::Vec, distr: D, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } pub trait TensorToArray: Storage { @@ -457,7 +452,7 @@ pub trait TensorFromVec: Storage { &self, src: Vec, shape: S, - ) -> Result, Self::Err>; + ) -> Result, Error>; } impl, T> Tensor { @@ -470,7 +465,7 @@ impl, T> Tensor { pub fn try_to_device>( &self, device: &Dst, - ) -> Result, Dst::Err> { + ) -> Result, Error> { let buf = self.as_vec(); device.try_tensor_from_vec(buf, self.shape) } @@ -492,23 +487,23 @@ pub trait TensorFrom: Storage { self.try_tensor(src).unwrap() } /// Fallible version of [TensorFrom::tensor] - fn try_tensor(&self, src: Src) -> Result, Self::Err>; + fn try_tensor(&self, src: Src) -> Result, Error>; } impl> TensorFrom for D { - fn try_tensor(&self, src: E) -> Result, Self::Err> { + fn try_tensor(&self, src: E) -> Result, Error> { self.try_tensor_from_vec(vec![src], ()) } } impl> TensorFrom<[E; M], Rank1, E> for D { - fn try_tensor(&self, src: [E; M]) -> Result, E, Self>, Self::Err> { + fn try_tensor(&self, src: [E; M]) -> Result, E, Self>, Error> { self.try_tensor(&src) } } impl> TensorFrom<&[E; M], Rank1, E> for D { - fn try_tensor(&self, src: &[E; M]) -> Result, E, Self>, Self::Err> { + fn try_tensor(&self, src: &[E; M]) -> Result, E, Self>, Error> { self.try_tensor_from_vec(src.to_vec(), (Const::,)) } } @@ -516,7 +511,7 @@ impl> TensorFrom<&[E; M], Rank1, impl> TensorFrom<[[E; N]; M], Rank2, E> for D { - fn try_tensor(&self, src: [[E; N]; M]) -> Result, E, Self>, Self::Err> { + fn try_tensor(&self, src: [[E; N]; M]) -> Result, E, Self>, Error> { let vec: Vec = src.iter().flat_map(|v| v.iter().copied()).collect(); self.try_tensor_from_vec(vec, (Const::, Const::)) @@ -526,10 +521,7 @@ impl> impl> TensorFrom<[[[E; O]; N]; M], Rank3, E> for D { - fn try_tensor( - &self, - src: [[[E; O]; N]; M], - ) -> Result, E, Self>, Self::Err> { + fn try_tensor(&self, src: [[[E; O]; N]; M]) -> Result, E, Self>, Error> { let vec: Vec = src .iter() .flat_map(|v| v.iter()) @@ -552,7 +544,7 @@ impl< fn try_tensor( &self, src: [[[[E; P]; O]; N]; M], - ) -> Result, E, Self>, Self::Err> { + ) -> Result, E, Self>, Error> { let vec: Vec = src 
.iter() .flat_map(|v| v.iter()) @@ -565,13 +557,13 @@ impl< } impl> TensorFrom, S, E> for D { - fn try_tensor(&self, src: Vec) -> Result, Self::Err> { + fn try_tensor(&self, src: Vec) -> Result, Error> { self.try_tensor_from_vec(src, S::default()) } } impl> TensorFrom<(Vec, S), S, E> for D { - fn try_tensor(&self, (src, shape): (Vec, S)) -> Result, Self::Err> { + fn try_tensor(&self, (src, shape): (Vec, S)) -> Result, Error> { self.try_tensor_from_vec(src, shape) } } diff --git a/dfdx-core/src/tensor/tensor_impls.rs b/dfdx-core/src/tensor/tensor_impls.rs index bf4c57f22..691c5931f 100644 --- a/dfdx-core/src/tensor/tensor_impls.rs +++ b/dfdx-core/src/tensor/tensor_impls.rs @@ -54,10 +54,6 @@ impl, T> HasDtype for Tensor { type Dtype = E; } -impl, T> HasErr for Tensor { - type Err = D::Err; -} - /// Something that can trace gradients pub trait Trace>: Clone { type Traced; @@ -198,7 +194,7 @@ impl, T> Tensor { self.try_fill_with_zeros().unwrap() } /// Fallible version of [Tensor::fill_with_zeros] - pub fn try_fill_with_zeros(&mut self) -> Result<(), D::Err> { + pub fn try_fill_with_zeros(&mut self) -> Result<(), Error> { self.device .try_fill_with_zeros(Arc::make_mut(&mut self.data)) } @@ -210,7 +206,7 @@ impl, T> Tensor { self.try_fill_with_ones().unwrap() } /// Fallible version of [Tensor::fill_with_ones] - pub fn try_fill_with_ones(&mut self) -> Result<(), D::Err> { + pub fn try_fill_with_ones(&mut self) -> Result<(), Error> { self.device .try_fill_with_ones(Arc::make_mut(&mut self.data)) } @@ -226,7 +222,7 @@ impl, T> Tensor { pub fn try_fill_with_distr>( &mut self, distr: Distr, - ) -> Result<(), D::Err> { + ) -> Result<(), Error> { self.device .try_fill_with_distr(Arc::make_mut(&mut self.data), distr) } diff --git a/dfdx-core/src/tensor/tensorlike.rs b/dfdx-core/src/tensor/tensorlike.rs index 18e4f074d..5b33f9bd5 100644 --- a/dfdx-core/src/tensor/tensorlike.rs +++ b/dfdx-core/src/tensor/tensorlike.rs @@ -1,8 +1,4 @@ -use crate::{ - prelude::{HasErr, HasShape}, - shapes::Shape, - tensor::Storage, -}; +use crate::{prelude::HasShape, shapes::Shape, tensor::Storage}; use super::{storage_traits::AllocGrad, GhostTensor, Tensor, UniqueId}; @@ -12,7 +8,7 @@ use super::{storage_traits::AllocGrad, GhostTensor, Tensor, UniqueId}; /// *If it looks like a tensor and barks like a tensor, then pet it like a tensor.* #[allow(clippy::len_without_is_empty)] pub trait Tensorlike>: - AllocGrad + HasErr + HasShape + AllocGrad + HasShape { fn id(&self) -> UniqueId; fn len(&self) -> usize; diff --git a/dfdx-core/src/tensor_ops/abs/mod.rs b/dfdx-core/src/tensor_ops/abs/mod.rs index 361079f34..f7ac117a4 100644 --- a/dfdx-core/src/tensor_ops/abs/mod.rs +++ b/dfdx-core/src/tensor_ops/abs/mod.rs @@ -34,7 +34,7 @@ impl, T: Tape> Tensor Result { + pub fn try_abs(self) -> Result { try_unary_op(AbsKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/accurate_gelu/mod.rs b/dfdx-core/src/tensor_ops/accurate_gelu/mod.rs index 56dda8a04..f409954ed 100644 --- a/dfdx-core/src/tensor_ops/accurate_gelu/mod.rs +++ b/dfdx-core/src/tensor_ops/accurate_gelu/mod.rs @@ -47,7 +47,7 @@ impl, T: Tape> self.try_accurate_gelu().unwrap() } /// See [accurate_gelu] - pub fn try_accurate_gelu(self) -> Result { + pub fn try_accurate_gelu(self) -> Result { try_unary_op(AccurateGeLUKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/adam/cpu_kernel.rs b/dfdx-core/src/tensor_ops/adam/cpu_kernel.rs index b89c0f4d2..998a578e1 100644 --- a/dfdx-core/src/tensor_ops/adam/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/adam/cpu_kernel.rs @@ 
-1,7 +1,7 @@ use super::{AdamConfig, AdamKernel, WeightDecay}; use crate::{ dtypes::{Dtype, NotMixedPrecision}, - tensor::Cpu, + tensor::{Cpu, Error}, }; #[cfg(feature = "f16")] @@ -14,7 +14,7 @@ impl AdamKernel> for Cpu { moment1: &mut Self::Vec, moment2: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let betas = cfg.betas.map(|x| x as f32); let eps = cfg.eps as f32; let lr = cfg.lr as f32; @@ -60,7 +60,7 @@ impl AdamKernel for Cpu { moment1: &mut Self::Vec, moment2: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let betas = cfg.betas.map(E::from_f64).map(Option::unwrap); let eps = E::from_f64(cfg.eps).unwrap(); let lr = E::from_f64(cfg.lr).unwrap(); diff --git a/dfdx-core/src/tensor_ops/adam/cuda_kernel.rs b/dfdx-core/src/tensor_ops/adam/cuda_kernel.rs index 3d1bfb9c7..24617b7c8 100644 --- a/dfdx-core/src/tensor_ops/adam/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/adam/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ dtypes::*, - tensor::{launch_cfg, Cuda}, + tensor::{launch_cfg, Cuda, Error}, tensor_ops::optim::*, }; @@ -72,7 +72,7 @@ where moment1: &mut Self::Vec, moment2: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FWD) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, &[Self::FWD])?; } diff --git a/dfdx-core/src/tensor_ops/adam/mod.rs b/dfdx-core/src/tensor_ops/adam/mod.rs index fd81e64cc..b9a307232 100644 --- a/dfdx-core/src/tensor_ops/adam/mod.rs +++ b/dfdx-core/src/tensor_ops/adam/mod.rs @@ -5,7 +5,7 @@ mod cuda_kernel; use crate::{ shapes::{Dtype, Shape}, - tensor::{Storage, Tensor}, + tensor::{Error, Storage, Tensor}, }; use super::WeightDecay; @@ -57,7 +57,7 @@ pub trait AdamKernel: Storage { moment1: &mut Self::Vec, moment2: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl AdamConfig { @@ -68,7 +68,7 @@ impl AdamConfig { moment1: &mut D::Vec, moment2: &mut D::Vec, grad: &D::Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { param.device.adam_kernel( t, self, diff --git a/dfdx-core/src/tensor_ops/add/mod.rs b/dfdx-core/src/tensor_ops/add/mod.rs index a70270042..29010e6f1 100644 --- a/dfdx-core/src/tensor_ops/add/mod.rs +++ b/dfdx-core/src/tensor_ops/add/mod.rs @@ -6,7 +6,7 @@ mod cuda_kernel; use super::ops::*; use crate::{ shapes::*, - tensor::{HasErr, Merge, Storage, Tape, Tensor}, + tensor::{Error, Merge, Storage, Tape, Tensor}, }; #[repr(C)] @@ -49,9 +49,9 @@ where } /// Fallible version of [std::ops::Add]. 
See [add] -pub trait TryAdd: HasErr { +pub trait TryAdd { type Output; - fn try_add(self, rhs: Rhs) -> Result; + fn try_add(self, rhs: Rhs) -> Result; } impl, R> TryAdd> @@ -62,7 +62,7 @@ where { type Output = Self; /// See [add] - fn try_add(self, rhs: Tensor) -> Result { + fn try_add(self, rhs: Tensor) -> Result { try_binary_op(BinaryAddKernelOp, self, rhs) } } @@ -73,7 +73,7 @@ where { type Output = Self; /// See [add] - fn try_add(self, rhs: Rhs) -> Result { + fn try_add(self, rhs: Rhs) -> Result { let rhs: f64 = rhs.into(); let scalar = E::from_f64(rhs).unwrap(); try_unary_op(ScalarAddKernelOp { scalar }, self) diff --git a/dfdx-core/src/tensor_ops/attention_reshape/cpu_kernel.rs b/dfdx-core/src/tensor_ops/attention_reshape/cpu_kernel.rs index bebe6883a..cda993c0d 100644 --- a/dfdx-core/src/tensor_ops/attention_reshape/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/attention_reshape/cpu_kernel.rs @@ -13,7 +13,7 @@ impl super::AttentionReshapeKernel for Cpu { Tensor<(Const, Const, usize), E, Self>, Tensor<(Const, usize, Const), E, Self>, ), - Self::Err, + Error, > { let sequence_length = qkv.shape().0; let past_sequence_length = past_key.shape().2; diff --git a/dfdx-core/src/tensor_ops/attention_reshape/cuda_kernel.rs b/dfdx-core/src/tensor_ops/attention_reshape/cuda_kernel.rs index 9f34490ed..1f121e637 100644 --- a/dfdx-core/src/tensor_ops/attention_reshape/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/attention_reshape/cuda_kernel.rs @@ -54,7 +54,7 @@ where Tensor<(Const, Const, usize), E, Self>, Tensor<(Const, usize, Const), E, Self>, ), - Self::Err, + Error, > { if !self.dev.has_func(Self::FN, Self::FN) { self.dev.load_ptx(PTX.into(), Self::FN, &[Self::FN])?; diff --git a/dfdx-core/src/tensor_ops/attention_reshape/mod.rs b/dfdx-core/src/tensor_ops/attention_reshape/mod.rs index b5eeb1b82..b61cc3e4e 100644 --- a/dfdx-core/src/tensor_ops/attention_reshape/mod.rs +++ b/dfdx-core/src/tensor_ops/attention_reshape/mod.rs @@ -57,7 +57,7 @@ pub trait TryAttentionReshape: Storage { qkv: &Tensor<(usize, Const), E, Self>, past_key: &Tensor<(Const, Const, usize), E, Self>, past_value: &Tensor<(Const, usize, Const), E, Self>, - ) -> Result, Self::Err>; + ) -> Result, Error>; } pub trait AttentionReshapeKernel: Storage { @@ -66,7 +66,7 @@ pub trait AttentionReshapeKernel: Storage { qkv: &Tensor<(usize, Const), E, Self>, past_key: &Tensor<(Const, Const, usize), E, Self>, past_value: &Tensor<(Const, usize, Const), E, Self>, - ) -> Result, Self::Err>; + ) -> Result, Error>; } impl> TryAttentionReshape for D { @@ -80,7 +80,7 @@ impl> TryAttentionReshape for D { qkv: &Tensor<(usize, Const), E, Self>, past_key: &Tensor<(Const, Const, usize), E, Self>, past_value: &Tensor<(Const, usize, Const), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let device = qkv.device.clone(); device.forward(qkv, past_key, past_value) } diff --git a/dfdx-core/src/tensor_ops/axpy/cpu_kernel.rs b/dfdx-core/src/tensor_ops/axpy/cpu_kernel.rs index 760d17aa7..c94083528 100644 --- a/dfdx-core/src/tensor_ops/axpy/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/axpy/cpu_kernel.rs @@ -1,13 +1,10 @@ -use crate::{shapes::Dtype, tensor::Cpu}; +use crate::{ + shapes::Dtype, + tensor::{Cpu, Error}, +}; impl super::AxpyKernel for Cpu { - fn forward( - &self, - a: &mut Self::Vec, - alpha: E, - b: &Self::Vec, - beta: E, - ) -> Result<(), Self::Err> { + fn forward(&self, a: &mut Self::Vec, alpha: E, b: &Self::Vec, beta: E) -> Result<(), Error> { for (a_i, b_i) in a.iter_mut().zip(b.iter()) { *a_i = *a_i * alpha + *b_i * beta; } 
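For callers, the net effect of these op-level signature changes is that every fallible tensor op now returns `Result<_, dfdx::tensor::Error>` instead of the device's `Self::Err`, so ops on any device chain with `?` and no `map_err` glue. A small caller-side sketch assuming the public `try_matmul`/`try_add` ops; the helper name and shapes are illustrative:

```rust
use dfdx::prelude::*;
use dfdx::tensor::Error;

// Hypothetical helper chaining two fallible ops. Before this patch each call
// returned the device's associated `Err` type (`CpuError` or `CudaError`);
// now both return `dfdx::tensor::Error`, so `?` threads one type through.
fn linear_plus_skip<D: Device<f32>>(
    x: Tensor<Rank2<4, 8>, f32, D>,
    w: Tensor<Rank2<8, 8>, f32, D>,
) -> Result<Tensor<Rank2<4, 8>, f32, D>, Error> {
    let y = x.clone().try_matmul(w)?; // matmul failure -> Error
    y.try_add(x) // add failure -> Error
}
```

Custom kernels get the same treatment: an `AxpyKernel` implementation for a new backend now returns `Result<(), Error>` rather than an associated error type.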
diff --git a/dfdx-core/src/tensor_ops/axpy/cuda_kernel.rs b/dfdx-core/src/tensor_ops/axpy/cuda_kernel.rs index 5a2abf0c9..9d5b7db4e 100644 --- a/dfdx-core/src/tensor_ops/axpy/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/axpy/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ dtypes::*, - tensor::{launch_cfg, Cuda}, + tensor::{launch_cfg, Cuda, Error}, }; use cudarc::driver::{DeviceSlice, LaunchAsync}; @@ -29,13 +29,7 @@ impl super::AxpyKernel for Cuda where Self: HasCudaKernel, { - fn forward( - &self, - a: &mut Self::Vec, - alpha: E, - b: &Self::Vec, - beta: E, - ) -> Result<(), Self::Err> { + fn forward(&self, a: &mut Self::Vec, alpha: E, b: &Self::Vec, beta: E) -> Result<(), Error> { if !self.dev.has_func(Self::FN, Self::FN) { self.dev.load_ptx(PTX_SRC.into(), Self::FN, &[Self::FN])?; } diff --git a/dfdx-core/src/tensor_ops/axpy/mod.rs b/dfdx-core/src/tensor_ops/axpy/mod.rs index ca108c57b..45e6f4653 100644 --- a/dfdx-core/src/tensor_ops/axpy/mod.rs +++ b/dfdx-core/src/tensor_ops/axpy/mod.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Dtype, Shape}, - tensor::{Storage, Tensor}, + tensor::{Error, Storage, Tensor}, }; mod cpu_kernel; @@ -36,7 +36,7 @@ impl> Tensor { alpha: impl Into, b: &Tensor, beta: impl Into, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { assert_eq!(self.shape, b.shape); assert_eq!(self.strides, b.strides, "Strides must be equal for axpy"); self.device.clone().forward( @@ -49,8 +49,7 @@ impl> Tensor { } pub trait AxpyKernel: Storage { - fn forward(&self, a: &mut Self::Vec, alpha: E, b: &Self::Vec, beta: E) - -> Result<(), Self::Err>; + fn forward(&self, a: &mut Self::Vec, alpha: E, b: &Self::Vec, beta: E) -> Result<(), Error>; } #[cfg(test)] diff --git a/dfdx-core/src/tensor_ops/bce/mod.rs b/dfdx-core/src/tensor_ops/bce/mod.rs index 3053f045f..0df0bfeb6 100644 --- a/dfdx-core/src/tensor_ops/bce/mod.rs +++ b/dfdx-core/src/tensor_ops/bce/mod.rs @@ -45,7 +45,10 @@ impl, LTape: Tape> self.try_bce_with_logits(prob).unwrap() } /// See [bce_with_logits] - pub fn try_bce_with_logits(self, prob: Tensor) -> Result + pub fn try_bce_with_logits( + self, + prob: Tensor, + ) -> Result where RTape: Tape, LTape: Merge, diff --git a/dfdx-core/src/tensor_ops/boolean/cpu_kernels.rs b/dfdx-core/src/tensor_ops/boolean/cpu_kernels.rs index ebdab03d7..58a94bb1e 100644 --- a/dfdx-core/src/tensor_ops/boolean/cpu_kernels.rs +++ b/dfdx-core/src/tensor_ops/boolean/cpu_kernels.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Shape, Unit}, - tensor::{cpu::LendingIterator, Cpu, HasErr, Tensor, ZerosTensor}, + tensor::{cpu::LendingIterator, Cpu, Error, Tensor, ZerosTensor}, }; use super::BooleanKernel; @@ -11,7 +11,7 @@ impl Cpu { op: O, lhs: &Tensor, rhs: &Tensor, - ) -> Result, ::Err> { + ) -> Result, Error> { let mut out = self.try_zeros_like(&lhs.shape)?; let mut lhs_iter = lhs.iter(); let mut rhs_iter = rhs.iter(); @@ -24,10 +24,7 @@ impl Cpu { } impl BooleanKernel for Cpu { - fn not( - &self, - inp: &Tensor, - ) -> Result, Self::Err> { + fn not(&self, inp: &Tensor) -> Result, Error> { let mut out = inp.clone(); for x in out.buf_iter_mut() { *x = !*x; @@ -39,7 +36,7 @@ impl BooleanKernel for Cpu { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.eval_binary(|l, r| l && r, lhs, rhs) } @@ -47,7 +44,7 @@ impl BooleanKernel for Cpu { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.eval_binary(|l, r| l || r, lhs, rhs) } @@ -55,7 +52,7 @@ impl BooleanKernel for Cpu { &self, lhs: &Tensor, rhs: &Tensor, - ) -> 
Result, Self::Err> { + ) -> Result, Error> { self.eval_binary(|l, r| l ^ r, lhs, rhs) } } diff --git a/dfdx-core/src/tensor_ops/boolean/cuda_kernels.rs b/dfdx-core/src/tensor_ops/boolean/cuda_kernels.rs index e3ecf41a9..f0031ad4c 100644 --- a/dfdx-core/src/tensor_ops/boolean/cuda_kernels.rs +++ b/dfdx-core/src/tensor_ops/boolean/cuda_kernels.rs @@ -1,7 +1,7 @@ use super::BooleanKernel; use crate::{ shapes::Shape, - tensor::{launch_cfg, Cuda, CudaError, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::driver::*; @@ -15,7 +15,7 @@ impl Cuda { fn_name: &str, lhs: &Tensor, rhs: &Tensor, - ) -> Result, CudaError> { + ) -> Result, Error> { if !self.dev.has_func(MODULE_NAME, fn_name) { self.dev .load_ptx(PTX_SRC.into(), MODULE_NAME, &ALL_FN_NAMES)?; @@ -49,10 +49,7 @@ impl Cuda { } impl BooleanKernel for Cuda { - fn not( - &self, - inp: &Tensor, - ) -> Result, Self::Err> { + fn not(&self, inp: &Tensor) -> Result, Error> { if !self.dev.has_func(MODULE_NAME, "boolean_not") { self.dev .load_ptx(PTX_SRC.into(), MODULE_NAME, &ALL_FN_NAMES)?; @@ -77,7 +74,7 @@ impl BooleanKernel for Cuda { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.call_binary("boolean_and", lhs, rhs) } @@ -85,7 +82,7 @@ impl BooleanKernel for Cuda { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.call_binary("boolean_or", lhs, rhs) } @@ -93,7 +90,7 @@ impl BooleanKernel for Cuda { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.call_binary("boolean_xor", lhs, rhs) } } diff --git a/dfdx-core/src/tensor_ops/boolean/mod.rs b/dfdx-core/src/tensor_ops/boolean/mod.rs index 51c9c5def..bb8cedf42 100644 --- a/dfdx-core/src/tensor_ops/boolean/mod.rs +++ b/dfdx-core/src/tensor_ops/boolean/mod.rs @@ -6,7 +6,7 @@ mod cuda_kernels; use crate::{ prelude::{OnesTensor, Tensor, ZerosTensor}, shapes::*, - tensor::Storage, + tensor::{Error, Storage}, }; use std::ops::{BitAnd, BitOr, BitXor, Not}; @@ -14,34 +14,31 @@ use std::ops::{BitAnd, BitOr, BitXor, Not}; use super::Device; pub trait BooleanKernel: Storage + OnesTensor + ZerosTensor { - fn not( - &self, - inp: &Tensor, - ) -> Result, Self::Err>; + fn not(&self, inp: &Tensor) -> Result, Error>; fn and( &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn or( &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn xor( &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; } fn scalar_and( lhs: &Tensor, rhs: bool, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { if rhs { Ok(lhs.clone()) } else { @@ -52,7 +49,7 @@ fn scalar_and( fn scalar_or( lhs: &Tensor, rhs: bool, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { if rhs { lhs.device.try_ones_like(lhs) } else { @@ -63,7 +60,7 @@ fn scalar_or( fn scalar_xor( lhs: &Tensor, rhs: bool, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { if rhs { Ok(lhs.device.not(lhs)?) 
} else { diff --git a/dfdx-core/src/tensor_ops/broadcast_to.rs b/dfdx-core/src/tensor_ops/broadcast_to.rs index 90dfa93a9..789639c73 100644 --- a/dfdx-core/src/tensor_ops/broadcast_to.rs +++ b/dfdx-core/src/tensor_ops/broadcast_to.rs @@ -23,7 +23,7 @@ use crate::{shapes::*, tensor::*}; /// // It's ambiguous what axes to broadcast here - explicitly say axes 0 and 2 /// let _: Tensor, _, _> = a.clone().broadcast::<_, Axes2<0, 2>>(); /// ``` -pub trait BroadcastTo: HasErr + HasShape { +pub trait BroadcastTo: Sized + HasShape { /// Broadcast into shape `Dst` along axes `Ax`. fn broadcast(self) -> Self::WithShape where @@ -33,7 +33,7 @@ pub trait BroadcastTo: HasErr + HasShape { .unwrap() } /// Fallible version of [BroadcastTo::broadcast] - fn try_broadcast(self) -> Result, Self::Err> + fn try_broadcast(self) -> Result, Error> where Self::Shape: BroadcastShapeTo, { @@ -50,7 +50,7 @@ pub trait BroadcastTo: HasErr + HasShape { fn try_broadcast_like( self, dst: &Dst, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: BroadcastShapeTo; } @@ -59,7 +59,7 @@ impl, T: Tape> BroadcastTo for Tensor( self, dst: &Dst, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: BroadcastShapeTo, { diff --git a/dfdx-core/src/tensor_ops/choose/cpu_kernel.rs b/dfdx-core/src/tensor_ops/choose/cpu_kernel.rs index 2d7b15f27..068579311 100644 --- a/dfdx-core/src/tensor_ops/choose/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/choose/cpu_kernel.rs @@ -2,7 +2,7 @@ use crate::{ shapes::{Dtype, Shape}, tensor::{ cpu::{LendingIterator, NdIndex}, - Cpu, Storage, Tensor, ZerosTensor, + Cpu, Error, Storage, Tensor, ZerosTensor, }, }; @@ -12,7 +12,7 @@ impl super::ChooseKernel for Cpu { cond: &Tensor, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut out = self.try_zeros_like(&lhs.shape)?; let mut cond_iter = cond.iter(); let mut lhs_iter = lhs.iter(); @@ -36,7 +36,7 @@ impl super::ChooseKernel for Cpu { rhs: &Tensor, grad_rhs: &mut >::Vec, grad_out: &>::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut lhs_idx = NdIndex::new(lhs.shape, lhs.strides); let mut rhs_idx = NdIndex::new(rhs.shape, rhs.strides); let mut out_idx = NdIndex::new(lhs.shape, lhs.shape.strides()); diff --git a/dfdx-core/src/tensor_ops/choose/cuda_kernel.rs b/dfdx-core/src/tensor_ops/choose/cuda_kernel.rs index ed29149bb..e99898c61 100644 --- a/dfdx-core/src/tensor_ops/choose/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/choose/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Storage, Tensor}, + tensor::{launch_cfg, Cuda, Error, Storage, Tensor}, }; use cudarc::driver::{CudaSlice, LaunchAsync}; @@ -43,7 +43,7 @@ where cond: &Tensor, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(Self::MOD, Self::FNS[0]) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, Self::FNS)?; } @@ -85,7 +85,7 @@ where rhs: &Tensor, grad_rhs: &mut >::Vec, grad_out: &>::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let bwd_fn = self.dev.get_func(Self::MOD, Self::FNS[1]).unwrap(); let numel = cond.shape.num_elements(); diff --git a/dfdx-core/src/tensor_ops/choose/mod.rs b/dfdx-core/src/tensor_ops/choose/mod.rs index a1861e448..a82a5c79c 100644 --- a/dfdx-core/src/tensor_ops/choose/mod.rs +++ b/dfdx-core/src/tensor_ops/choose/mod.rs @@ -5,7 +5,7 @@ mod cuda_kernel; use crate::{ shapes::{Dtype, HasShape, Shape}, - tensor::{HasErr, Merge, PutTape, SplitTape, Storage, Tape, Tensor}, + 
tensor::{Error, Merge, PutTape, SplitTape, Storage, Tape, Tensor}, }; pub trait ChooseKernel: Storage + Storage { @@ -14,7 +14,7 @@ pub trait ChooseKernel: Storage + Storage { cond: &Tensor, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, @@ -24,7 +24,7 @@ pub trait ChooseKernel: Storage + Storage { rhs: &Tensor, grad_rhs: &mut >::Vec, grad_out: &>::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Choose values from two tensors using a boolean mask. Equivalent to `torch.where` from pytorch. @@ -38,7 +38,7 @@ pub trait ChooseKernel: Storage + Storage { /// let c = cond.choose(a, b); /// assert_eq!(c.array(), [1.0, -2.0, 3.0]); /// ``` -pub trait ChooseFrom: HasErr { +pub trait ChooseFrom: Sized { type Output; /// Construct a new tensor, where the output tensor contains the elements of lhs where self is @@ -48,7 +48,7 @@ pub trait ChooseFrom: HasErr { } /// Fallible version of choose - fn try_choose(self, lhs: Lhs, rhs: Rhs) -> Result; + fn try_choose(self, lhs: Lhs, rhs: Rhs) -> Result; } impl< @@ -65,7 +65,7 @@ impl< self, lhs: Tensor, rhs: Tensor, - ) -> Result { + ) -> Result { assert_eq!(self.shape(), lhs.shape()); assert_eq!(lhs.shape(), rhs.shape()); diff --git a/dfdx-core/src/tensor_ops/clamp/mod.rs b/dfdx-core/src/tensor_ops/clamp/mod.rs index b23f6a035..1054d0ffa 100644 --- a/dfdx-core/src/tensor_ops/clamp/mod.rs +++ b/dfdx-core/src/tensor_ops/clamp/mod.rs @@ -37,7 +37,11 @@ impl, E>, T: Tape> Ten self.try_clamp(min, max).unwrap() } /// See [clamp] - pub fn try_clamp(self, min: impl Into, max: impl Into) -> Result { + pub fn try_clamp( + self, + min: impl Into, + max: impl Into, + ) -> Result { try_unary_op( ClampKernelOp { min: E::from_f64(min.into()).unwrap(), diff --git a/dfdx-core/src/tensor_ops/cmp/cpu_kernels.rs b/dfdx-core/src/tensor_ops/cmp/cpu_kernels.rs index c42ddb543..8a22615cc 100644 --- a/dfdx-core/src/tensor_ops/cmp/cpu_kernels.rs +++ b/dfdx-core/src/tensor_ops/cmp/cpu_kernels.rs @@ -2,7 +2,7 @@ use crate::{ shapes::{Shape, Unit}, tensor::{ cpu::{Cpu, LendingIterator}, - Tensor, ZerosTensor, + Error, Tensor, ZerosTensor, }, }; @@ -20,7 +20,7 @@ impl, E: Unit> CmpKernel for Cpu { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut out: Tensor = self.try_zeros_like(&lhs.shape)?; let mut lhs_iter = lhs.iter(); let mut rhs_iter = rhs.iter(); @@ -37,7 +37,7 @@ impl, E: Unit> ScalarCmpKernel for Cpu { &self, lhs: &Tensor, scalar: E, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut out: Tensor = self.try_zeros_like(&lhs.shape)?; let mut lhs_iter = lhs.iter(); let mut out_iter = out.iter_mut(); diff --git a/dfdx-core/src/tensor_ops/cmp/cuda_kernels.rs b/dfdx-core/src/tensor_ops/cmp/cuda_kernels.rs index 31d0d1196..b4a43cb8e 100644 --- a/dfdx-core/src/tensor_ops/cmp/cuda_kernels.rs +++ b/dfdx-core/src/tensor_ops/cmp/cuda_kernels.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::Shape, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::driver::{CudaSlice, LaunchAsync}; @@ -39,7 +39,7 @@ impl> CmpKernel for Cuda { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(Op::MODULE_NAME, Op::FWD_FN_NAME) { self.dev .load_ptx(Op::PTX_SRC.into(), Op::MODULE_NAME, &[Op::FWD_FN_NAME])?; @@ -80,7 +80,7 @@ impl> ScalarCmpKernel for Cuda { &self, lhs: &Tensor, scalar: E, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(Op::MODULE_NAME, 
Op::FWD_FN_NAME) { self.dev .load_ptx(Op::PTX_SRC.into(), Op::MODULE_NAME, &[Op::FWD_FN_NAME])?; diff --git a/dfdx-core/src/tensor_ops/cmp/mod.rs b/dfdx-core/src/tensor_ops/cmp/mod.rs index cd1c3cc08..cf5feed3f 100644 --- a/dfdx-core/src/tensor_ops/cmp/mod.rs +++ b/dfdx-core/src/tensor_ops/cmp/mod.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{HasShape, Shape}, - tensor::{HasErr, NoneTape, Storage, Tape, Tensor}, + tensor::{Error, NoneTape, Storage, Tape, Tensor}, }; mod cpu_kernels; @@ -12,13 +12,13 @@ pub trait CmpKernel: Storage + Storage { &self, lhs: &Tensor, rhs: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; } fn try_cmp_op, T: Tape>( lhs: &Tensor, rhs: &Tensor, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { assert_eq!(lhs.shape(), rhs.shape()); lhs.device.forward(lhs, rhs) } @@ -28,13 +28,13 @@ pub trait ScalarCmpKernel: Storage + Storage { &self, tensor: &Tensor, scalar: E, - ) -> Result, Self::Err>; + ) -> Result, Error>; } fn try_scalar_cmp_op, T: Tape>( tensor: &Tensor, scalar: E, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { tensor.device.forward(tensor, scalar) } @@ -198,14 +198,14 @@ pub fn le, T: Tape>( // Macro to reduce boilerplate of implementing comparison methods on Tensor. macro_rules! impl_cmp_kernel_op { ($TraitName:tt, $FnName:tt, $TryFnName:tt, $KernelOp:tt, $doc:expr, $ScalarFnName:tt, $TryScalarFnName:tt) => { - pub trait $TraitName: HasErr { + pub trait $TraitName { type Output; #[doc = $doc] fn $FnName(&self, rhs: Rhs) -> Self::Output { self.$TryFnName(rhs).unwrap() } #[doc = $doc] - fn $TryFnName(&self, rhs: Rhs) -> Result; + fn $TryFnName(&self, rhs: Rhs) -> Result; } impl, T: Tape> $TraitName<&Self> @@ -213,7 +213,7 @@ macro_rules! impl_cmp_kernel_op { { type Output = Tensor; #[doc = $doc] - fn $TryFnName(&self, other: &Self) -> Result { + fn $TryFnName(&self, other: &Self) -> Result { try_cmp_op(self, other) } } @@ -223,7 +223,7 @@ macro_rules! impl_cmp_kernel_op { { type Output = Tensor; #[doc = $doc] - fn $TryFnName(&self, other: E) -> Result { + fn $TryFnName(&self, other: E) -> Result { try_scalar_cmp_op(self, other) } } @@ -234,7 +234,7 @@ macro_rules! impl_cmp_kernel_op { { type Output = Tensor; #[doc = $doc] - fn $TryFnName(&self, other: f32) -> Result { + fn $TryFnName(&self, other: f32) -> Result { try_scalar_cmp_op(self, half::f16::from_f32(other)) } } @@ -248,7 +248,7 @@ macro_rules! impl_cmp_kernel_op { { type Output = Tensor; #[doc = $doc] - fn $TryFnName(&self, other: f32) -> Result { + fn $TryFnName(&self, other: f32) -> Result { try_scalar_cmp_op(self, crate::dtypes::AMP(half::f16::from_f32(other))) } } @@ -265,7 +265,7 @@ macro_rules! 
impl_cmp_kernel_op { pub fn $TryScalarFnName( &self, other: E, - ) -> Result, D::Err> { + ) -> Result, crate::tensor::Error> { try_scalar_cmp_op(self, other) } } diff --git a/dfdx-core/src/tensor_ops/concat/cpu_kernel.rs b/dfdx-core/src/tensor_ops/concat/cpu_kernel.rs index 19b1dd5f7..63d864bde 100644 --- a/dfdx-core/src/tensor_ops/concat/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/concat/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Dtype, Shape}, - tensor::{unique_id, Cpu, Tensor}, + tensor::{unique_id, Cpu, Error, Tensor}, }; impl super::ConcatKernel for Cpu { @@ -8,7 +8,7 @@ impl super::ConcatKernel for Cpu { &self, a: &Tensor, b: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where A: super::ConcatShape, { @@ -45,7 +45,7 @@ impl super::ConcatKernel for Cpu { grad_a: &mut Self::Vec, grad_b: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut offset = 0; for ga in grad_a.iter_mut() { *ga += grad_out[offset]; diff --git a/dfdx-core/src/tensor_ops/concat/cuda_kernel.rs b/dfdx-core/src/tensor_ops/concat/cuda_kernel.rs index 147c6273b..8da4df742 100644 --- a/dfdx-core/src/tensor_ops/concat/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/concat/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::{ driver::{DeviceSlice, LaunchAsync}, @@ -13,7 +13,7 @@ impl super::ConcatKernel for Cuda { &self, a: &Tensor, b: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where A: super::ConcatShape, { @@ -33,7 +33,7 @@ impl super::ConcatKernel for Cuda { grad_a: &mut Self::Vec, grad_b: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let module_name = std::format!("concat_bwd_{}", E::NAME); if !self.dev.has_func(&module_name, "concat_bwd") { let src = BWD_KERNEL.replace("$Ty", E::NAME); diff --git a/dfdx-core/src/tensor_ops/concat/mod.rs b/dfdx-core/src/tensor_ops/concat/mod.rs index 0ad3a9669..da6cdfee7 100644 --- a/dfdx-core/src/tensor_ops/concat/mod.rs +++ b/dfdx-core/src/tensor_ops/concat/mod.rs @@ -27,7 +27,7 @@ mod cuda_kernel; /// assert_eq!(c.shape().0, 6); /// ``` #[deprecated = "Use TryConcatAlong instead"] -pub trait TryConcat: HasErr { +pub trait TryConcat: Sized { type Output; /// Concatenate two tensors along the first dimension. @@ -41,7 +41,7 @@ pub trait TryConcat: HasErr { /// Fallible version of [TryConcat::concat]. 
#[deprecated = "Use TryConcatAlong::try_concat_along instead"] #[allow(deprecated)] - fn try_concat(self, rhs: Rhs) -> Result; + fn try_concat(self, rhs: Rhs) -> Result; } #[allow(deprecated)] @@ -54,7 +54,7 @@ where { type Output = Tensor; #[allow(deprecated)] - fn try_concat(self, rhs: Tensor) -> Result { + fn try_concat(self, rhs: Tensor) -> Result { assert_eq!( self.strides, self.shape.strides(), @@ -89,7 +89,7 @@ pub trait ConcatKernel: Storage { &self, a: &Tensor, b: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where A: ConcatShape; fn backward( @@ -97,7 +97,7 @@ pub trait ConcatKernel: Storage { grad_a: &mut Self::Vec, grad_b: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } pub trait ConcatShape: Shape { diff --git a/dfdx-core/src/tensor_ops/concat_along/cpu_kernel.rs b/dfdx-core/src/tensor_ops/concat_along/cpu_kernel.rs index 7404b9aff..e6ab2eb20 100644 --- a/dfdx-core/src/tensor_ops/concat_along/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/concat_along/cpu_kernel.rs @@ -10,7 +10,7 @@ impl super::ConcatAlongKernel for Cpu { a: &Tensor, b: &Tensor, c: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut a_idx = NdIndex::new(a.shape, a.strides); let mut b_idx = NdIndex::new(b.shape, b.strides); @@ -44,7 +44,7 @@ impl super::ConcatAlongKernel for Cpu { b: &GhostTensor, grad_b: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut a_idx = NdIndex::new(a.shape, a.strides); let mut b_idx = NdIndex::new(b.shape, b.strides); diff --git a/dfdx-core/src/tensor_ops/concat_along/cuda_kernel.rs b/dfdx-core/src/tensor_ops/concat_along/cuda_kernel.rs index 7c0d1247c..c779cadf2 100644 --- a/dfdx-core/src/tensor_ops/concat_along/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/concat_along/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::*, - tensor::{launch_cfg, Cuda, GhostTensor, Tensor}, + tensor::{launch_cfg, Cuda, Error, GhostTensor, Tensor}, }; use cudarc::{ driver::{DeviceSlice, LaunchAsync}, @@ -15,7 +15,7 @@ impl super::ConcatAlongKernel for Cuda { a: &Tensor, b: &Tensor, c: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let module_name = std::format!("concat_{}", E::NAME); if !self.dev.has_func(&module_name, "fwd") { let src = KERNEL.replace("$Ty", E::NAME); @@ -67,7 +67,7 @@ impl super::ConcatAlongKernel for Cuda { b: &GhostTensor, grad_b: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let module_name = std::format!("concat_{}", E::NAME); let bwd = self.dev.get_func(&module_name, "bwd").unwrap(); let cfg = launch_cfg::<128>(grad_out.data.len() as u32); diff --git a/dfdx-core/src/tensor_ops/concat_along/mod.rs b/dfdx-core/src/tensor_ops/concat_along/mod.rs index 28282419f..0c796d6e6 100644 --- a/dfdx-core/src/tensor_ops/concat_along/mod.rs +++ b/dfdx-core/src/tensor_ops/concat_along/mod.rs @@ -48,13 +48,13 @@ mod cuda_kernel; /// ``` pub trait TryConcatAlong: Sized { type Output; - type Error: std::fmt::Debug; + /// Concatenates self along the given axis. fn concat_along(self, ax: Ax) -> Self::Output { self.try_concat_along(ax).unwrap() } /// Fallibly concatenates self along the given axis. 
- fn try_concat_along(self, ax: Ax) -> Result; + fn try_concat_along(self, ax: Ax) -> Result; } pub trait ConcatAlongKernel: Storage { @@ -64,7 +64,7 @@ pub trait ConcatAlongKernel: Storage { a: &Tensor, b: &Tensor, c: &mut Tensor, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; fn backward( &self, @@ -74,7 +74,7 @@ pub trait ConcatAlongKernel: Storage { b: &GhostTensor, grad_b: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl, R: Tape> TryConcatAlong @@ -89,8 +89,8 @@ where T: Merge, { type Output = Tensor<<(A, B) as TryConcatAlong>::Output, E, D, T>; - type Error = D::Err; - fn try_concat_along(self, ax: Ax) -> Result { + + fn try_concat_along(self, ax: Ax) -> Result { let (lhs, rhs) = self; let out_shape = (*lhs.shape(), *rhs.shape()).concat_along(ax); @@ -135,8 +135,8 @@ macro_rules! impl_concat { >::Output, $($Tail, )* ); - type Error = std::convert::Infallible; - fn try_concat_along(self, _: Axis<$Ax>) -> Result { + + fn try_concat_along(self, _: Axis<$Ax>) -> Result { let (lhs, rhs) = self; let lhs_dims = lhs.concrete(); let rhs_dims = rhs.concrete(); diff --git a/dfdx-core/src/tensor_ops/conv1d/cpu_kernel.rs b/dfdx-core/src/tensor_ops/conv1d/cpu_kernel.rs index 4383c9eb2..97798dd60 100644 --- a/dfdx-core/src/tensor_ops/conv1d/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/conv1d/cpu_kernel.rs @@ -34,7 +34,7 @@ impl Cpu { filters: &[E], out: &mut [E], buf: &mut [E], - ) -> Result<(), CpuError> + ) -> Result<(), Error> where Self: MatMulImpl, { @@ -85,7 +85,7 @@ impl Cpu { grad_filters_tr: &mut [E], grad_out: &[E], buf: &mut [E], - ) -> Result<(), CpuError> + ) -> Result<(), Error> where Self: MatMulImpl, { @@ -150,7 +150,7 @@ impl Conv1DKernel for Cpu where Self: MatMulImpl, { - fn alloc(&self, s: S) -> Result, Self::Err> { + fn alloc(&self, s: S) -> Result, Error> { self.try_zeros_like(&s) } @@ -160,7 +160,7 @@ where lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let patches = (op.chan_in, op.kernel, op.l_out); let mut patches = self.try_alloc_zeros::(patches.num_elements())?; let [lstride, ostride] = match L::NUM_DIMS { @@ -192,7 +192,7 @@ where grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let f_tr_shape = [ op.groups, op.chan_in / op.groups, diff --git a/dfdx-core/src/tensor_ops/conv1d/cuda_kernel.rs b/dfdx-core/src/tensor_ops/conv1d/cuda_kernel.rs index 4443d9da0..45779e26d 100644 --- a/dfdx-core/src/tensor_ops/conv1d/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/conv1d/cuda_kernel.rs @@ -4,7 +4,7 @@ use cudarc::driver::{DeviceRepr, LaunchAsync, ValidAsZeroBits}; use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor, Tensorlike}, + tensor::{launch_cfg, Cuda, Error, Tensor, Tensorlike}, }; use std::sync::Arc; @@ -73,7 +73,7 @@ where Self: HasCudaKernel, CudaBlas: Gemm, { - fn alloc(&self, shape: S) -> Result, Self::Err> { + fn alloc(&self, shape: S) -> Result, Error> { let data = unsafe { self.alloc_empty::(shape.num_elements()) }?; Ok(self.build_tensor(shape, shape.strides(), data)) } @@ -83,7 +83,7 @@ where img: &Tensor, fil: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FNS[0]) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, Self::FNS)?; } @@ -151,7 +151,7 @@ where grad_rhs: &mut Self::Vec, _: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { 
let patches_item_numel = op.chan_out * op.kernel * op.l_in; let patches_numel = op.batch * patches_item_numel; let filters_numel = diff --git a/dfdx-core/src/tensor_ops/conv1d/mod.rs b/dfdx-core/src/tensor_ops/conv1d/mod.rs index 25bdddbee..db51c02bf 100644 --- a/dfdx-core/src/tensor_ops/conv1d/mod.rs +++ b/dfdx-core/src/tensor_ops/conv1d/mod.rs @@ -23,7 +23,7 @@ pub(super) struct Conv1DOp { } pub(super) trait Conv1DKernel: Storage { - fn alloc(&self, s: S) -> Result, Self::Err>; + fn alloc(&self, s: S) -> Result, Error>; fn forward( &self, @@ -31,7 +31,7 @@ pub(super) trait Conv1DKernel: Storage { lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; #[allow(clippy::too_many_arguments)] fn backward( @@ -43,7 +43,7 @@ pub(super) trait Conv1DKernel: Storage { grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Applies a 1d convolution to a tensor. @@ -86,7 +86,6 @@ pub(super) trait Conv1DKernel: Storage { /// ``` pub trait TryConv1D: Sized { type Convolved; - type Error: std::fmt::Debug; /// Applies a 1D convolution to the input tensor. fn conv1d( @@ -106,7 +105,7 @@ pub trait TryConv1D: Sized { padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result; + ) -> Result; } impl< @@ -122,14 +121,13 @@ where Const<{ (DIM + 2 * PADDING - DILATION * (KERNEL - 1) - 1) / STRIDE + 1 }>: Sized, { type Convolved = Const<{ (DIM + 2 * PADDING - DILATION * (KERNEL - 1) - 1) / STRIDE + 1 }>; - type Error = std::convert::Infallible; fn try_conv1d( self, _: Const, _: Const, _: Const, _: Groups, - ) -> Result { + ) -> Result { Ok(Const) } } @@ -138,14 +136,13 @@ impl TryConv1D for (usize, Kernel) { type Convolved = usize; - type Error = std::convert::Infallible; fn try_conv1d( self, stride: Stride, padding: Padding, dilation: Dilation, _: Groups, - ) -> Result { + ) -> Result { let (dim, kernel) = self; Ok((dim + 2 * padding.size() - 1) .checked_sub(dilation.size() * (kernel.size() - 1)) @@ -187,15 +184,13 @@ where D, T, >; - type Error = D::Err; - fn try_conv1d( self, stride: Stride, padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result { + ) -> Result { let (img, filters) = self; let (inp_chan, l) = img.shape; let img = img.try_reshape_like(&(Const::<1>, inp_chan, l))?; @@ -239,15 +234,13 @@ where D, T, >; - type Error = D::Err; - fn try_conv1d( self, stride: Stride, padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result { + ) -> Result { let (img, filters) = self; assert_eq!(img.shape.1.size(), filters.shape.1.size() * groups.size()); let (batch, inp_chan, l) = img.shape; diff --git a/dfdx-core/src/tensor_ops/conv2d/cpu_kernel.rs b/dfdx-core/src/tensor_ops/conv2d/cpu_kernel.rs index 5edf859b1..f058fc422 100644 --- a/dfdx-core/src/tensor_ops/conv2d/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/conv2d/cpu_kernel.rs @@ -48,7 +48,7 @@ impl Cpu { filters: &[E], out: &mut [E], buf: &mut [E], - ) -> Result<(), CpuError> + ) -> Result<(), Error> where Self: MatMulImpl, { @@ -105,7 +105,7 @@ impl Cpu { grad_filters_tr: &mut [E], grad_out: &[E], buf: &mut [E], - ) -> Result<(), CpuError> + ) -> Result<(), Error> where Self: MatMulImpl, { @@ -175,7 +175,7 @@ impl Conv2DKernel for Cpu where Self: MatMulImpl, { - fn alloc(&self, s: S) -> Result, Self::Err> { + fn alloc(&self, s: S) -> Result, Error> { self.try_zeros_like(&s) } @@ -185,7 +185,7 @@ where lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { 
let patches = (op.chan_in, op.kernel, op.kernel, op.h_out, op.w_out); let mut patches = self.try_alloc_zeros::(patches.num_elements())?; let [lstride, ostride] = match L::NUM_DIMS { @@ -217,7 +217,7 @@ where grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let f_tr_shape = [ op.groups, op.chan_in / op.groups, diff --git a/dfdx-core/src/tensor_ops/conv2d/cuda_kernel.rs b/dfdx-core/src/tensor_ops/conv2d/cuda_kernel.rs index 4836ed65c..cb38ad192 100644 --- a/dfdx-core/src/tensor_ops/conv2d/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/conv2d/cuda_kernel.rs @@ -4,7 +4,7 @@ use cudarc::driver::{DeviceRepr, LaunchAsync, ValidAsZeroBits}; use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor, Tensorlike}, + tensor::{launch_cfg, Cuda, Error, Tensor, Tensorlike}, }; use std::sync::Arc; @@ -73,7 +73,7 @@ where Self: HasCudaKernel, CudaBlas: Gemm, { - fn alloc(&self, shape: S) -> Result, Self::Err> { + fn alloc(&self, shape: S) -> Result, Error> { let data = unsafe { self.alloc_empty::(shape.num_elements()) }?; Ok(self.build_tensor(shape, shape.strides(), data)) } @@ -83,7 +83,7 @@ where img: &Tensor, fil: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FNS[0]) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, Self::FNS)?; } @@ -152,7 +152,7 @@ where grad_rhs: &mut Self::Vec, _: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let patches_item_numel = op.chan_out * op.kernel * op.kernel * op.h_in * op.w_in; let patches_numel = op.batch * patches_item_numel; let filters_numel = op.groups diff --git a/dfdx-core/src/tensor_ops/conv2d/cudnn_kernel.rs b/dfdx-core/src/tensor_ops/conv2d/cudnn_kernel.rs index 8e614e5eb..872740243 100644 --- a/dfdx-core/src/tensor_ops/conv2d/cudnn_kernel.rs +++ b/dfdx-core/src/tensor_ops/conv2d/cudnn_kernel.rs @@ -4,7 +4,7 @@ use cudarc::driver::DeviceSlice; use crate::{ dtypes::*, shapes::*, - tensor::{Cuda, Tensor, Tensorlike}, + tensor::{Cuda, Error, Tensor, Tensorlike}, }; use std::sync::Arc; @@ -29,7 +29,7 @@ impl super::Conv2DKernel for Cuda where Self: HasCudnnKernel, { - fn alloc(&self, shape: S) -> Result, Self::Err> { + fn alloc(&self, shape: S) -> Result, Error> { let data = unsafe { self.alloc_empty::(shape.num_elements()) }?; Ok(self.build_tensor(shape, shape.strides(), data)) } @@ -39,7 +39,7 @@ where lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut conv = self.cudnn.create_conv2d::( [op.padding as i32, op.padding as i32], [op.stride as i32, op.stride as i32], @@ -97,7 +97,7 @@ where grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut conv = self.cudnn.create_conv2d::( [op.padding as i32, op.padding as i32], [op.stride as i32, op.stride as i32], diff --git a/dfdx-core/src/tensor_ops/conv2d/mod.rs b/dfdx-core/src/tensor_ops/conv2d/mod.rs index c61b4cb7d..c5be96945 100644 --- a/dfdx-core/src/tensor_ops/conv2d/mod.rs +++ b/dfdx-core/src/tensor_ops/conv2d/mod.rs @@ -29,7 +29,7 @@ pub(super) struct Conv2DOp { } pub(super) trait Conv2DKernel: Storage { - fn alloc(&self, s: S) -> Result, Self::Err>; + fn alloc(&self, s: S) -> Result, Error>; fn forward( &self, @@ -37,7 +37,7 @@ pub(super) trait Conv2DKernel: Storage { lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err>; + ) -> 
Result<(), Error>; #[allow(clippy::too_many_arguments)] fn backward( @@ -49,7 +49,7 @@ pub(super) trait Conv2DKernel: Storage { grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Apply the 2d convolution to a tensor. @@ -94,7 +94,6 @@ pub(super) trait Conv2DKernel: Storage { /// ``` pub trait TryConv2D: Sized { type Convolved; - type Error: std::fmt::Debug; /// Applies a 2D convolution to the input tensor. fn conv2d( @@ -114,7 +113,7 @@ pub trait TryConv2D: Sized { padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result; + ) -> Result; } impl< @@ -130,14 +129,13 @@ where Const<{ (DIM + 2 * PADDING - DILATION * (KERNEL - 1) - 1) / STRIDE + 1 }>: Sized, { type Convolved = Const<{ (DIM + 2 * PADDING - DILATION * (KERNEL - 1) - 1) / STRIDE + 1 }>; - type Error = std::convert::Infallible; fn try_conv2d( self, _: Const, _: Const, _: Const, _: Groups, - ) -> Result { + ) -> Result { Ok(Const) } } @@ -146,14 +144,13 @@ impl TryConv2D for (usize, Kernel) { type Convolved = usize; - type Error = std::convert::Infallible; fn try_conv2d( self, stride: Stride, padding: Padding, dilation: Dilation, _: Groups, - ) -> Result { + ) -> Result { let (dim, kernel) = self; Ok((dim + 2 * padding.size() - 1) .checked_sub(dilation.size() * (kernel.size() - 1)) @@ -208,7 +205,6 @@ where D, T, >; - type Error = D::Err; fn try_conv2d( self, @@ -216,7 +212,7 @@ where padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result { + ) -> Result { let (img, filters) = self; let (inp_chan, h, w) = img.shape; let img = img.try_reshape_like(&(Const::<1>, inp_chan, h, w))?; @@ -273,7 +269,6 @@ where D, T, >; - type Error = D::Err; fn try_conv2d( self, @@ -281,7 +276,7 @@ where padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result { + ) -> Result { let (img, filters) = self; assert_eq!(img.shape.1.size(), filters.shape.1.size() * groups.size()); assert_eq!(filters.shape.2, filters.shape.3); diff --git a/dfdx-core/src/tensor_ops/convtrans2d/cpu_kernel.rs b/dfdx-core/src/tensor_ops/convtrans2d/cpu_kernel.rs index f61943f84..d8e842248 100644 --- a/dfdx-core/src/tensor_ops/convtrans2d/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/convtrans2d/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::prelude::Tensorlike; use crate::shapes::{Dtype, Shape}; -use crate::tensor::{cpu::*, Tensor, ZerosTensor}; +use crate::tensor::{cpu::*, Error, Tensor, ZerosTensor}; use crate::tensor_ops::matmul::cpu_kernel::MatMulImpl; use std::sync::Arc; @@ -27,7 +27,7 @@ impl Cpu { filters_tr: &[E], out: &mut [E], buf: &mut [E], - ) -> Result<(), CpuError> + ) -> Result<(), Error> where Self: MatMulImpl, { @@ -107,7 +107,7 @@ impl Cpu { grad_filters: &mut [E], grad_out: &[E], buf: &mut [E], - ) -> Result<(), CpuError> + ) -> Result<(), Error> where Self: MatMulImpl, { @@ -179,7 +179,7 @@ impl ConvTrans2DKernel for Cpu where Self: MatMulImpl, { - fn alloc(&self, s: S) -> Result, Self::Err> { + fn alloc(&self, s: S) -> Result, Error> { self.try_zeros_like(&s) } @@ -189,7 +189,7 @@ where lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let patches = (op.chan_in, op.kernel, op.kernel, op.h_out, op.w_out); let mut patches = self.try_alloc_zeros::(patches.num_elements())?; let f_tr_shape = [ @@ -242,7 +242,7 @@ where grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let patches_shape = [op.chan_out, op.kernel, op.kernel, 
op.h_in, op.w_in]; let mut patches = self.try_alloc_zeros::(patches_shape.num_elements())?; diff --git a/dfdx-core/src/tensor_ops/convtrans2d/cuda_kernel.rs b/dfdx-core/src/tensor_ops/convtrans2d/cuda_kernel.rs index 6ddcd10c7..8ea312f95 100644 --- a/dfdx-core/src/tensor_ops/convtrans2d/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/convtrans2d/cuda_kernel.rs @@ -4,7 +4,7 @@ use cudarc::driver::{DeviceRepr, LaunchAsync, ValidAsZeroBits}; use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor, Tensorlike}, + tensor::{launch_cfg, Cuda, Error, Tensor, Tensorlike}, }; use std::sync::Arc; @@ -69,7 +69,7 @@ where Self: HasCudaKernel, CudaBlas: Gemm, { - fn alloc(&self, shape: S) -> Result, Self::Err> { + fn alloc(&self, shape: S) -> Result, Error> { let data = unsafe { self.alloc_empty::(shape.num_elements()) }?; Ok(self.build_tensor(shape, shape.strides(), data)) } @@ -80,7 +80,7 @@ where lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FNS[0]) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, Self::FNS)?; } @@ -160,7 +160,7 @@ where grad_rhs: &mut Self::Vec, _: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let patches_numel = op.batch * op.chan_out * op.kernel * op.kernel * op.h_in * op.w_in; let mut patches = unsafe { self.get_workspace::(patches_numel) }?; diff --git a/dfdx-core/src/tensor_ops/convtrans2d/mod.rs b/dfdx-core/src/tensor_ops/convtrans2d/mod.rs index dc56a2fc8..761ab4915 100644 --- a/dfdx-core/src/tensor_ops/convtrans2d/mod.rs +++ b/dfdx-core/src/tensor_ops/convtrans2d/mod.rs @@ -28,7 +28,7 @@ pub(super) struct ConvTrans2DOp { } pub(super) trait ConvTrans2DKernel: Storage { - fn alloc(&self, s: S) -> Result, Self::Err>; + fn alloc(&self, s: S) -> Result, Error>; fn forward( &self, @@ -36,7 +36,7 @@ pub(super) trait ConvTrans2DKernel: Storage { lhs: &Tensor, rhs: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; #[allow(clippy::too_many_arguments)] fn backward( @@ -48,12 +48,11 @@ pub(super) trait ConvTrans2DKernel: Storage { grad_rhs: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } pub trait TryConvTrans2D: Sized { type Convolved; - type Error: std::fmt::Debug; /// Applies a 2D convolution to the input tensor. 
fn convtrans2d( @@ -74,7 +73,7 @@ pub trait TryConvTrans2D: Sized { padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result; + ) -> Result; } impl< @@ -90,7 +89,6 @@ where Const<{ (DIM - 1) * STRIDE - 2 * PADDING + DILATION * (KERNEL - 1) + 1 }>: Sized, { type Convolved = Const<{ (DIM - 1) * STRIDE - 2 * PADDING + DILATION * (KERNEL - 1) + 1 }>; - type Error = std::convert::Infallible; fn try_convtrans2d( self, @@ -98,7 +96,7 @@ where _: Const, _: Const, _: Groups, - ) -> Result { + ) -> Result { Ok(Const) } } @@ -107,7 +105,6 @@ impl TryConvTrans2D for (usize, Kernel) { type Convolved = usize; - type Error = std::convert::Infallible; fn try_convtrans2d( self, @@ -115,7 +112,7 @@ impl padding: Padding, dilation: Dilation, _: Groups, - ) -> Result { + ) -> Result { let (dim, kernel) = self; Ok( ((dim - 1) * stride.size() + dilation.size() * (kernel.size() - 1) + 1) @@ -161,7 +158,6 @@ where D, T, >; - type Error = D::Err; fn try_convtrans2d( self, @@ -169,7 +165,7 @@ where padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result { + ) -> Result { let (img, filters) = self; let (inp_chan, h, w) = img.shape; let img = img.try_reshape_like(&(Const::<1>, inp_chan, h, w))?; @@ -229,7 +225,6 @@ where D, T, >; - type Error = D::Err; fn try_convtrans2d( self, @@ -237,7 +232,7 @@ where padding: Padding, dilation: Dilation, groups: Groups, - ) -> Result { + ) -> Result { let (img, filters) = self; assert_eq!(img.shape.1, filters.shape.0); assert_eq!(filters.shape.2, filters.shape.3); diff --git a/dfdx-core/src/tensor_ops/cos/mod.rs b/dfdx-core/src/tensor_ops/cos/mod.rs index 01bd69d4c..a18be20b4 100644 --- a/dfdx-core/src/tensor_ops/cos/mod.rs +++ b/dfdx-core/src/tensor_ops/cos/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor Result { + pub fn try_cos(self) -> Result { try_unary_op(CosKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/div/mod.rs b/dfdx-core/src/tensor_ops/div/mod.rs index 1907b3971..41b0fe586 100644 --- a/dfdx-core/src/tensor_ops/div/mod.rs +++ b/dfdx-core/src/tensor_ops/div/mod.rs @@ -47,9 +47,9 @@ where } /// Fallible version of [std::ops::Div]. 
See [div] -pub trait TryDiv: HasErr { +pub trait TryDiv { type Output; - fn try_div(self, rhs: Rhs) -> Result; + fn try_div(self, rhs: Rhs) -> Result; } impl, R> TryDiv> @@ -60,7 +60,7 @@ where { type Output = Self; /// See [div] - fn try_div(self, rhs: Tensor) -> Result { + fn try_div(self, rhs: Tensor) -> Result { try_binary_op(BinaryDivKernelOp, self, rhs) } } @@ -71,7 +71,7 @@ where { type Output = Self; /// See [div] - fn try_div(self, rhs: Rhs) -> Result { + fn try_div(self, rhs: Rhs) -> Result { let rhs: f64 = rhs.into(); let scalar = E::from_f64(rhs).unwrap(); try_unary_op(ScalarDivKernelOp { scalar }, self) diff --git a/dfdx-core/src/tensor_ops/dropout/cpu_kernel.rs b/dfdx-core/src/tensor_ops/dropout/cpu_kernel.rs index 3216a1fa9..4eed47cab 100644 --- a/dfdx-core/src/tensor_ops/dropout/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/dropout/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Dtype, Shape}, - tensor::{unique_id, Cpu, Tensor}, + tensor::{unique_id, Cpu, Error, Tensor}, }; use num_traits::Float; @@ -12,7 +12,7 @@ impl super::DropoutKernel for Cpu { &self, op: super::DropoutKernelOp, inp: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut rng = StdRng::seed_from_u64(op.seed); let dist = Bernoulli::new(op.prob).unwrap(); let mut out = Tensor { @@ -39,7 +39,7 @@ impl super::DropoutKernel for Cpu { inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut rng = StdRng::seed_from_u64(op.seed); let dist = Bernoulli::new(op.prob).unwrap(); debug_assert_eq!(grad_inp.len(), grad_out.len()); diff --git a/dfdx-core/src/tensor_ops/dropout/cuda_kernel.rs b/dfdx-core/src/tensor_ops/dropout/cuda_kernel.rs index fe66e508e..c13125f1a 100644 --- a/dfdx-core/src/tensor_ops/dropout/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/dropout/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use std::vec::Vec; @@ -48,7 +48,7 @@ where &self, op: super::DropoutKernelOp, inp: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mask = { let mut rng = StdRng::seed_from_u64(op.seed); let dist = Bernoulli::new(op.prob).unwrap(); @@ -78,7 +78,7 @@ where inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mask = { let mut rng = StdRng::seed_from_u64(op.seed); let dist = Bernoulli::new(op.prob).unwrap(); diff --git a/dfdx-core/src/tensor_ops/dropout/mod.rs b/dfdx-core/src/tensor_ops/dropout/mod.rs index 1528853db..0ef0a10fb 100644 --- a/dfdx-core/src/tensor_ops/dropout/mod.rs +++ b/dfdx-core/src/tensor_ops/dropout/mod.rs @@ -3,10 +3,7 @@ mod cpu_kernel; #[cfg(feature = "cuda")] mod cuda_kernel; -use crate::{ - shapes::*, - tensor::{PutTape, RandomU64, SplitTape, Storage, Tape, Tensor}, -}; +use crate::{shapes::*, tensor::*}; #[repr(C)] #[derive(Debug, Clone, Copy)] @@ -20,14 +17,14 @@ pub trait DropoutKernel: Storage + RandomU64 { &self, op: DropoutKernelOp, inp: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, op: DropoutKernelOp, inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Zeros elements with probability `p` and scales all elements by `1 / (1 - p)`. 
@@ -62,7 +59,7 @@ impl, T: Tape> Tensor self.try_dropout(prob).unwrap() } /// See [dropout] - pub fn try_dropout(self, prob: impl Into) -> Result { + pub fn try_dropout(self, prob: impl Into) -> Result { let seed = self.device.random_u64(); let prob = prob.into(); let op = DropoutKernelOp { seed, prob }; diff --git a/dfdx-core/src/tensor_ops/exp/mod.rs b/dfdx-core/src/tensor_ops/exp/mod.rs index d17c56f72..3d04959d0 100644 --- a/dfdx-core/src/tensor_ops/exp/mod.rs +++ b/dfdx-core/src/tensor_ops/exp/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor Result { + pub fn try_exp(self) -> Result { try_unary_op(ExpKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/fast_gelu/mod.rs b/dfdx-core/src/tensor_ops/fast_gelu/mod.rs index 831ef5db6..6a4b46df5 100644 --- a/dfdx-core/src/tensor_ops/fast_gelu/mod.rs +++ b/dfdx-core/src/tensor_ops/fast_gelu/mod.rs @@ -48,7 +48,7 @@ impl, T: Tape> Ten self.try_fast_gelu().unwrap() } /// See [fast_gelu] - pub fn try_fast_gelu(self) -> Result { + pub fn try_fast_gelu(self) -> Result { try_unary_op(FastGeLUKernelOp, self) } @@ -60,7 +60,7 @@ impl, T: Tape> Ten /// Use [Tensor::try_fast_gelu] instead #[deprecated(since = "0.12.0", note = "Use `Tensor::try_fast_gelu` instead")] - pub fn try_gelu(self) -> Result { + pub fn try_gelu(self) -> Result { self.try_fast_gelu() } } diff --git a/dfdx-core/src/tensor_ops/huber_error/mod.rs b/dfdx-core/src/tensor_ops/huber_error/mod.rs index b2ee13c39..38bd27b93 100644 --- a/dfdx-core/src/tensor_ops/huber_error/mod.rs +++ b/dfdx-core/src/tensor_ops/huber_error/mod.rs @@ -50,7 +50,7 @@ impl, T: Tape> Tensor { self, rhs: Tensor, delta: impl Into, - ) -> Result + ) -> Result where T: Merge, { diff --git a/dfdx-core/src/tensor_ops/ln/mod.rs b/dfdx-core/src/tensor_ops/ln/mod.rs index a144e3b6f..2e1ae0673 100644 --- a/dfdx-core/src/tensor_ops/ln/mod.rs +++ b/dfdx-core/src/tensor_ops/ln/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor Result { + pub fn try_ln(self) -> Result { try_unary_op(LnKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/log_softmax.rs b/dfdx-core/src/tensor_ops/log_softmax.rs index 9a8023899..487c33e5a 100644 --- a/dfdx-core/src/tensor_ops/log_softmax.rs +++ b/dfdx-core/src/tensor_ops/log_softmax.rs @@ -38,7 +38,7 @@ impl, T: Tape> Tensor { self.try_log_softmax::().unwrap() } /// See [log_softmax()] - pub fn try_log_softmax(self) -> Result + pub fn try_log_softmax(self) -> Result where S: ReduceShape, { diff --git a/dfdx-core/src/tensor_ops/logsumexp_to.rs b/dfdx-core/src/tensor_ops/logsumexp_to.rs index 8f43abf4f..c8573638b 100644 --- a/dfdx-core/src/tensor_ops/logsumexp_to.rs +++ b/dfdx-core/src/tensor_ops/logsumexp_to.rs @@ -2,7 +2,7 @@ use super::*; use crate::{shapes::*, tensor::*}; /// Reduction along multiple axes using [LogSumExp](https://en.wikipedia.org/wiki/LogSumExp). -pub trait LogSumExpTo: HasErr + HasShape { +pub trait LogSumExpTo: Sized + HasShape { /// [LogSumExp](https://en.wikipedia.org/wiki/LogSumExp) reduction. 
/// /// **Pytorch equivalent**: `t.exp().sum(Axes).log()` @@ -31,13 +31,13 @@ pub trait LogSumExpTo: HasErr + HasShape { self.try_logsumexp().unwrap() } /// Fallible version of [LogSumExpTo::logsumexp] - fn try_logsumexp(self) -> Result, Self::Err> + fn try_logsumexp(self) -> Result, Error> where Self::Shape: ReduceShapeTo; } impl, T: Tape> LogSumExpTo for Tensor { - fn try_logsumexp(self) -> Result, Self::Err> + fn try_logsumexp(self) -> Result, Error> where Self::Shape: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/matmul/cpu_kernel.rs b/dfdx-core/src/tensor_ops/matmul/cpu_kernel.rs index 9e9b497ae..bf3e6ce0e 100644 --- a/dfdx-core/src/tensor_ops/matmul/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/matmul/cpu_kernel.rs @@ -1,7 +1,7 @@ #![allow(clippy::needless_return)] use crate::shapes::*; -use crate::tensor::{Cpu, Tensor, ZerosTensor}; +use crate::tensor::{Cpu, Error, Tensor, ZerosTensor}; use std::sync::Arc; @@ -236,7 +236,7 @@ where &self, lhs: &Tensor<(M, K), E, Self>, rhs: &Tensor<(K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let (m, k) = lhs.shape; let n = rhs.shape.1; let mut out = self.try_zeros_like(&(m, n))?; @@ -259,7 +259,7 @@ where rhs: &Tensor<(K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (m, k) = lhs.shape; let n = rhs.shape.1; let strides = (m, n).strides(); @@ -295,7 +295,7 @@ where &self, lhs: &Tensor<(B, M, K), E, Self>, rhs: &Tensor<(K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let (batch, m, k) = lhs.shape; let n = rhs.shape.1; let mut out = self.try_zeros_like(&(batch, m, n))?; @@ -321,7 +321,7 @@ where rhs: &Tensor<(K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (batch, m, k) = lhs.shape; let n = rhs.shape.1; let strides = (batch, m, n).strides(); @@ -359,7 +359,7 @@ where &self, lhs: &Tensor<(B, M, K), E, Self>, rhs: &Tensor<(B, K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let (b, m, k) = lhs.shape; let n = rhs.shape.2; let mut out = self.try_zeros_like(&(b, m, n))?; @@ -387,7 +387,7 @@ where rhs: &Tensor<(B, K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (b, m, k) = lhs.shape; let n = rhs.shape.2; let strides = (b, m, n).strides(); @@ -425,7 +425,7 @@ where &self, lhs: &Tensor<(B, S, M, K), E, Self>, rhs: &Tensor<(B, S, K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let (b, s, m, k) = lhs.shape; let n = rhs.shape.3; let mut out = self.try_zeros_like(&(b, s, m, n))?; @@ -453,7 +453,7 @@ where rhs: &Tensor<(B, S, K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (b, s, m, k) = lhs.shape; let n = rhs.shape.3; let strides = (b, s, m, n).strides(); diff --git a/dfdx-core/src/tensor_ops/matmul/cuda_kernel.rs b/dfdx-core/src/tensor_ops/matmul/cuda_kernel.rs index b6787d848..179ef06f2 100644 --- a/dfdx-core/src/tensor_ops/matmul/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/matmul/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{cuda::Cuda, Tensor}, + tensor::{cuda::Cuda, Error, Tensor}, }; use cudarc::{ @@ -245,7 +245,7 @@ where &self, lhs: &Tensor<(M, K), E, Self>, rhs: &Tensor<(K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let (m, _) = lhs.shape; let (k, n) = rhs.shape; let shape = (m, n); @@ -275,7 
+275,7 @@ where rhs: &Tensor<(K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (m, _) = lhs.shape; let (k, n) = rhs.shape; let strides = (m, n).strides(); @@ -320,7 +320,7 @@ where &self, lhs: &Tensor<(B, M, K), E, Self>, rhs: &Tensor<(K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let (batch, m, _) = lhs.shape; let (k, n) = rhs.shape; let shape = (batch, m, n); @@ -347,7 +347,7 @@ where rhs: &Tensor<(K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (batch, m, _) = lhs.shape; let (k, n) = rhs.shape; let strides = (batch, m, n).strides(); @@ -396,7 +396,7 @@ where &self, lhs: &Tensor<(B, M, K), E, Self>, rhs: &Tensor<(B, K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { assert_ne!(lhs.strides[0], 0); assert_ne!(rhs.strides[0], 0); let (batch, m, _) = lhs.shape; @@ -425,7 +425,7 @@ where rhs: &Tensor<(B, K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (batch, m, _) = lhs.shape; let (_, k, n) = rhs.shape; let strides = (batch, m, n).strides(); @@ -470,7 +470,7 @@ where &self, lhs: &Tensor<(B, S, M, K), E, Self>, rhs: &Tensor<(B, S, K, N), E, Self>, - ) -> Result, Self::Err> { + ) -> Result, Error> { assert_ne!(lhs.strides[0], 0); assert_ne!(rhs.strides[0], 0); assert_ne!(lhs.strides[1], 0); @@ -527,7 +527,7 @@ where rhs: &Tensor<(B, S, K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let (batch, seq, m, _) = lhs.shape; let (_, _, k, n) = rhs.shape; let strides = (batch, seq, m, n).strides(); diff --git a/dfdx-core/src/tensor_ops/matmul/mod.rs b/dfdx-core/src/tensor_ops/matmul/mod.rs index 5247f4f88..b5ece54b2 100644 --- a/dfdx-core/src/tensor_ops/matmul/mod.rs +++ b/dfdx-core/src/tensor_ops/matmul/mod.rs @@ -7,7 +7,7 @@ pub(super) mod cuda_kernel; use crate::{ shapes::{Const, Dim, Dtype, Shape}, - tensor::{HasErr, Merge, PutTape, SplitTape, Storage, Tape, Tensor}, + tensor::{Error, Merge, PutTape, SplitTape, Storage, Tape, Tensor}, }; use super::reshape_to::{ReshapeKernel, ReshapeTo}; @@ -68,12 +68,12 @@ where } /// Fallible matrix multiplication. See [matmul] for examples. 
-pub trait TryMatMul: HasErr { +pub trait TryMatMul: Sized { type Output; fn matmul(self, rhs: Rhs) -> Self::Output { self.try_matmul(rhs).unwrap() } - fn try_matmul(self, rhs: Rhs) -> Result; + fn try_matmul(self, rhs: Rhs) -> Result; } #[rustfmt::skip] @@ -85,14 +85,14 @@ fn try_binary_op< D: Storage, RhsTape: Tape, LhsTape: Tape + Merge, - Fwd: 'static + FnMut(&D, &Tensor, &Tensor) -> Result, D::Err>, - Bwd: 'static + FnMut(&D, &Tensor, &mut D::Vec, &Tensor, &mut D::Vec, &D::Vec) -> Result<(), D::Err>, + Fwd: 'static + FnMut(&D, &Tensor, &Tensor) -> Result, crate::tensor::Error>, + Bwd: 'static + FnMut(&D, &Tensor, &mut D::Vec, &Tensor, &mut D::Vec, &D::Vec) -> Result<(), crate::tensor::Error>, >( lhs: Tensor, rhs: Tensor, mut fwd: Fwd, mut bwd: Bwd, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { let (lhs, ltape) = lhs.split_tape(); let (rhs, rtape) = rhs.split_tape(); let mut tape = ltape.merge(rtape); @@ -115,7 +115,7 @@ pub trait MatMatKernel: Storage { &self, lhs: &Tensor<(M, K), E, Self>, rhs: &Tensor<(K, N), E, Self>, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, @@ -124,7 +124,7 @@ pub trait MatMatKernel: Storage { rhs: &Tensor<(K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl + Merge, R: Tape> @@ -133,7 +133,7 @@ where D: MatMatKernel + ReshapeKernel, { type Output = Tensor<(M, N), E, D, T>; - fn try_matmul(self, rhs: Tensor<(N,), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(N,), E, D, R>) -> Result { let m = self.shape.0; let n = rhs.shape.0; let lhs = self.try_reshape_like(&(m, Const::<1>))?; @@ -147,7 +147,7 @@ where D: MatMatKernel + ReshapeKernel, { type Output = Tensor<(N,), E, D, T>; - fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result { let k1 = self.shape.0; let (k2, n) = rhs.shape; assert_eq!(k1, k2); @@ -162,7 +162,7 @@ where D: MatMatKernel + ReshapeKernel, { type Output = Tensor<(M,), E, D, T>; - fn try_matmul(self, rhs: Tensor<(K,), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(K,), E, D, R>) -> Result { let (m, k1) = self.shape; let k2 = rhs.shape.0; assert_eq!(k1, k2); @@ -185,7 +185,7 @@ where /// let y: Tensor, f32, _> = dev.zeros(); /// let _: Tensor, f32, _> = x.try_matmul(y); /// ``` - fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result { assert_eq!(self.shape.1, rhs.shape.0); try_binary_op(self, rhs, D::forward, D::backward) } @@ -196,7 +196,7 @@ pub trait MatMatBrKernel: Storage { &self, lhs: &Tensor<(B, M, K), E, Self>, rhs: &Tensor<(K, N), E, Self>, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, @@ -205,7 +205,7 @@ pub trait MatMatBrKernel: Storage { rhs: &Tensor<(K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl, T, R> @@ -222,7 +222,7 @@ where /// let y: Tensor, f32, _> = dev.zeros(); /// let _: Tensor, f32, _> = x.try_matmul(y); /// ``` - fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result { assert_eq!(self.shape.2, rhs.shape.0); try_binary_op(self, rhs, D::forward, D::backward) } @@ -233,7 +233,7 @@ pub trait MatMatBatch3Kernel: Storage { &self, lhs: &Tensor<(B, M, K), E, Self>, rhs: &Tensor<(B, K, N), E, Self>, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, @@ -242,7 +242,7 @@ 
pub trait MatMatBatch3Kernel: Storage { rhs: &Tensor<(B, K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl TryMatMul> @@ -260,7 +260,7 @@ where /// let y: Tensor, f32, _> = dev.zeros(); /// let _: Tensor, f32, _> = x.try_matmul(y); /// ``` - fn try_matmul(self, rhs: Tensor<(B, K, N), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(B, K, N), E, D, R>) -> Result { assert_eq!(self.shape.0, rhs.shape.0); assert_eq!(self.shape.2, rhs.shape.1); try_binary_op(self, rhs, D::forward, D::backward) @@ -272,7 +272,7 @@ pub trait MatMatBatch4Kernel: Storage { &self, lhs: &Tensor<(B, S, M, K), E, Self>, rhs: &Tensor<(B, S, K, N), E, Self>, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, @@ -281,7 +281,7 @@ pub trait MatMatBatch4Kernel: Storage { rhs: &Tensor<(B, S, K, N), E, Self>, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl @@ -299,7 +299,7 @@ where /// let y: Tensor, f32, _> = dev.zeros(); /// let _: Tensor, f32, _> = x.try_matmul(y); /// ``` - fn try_matmul(self, rhs: Tensor<(B, S, K, N), E, D, R>) -> Result { + fn try_matmul(self, rhs: Tensor<(B, S, K, N), E, D, R>) -> Result { assert_eq!(self.shape.0, rhs.shape.0); assert_eq!(self.shape.1, rhs.shape.1); assert_eq!(self.shape.3, rhs.shape.2); diff --git a/dfdx-core/src/tensor_ops/max_to/cpu_kernel.rs b/dfdx-core/src/tensor_ops/max_to/cpu_kernel.rs index 049028bac..1308e84f6 100644 --- a/dfdx-core/src/tensor_ops/max_to/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/max_to/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Axes, Dtype, HasAxes, ReduceShapeTo, Shape}, - tensor::{Cpu, Tensor, ZerosTensor}, + tensor::{Cpu, Error, Tensor, ZerosTensor}, tensor_ops::utilities::reduction_utils::index_for_reductions, }; @@ -11,7 +11,7 @@ impl super::MaxReduceKernel for Cpu { &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo, { @@ -44,7 +44,7 @@ impl super::MaxReduceKernel for Cpu { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/max_to/cuda_kernel.rs b/dfdx-core/src/tensor_ops/max_to/cuda_kernel.rs index 31e384a6a..1926a24b3 100644 --- a/dfdx-core/src/tensor_ops/max_to/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/max_to/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, tensor_ops::reduction_utils::*, }; @@ -51,7 +51,7 @@ where &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo, { @@ -103,7 +103,7 @@ where grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/max_to/mod.rs b/dfdx-core/src/tensor_ops/max_to/mod.rs index 93eac94b3..462c6e561 100644 --- a/dfdx-core/src/tensor_ops/max_to/mod.rs +++ b/dfdx-core/src/tensor_ops/max_to/mod.rs @@ -10,7 +10,7 @@ pub trait MaxReduceKernel: Storage { &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo; fn backward( @@ -19,13 +19,13 @@ pub trait MaxReduceKernel: Storage { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo; } /// Reduction along 
multiple axes using `max`. -pub trait MaxTo: HasErr + HasShape { +pub trait MaxTo: Sized + HasShape { /// Max reduction. **Pytorch equivalent**: `t.amax(Ax)` /// /// **NOTE** This evenly distributes gradients between all equal maximum values, instead @@ -55,13 +55,13 @@ pub trait MaxTo: HasErr + HasShape { self.try_max().unwrap() } /// Fallible version of [MaxTo::max] - fn try_max(self) -> Result, Self::Err> + fn try_max(self) -> Result, Error> where Self::Shape: ReduceShapeTo; } impl, T: Tape> MaxTo for Tensor { - fn try_max(self) -> Result, Self::Err> + fn try_max(self) -> Result, Error> where Self::Shape: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/maximum/mod.rs b/dfdx-core/src/tensor_ops/maximum/mod.rs index 4d042c468..3ef180774 100644 --- a/dfdx-core/src/tensor_ops/maximum/mod.rs +++ b/dfdx-core/src/tensor_ops/maximum/mod.rs @@ -39,7 +39,10 @@ impl, LTape: Tape> Tensor } /// See [maximum] - pub fn try_maximum(self, rhs: Tensor) -> Result + pub fn try_maximum( + self, + rhs: Tensor, + ) -> Result where LTape: Merge, { diff --git a/dfdx-core/src/tensor_ops/mean_to.rs b/dfdx-core/src/tensor_ops/mean_to.rs index fb0b2f851..136d64d53 100644 --- a/dfdx-core/src/tensor_ops/mean_to.rs +++ b/dfdx-core/src/tensor_ops/mean_to.rs @@ -2,7 +2,7 @@ use super::*; use crate::{shapes::*, tensor::*}; /// Reduction along multiple axes using `mean`. -pub trait MeanTo: HasErr + HasShape { +pub trait MeanTo: Sized + HasShape { /// Mean reduction. **Pytorch equivalent**: `t.mean(Axes)` /// /// Example: @@ -29,13 +29,13 @@ pub trait MeanTo: HasErr + HasShape { self.try_mean().unwrap() } /// Fallible version of [MeanTo::mean] - fn try_mean(self) -> Result, Self::Err> + fn try_mean(self) -> Result, Error> where Self::Shape: HasAxes + ReduceShapeTo; } impl, T: Tape> MeanTo for Tensor { - fn try_mean(self) -> Result, Self::Err> + fn try_mean(self) -> Result, Error> where Self::Shape: HasAxes + ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/min_to/cpu_kernel.rs b/dfdx-core/src/tensor_ops/min_to/cpu_kernel.rs index e44302622..9b7f35491 100644 --- a/dfdx-core/src/tensor_ops/min_to/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/min_to/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Axes, Dtype, HasAxes, ReduceShapeTo, Shape}, - tensor::{Cpu, Tensor, ZerosTensor}, + tensor::{Cpu, Error, Tensor, ZerosTensor}, tensor_ops::utilities::reduction_utils::index_for_reductions, }; @@ -11,7 +11,7 @@ impl super::MinReduceKernel for Cpu { &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo, { @@ -44,7 +44,7 @@ impl super::MinReduceKernel for Cpu { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/min_to/cuda_kernel.rs b/dfdx-core/src/tensor_ops/min_to/cuda_kernel.rs index 8db3c1b5a..1e493b12e 100644 --- a/dfdx-core/src/tensor_ops/min_to/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/min_to/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, tensor_ops::reduction_utils::*, }; @@ -51,7 +51,7 @@ where &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo, { @@ -103,7 +103,7 @@ where grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/min_to/mod.rs 
b/dfdx-core/src/tensor_ops/min_to/mod.rs index ae426f4f1..9cef2da54 100644 --- a/dfdx-core/src/tensor_ops/min_to/mod.rs +++ b/dfdx-core/src/tensor_ops/min_to/mod.rs @@ -10,7 +10,7 @@ pub trait MinReduceKernel: Storage { &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo; fn backward( @@ -19,13 +19,13 @@ pub trait MinReduceKernel: Storage { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo; } /// Reduction along multiple axes using `min`. -pub trait MinTo: HasErr + HasShape { +pub trait MinTo: Sized + HasShape { /// Min reduction. **Pytorch equivalent**: `t.amin(Ax)` /// /// **NOTE** This evenly distributes gradients between all equal maximum values, instead @@ -55,13 +55,13 @@ pub trait MinTo: HasErr + HasShape { self.try_min().unwrap() } /// Fallible version of [MinTo::min] - fn try_min(self) -> Result, Self::Err> + fn try_min(self) -> Result, Error> where Self::Shape: ReduceShapeTo; } impl, T: Tape> MinTo for Tensor { - fn try_min(self) -> Result, Self::Err> + fn try_min(self) -> Result, Error> where Self::Shape: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/minimum/mod.rs b/dfdx-core/src/tensor_ops/minimum/mod.rs index e93c38426..adcc6dfbc 100644 --- a/dfdx-core/src/tensor_ops/minimum/mod.rs +++ b/dfdx-core/src/tensor_ops/minimum/mod.rs @@ -39,7 +39,10 @@ impl, LTape: Tape> Tensor } /// See [minimum] - pub fn try_minimum(self, rhs: Tensor) -> Result + pub fn try_minimum( + self, + rhs: Tensor, + ) -> Result where LTape: Merge, { diff --git a/dfdx-core/src/tensor_ops/mul/mod.rs b/dfdx-core/src/tensor_ops/mul/mod.rs index 0d345de14..8179509d3 100644 --- a/dfdx-core/src/tensor_ops/mul/mod.rs +++ b/dfdx-core/src/tensor_ops/mul/mod.rs @@ -46,9 +46,9 @@ where } /// Fallible version of [std::ops::Mul]. See [mul]. 
-pub trait TryMul: HasErr { +pub trait TryMul { type Output; - fn try_mul(self, rhs: Rhs) -> Result; + fn try_mul(self, rhs: Rhs) -> Result; } impl, LhsTape: Tape, R> @@ -57,7 +57,7 @@ where LhsTape: Merge, { type Output = Self; - fn try_mul(self, rhs: Tensor) -> Result { + fn try_mul(self, rhs: Tensor) -> Result { try_binary_op(BinaryMulKernelOp, self, rhs) } } @@ -67,7 +67,7 @@ where D: UnaryKernel, E>, { type Output = Self; - fn try_mul(self, rhs: Rhs) -> Result { + fn try_mul(self, rhs: Rhs) -> Result { let rhs: f64 = rhs.into(); let scalar: E = E::from_f64(rhs).unwrap(); try_unary_op(ScalarMulKernelOp { scalar }, self) diff --git a/dfdx-core/src/tensor_ops/nans_to/mod.rs b/dfdx-core/src/tensor_ops/nans_to/mod.rs index 9b511d5e7..176bb1c5e 100644 --- a/dfdx-core/src/tensor_ops/nans_to/mod.rs +++ b/dfdx-core/src/tensor_ops/nans_to/mod.rs @@ -35,7 +35,7 @@ impl, E>, T: Tape> Te self.try_nans_to(value).unwrap() } /// See [nans_to] - pub fn try_nans_to(self, value: impl Into) -> Result { + pub fn try_nans_to(self, value: impl Into) -> Result { let value = E::from_f64(value.into()).unwrap(); try_unary_op(NansToKernelOp(value), self) } diff --git a/dfdx-core/src/tensor_ops/negate/mod.rs b/dfdx-core/src/tensor_ops/negate/mod.rs index 89bf985c6..f6ad27db0 100644 --- a/dfdx-core/src/tensor_ops/negate/mod.rs +++ b/dfdx-core/src/tensor_ops/negate/mod.rs @@ -30,7 +30,7 @@ impl, T: Tape> Tenso pub fn negate(self) -> Self { self.try_negate().unwrap() } - pub fn try_negate(self) -> Result { + pub fn try_negate(self) -> Result { try_unary_op(NegateKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/normalize.rs b/dfdx-core/src/tensor_ops/normalize.rs index 49e13994d..46e2b0fd5 100644 --- a/dfdx-core/src/tensor_ops/normalize.rs +++ b/dfdx-core/src/tensor_ops/normalize.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Axes, Dtype, ReduceShape, Shape}, - tensor::{HasErr, Tape, Tensor}, + tensor::{Error, Tape, Tensor}, }; use super::{BroadcastTo, Device, MeanTo, TryAdd, TryDiv, TrySub}; @@ -32,10 +32,7 @@ impl, T: Tape> Tensor { } /// See [normalize] - pub fn try_normalize( - self, - epsilon: impl Into, - ) -> Result::Err> + pub fn try_normalize(self, epsilon: impl Into) -> Result where S: ReduceShape, { diff --git a/dfdx-core/src/tensor_ops/permute_to.rs b/dfdx-core/src/tensor_ops/permute_to.rs index 920d46e62..372ec9df2 100644 --- a/dfdx-core/src/tensor_ops/permute_to.rs +++ b/dfdx-core/src/tensor_ops/permute_to.rs @@ -21,7 +21,7 @@ use crate::{shapes::*, tensor::*}; /// let b: Tensor, f32, _> = a.permute::<_, Axes2<1, 0>>(); /// assert_eq!(b.array(), [[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]]); /// ``` -pub trait PermuteTo: HasErr + HasShape { +pub trait PermuteTo: Sized + HasShape { /// Permutes the tensor. 
fn permute(self) -> Self::WithShape where @@ -30,13 +30,13 @@ pub trait PermuteTo: HasErr + HasShape { self.try_permute().unwrap() } /// Fallible version of [PermuteTo::permute] - fn try_permute(self) -> Result, Self::Err> + fn try_permute(self) -> Result, Error> where Self::Shape: PermuteShapeTo; } impl, T: Tape> PermuteTo for Tensor { - fn try_permute(self) -> Result, Self::Err> + fn try_permute(self) -> Result, Error> where Self::Shape: PermuteShapeTo, { diff --git a/dfdx-core/src/tensor_ops/pool2d/cpu_kernel.rs b/dfdx-core/src/tensor_ops/pool2d/cpu_kernel.rs index 478b19738..aaefa32eb 100644 --- a/dfdx-core/src/tensor_ops/pool2d/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/pool2d/cpu_kernel.rs @@ -59,7 +59,7 @@ impl super::Pool2DKind { } impl super::Pool2DKernel for Cpu { - fn alloc(&self, s: S) -> Result, Self::Err> { + fn alloc(&self, s: S) -> Result, Error> { self.try_zeros_like(&s) } fn forward( @@ -67,7 +67,7 @@ impl super::Pool2DKernel for Cpu { op: super::Pool2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let istr = make_4d::(inp.strides); let ostr = make_4d::(out.strides); @@ -106,7 +106,7 @@ impl super::Pool2DKernel for Cpu { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let istr = make_4d::(inp.strides); let ostr = make_4d::(out.strides); diff --git a/dfdx-core/src/tensor_ops/pool2d/cuda_kernel.rs b/dfdx-core/src/tensor_ops/pool2d/cuda_kernel.rs index 720a516a0..6ab840ab9 100644 --- a/dfdx-core/src/tensor_ops/pool2d/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/pool2d/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use std::sync::Arc; @@ -51,7 +51,7 @@ impl super::Pool2DKernel for Cuda where Self: HasCudaKernel, { - fn alloc(&self, s: S) -> Result, Self::Err> { + fn alloc(&self, s: S) -> Result, Error> { let data = unsafe { self.alloc_empty::(s.num_elements()) }?; Ok(self.build_tensor(s, s.strides(), data)) } @@ -60,7 +60,7 @@ where op: super::Pool2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::FWD, Self::FWD) { self.dev .load_ptx(PTX_SRC.into(), Self::FWD, &[Self::FWD, Self::BWD])?; @@ -87,7 +87,7 @@ where grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let inp_strides = self.dev.htod_copy(make_4d::(inp.strides).into())?; let out_strides = self.dev.htod_copy(make_4d::(out.strides).into())?; let bwd_fn = self.dev.get_func(Self::FWD, Self::BWD).unwrap(); diff --git a/dfdx-core/src/tensor_ops/pool2d/mod.rs b/dfdx-core/src/tensor_ops/pool2d/mod.rs index 0281d8971..150525c70 100644 --- a/dfdx-core/src/tensor_ops/pool2d/mod.rs +++ b/dfdx-core/src/tensor_ops/pool2d/mod.rs @@ -32,14 +32,14 @@ pub struct Pool2DOp { } pub(super) trait Pool2DKernel: Storage { - fn alloc(&self, s: S) -> Result, Self::Err>; + fn alloc(&self, s: S) -> Result, Error>; fn forward( &self, op: Pool2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; #[allow(clippy::too_many_arguments)] fn backward( @@ -49,12 +49,11 @@ pub(super) trait Pool2DKernel: Storage { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } pub trait TryPool2D: Sized { type Pooled; - type Error: std::fmt::Debug; fn pool2d( self, @@ -75,7 +74,7 @@ pub trait 
TryPool2D: Sized { stride: Stride, padding: Padding, dilation: Dilation, - ) -> Result; + ) -> Result; } impl< @@ -89,7 +88,6 @@ where Const<{ (DIM + 2 * PADDING - DILATION * (KERNEL - 1) - 1) / STRIDE + 1 }>: Sized, { type Pooled = Const<{ (DIM + 2 * PADDING - DILATION * (KERNEL - 1) - 1) / STRIDE + 1 }>; - type Error = std::convert::Infallible; fn try_pool2d( self, _: Pool2DKind, @@ -97,7 +95,7 @@ where _: Const, _: Const, _: Const, - ) -> Result { + ) -> Result { Ok(Const) } } @@ -106,7 +104,6 @@ impl TryPool2D for usize { type Pooled = usize; - type Error = std::convert::Infallible; fn try_pool2d( self, _: Pool2DKind, @@ -114,7 +111,7 @@ impl stride: Stride, padding: Padding, dilation: Dilation, - ) -> Result { + ) -> Result { Ok((self + 2 * padding.size() - 1) .checked_sub(dilation.size() * (kernel.size() - 1)) .unwrap() @@ -140,7 +137,6 @@ where T: Tape, { type Pooled = Tensor<(Chan, H::Pooled, W::Pooled), E, D, T>; - type Error = D::Err; fn try_pool2d( self, @@ -149,7 +145,7 @@ where stride: Stride, padding: Padding, dilation: Dilation, - ) -> Result { + ) -> Result { let (chan, h, w) = self.shape; let img = self.try_reshape_like(&(Const::<1>, chan, h, w))?; let out = img.try_pool2d(kind, kernel, stride, padding, dilation)?; @@ -176,7 +172,6 @@ where T: Tape, { type Pooled = Tensor<(Batch, Chan, H::Pooled, W::Pooled), E, D, T>; - type Error = D::Err; fn try_pool2d( self, @@ -185,7 +180,7 @@ where stride: Stride, padding: Padding, dilation: Dilation, - ) -> Result { + ) -> Result { let (batch, chan, h, w) = self.shape; if self.strides != self.shape.strides() { panic!("Image input to pool2d must be contiguous"); diff --git a/dfdx-core/src/tensor_ops/pow/cuda_kernel.rs b/dfdx-core/src/tensor_ops/pow/cuda_kernel.rs index e698e2f4d..7d58027e3 100644 --- a/dfdx-core/src/tensor_ops/pow/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/pow/cuda_kernel.rs @@ -41,7 +41,7 @@ where &self, op: super::PowiKernelOp, inp: Cow>, - ) -> Result, Self::Err> { + ) -> Result, Error> { self.forward(super::PowfKernelOp(E::from_i32(op.0).unwrap()), inp) } @@ -52,7 +52,7 @@ where grad_inp: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { self.backward( super::PowfKernelOp(E::from_i32(op.0).unwrap()), inp, diff --git a/dfdx-core/src/tensor_ops/pow/mod.rs b/dfdx-core/src/tensor_ops/pow/mod.rs index 1d33ad243..bce8cff1e 100644 --- a/dfdx-core/src/tensor_ops/pow/mod.rs +++ b/dfdx-core/src/tensor_ops/pow/mod.rs @@ -34,7 +34,7 @@ impl, E>, T: Tape> Tens self.try_powf(exponent).unwrap() } /// See [powf] - pub fn try_powf(self, exponent: impl Into) -> Result { + pub fn try_powf(self, exponent: impl Into) -> Result { let exponent = E::from_f64(exponent.into()).unwrap(); try_unary_op(PowfKernelOp(exponent), self) } @@ -60,7 +60,7 @@ impl, T: Tape> Tensor< self.try_powi(exponent).unwrap() } /// See [powi] - pub fn try_powi(self, exponent: i32) -> Result { + pub fn try_powi(self, exponent: i32) -> Result { try_unary_op(PowiKernelOp(exponent), self) } } diff --git a/dfdx-core/src/tensor_ops/prelu.rs b/dfdx-core/src/tensor_ops/prelu.rs index 0bc018ca5..29d81c472 100644 --- a/dfdx-core/src/tensor_ops/prelu.rs +++ b/dfdx-core/src/tensor_ops/prelu.rs @@ -49,12 +49,12 @@ pub fn leakyrelu, T: Tape>( /// let r = prelu(t, a); /// assert_eq!(r.array(), [-0.05, 0.0, 1.0, 2.0]); /// ``` -pub trait TryPReLU: HasErr { +pub trait TryPReLU: Sized { fn prelu(self, rhs: T) -> Self { self.try_prelu(rhs).unwrap() } - fn try_prelu(self, rhs: T) -> Result; + fn try_prelu(self, 
rhs: T) -> Result; } impl, R> TryPReLU> @@ -64,7 +64,7 @@ where LhsTape: Merge, { /// See [prelu] - fn try_prelu(self, rhs: Tensor) -> Result { + fn try_prelu(self, rhs: Tensor) -> Result { let scaled = self.with_empty_tape().try_mul(rhs)?; self.try_lt(E::default())?.try_choose(scaled, self) } @@ -72,7 +72,7 @@ where impl, T: Tape> TryPReLU for Tensor { /// See [prelu] - fn try_prelu(self, rhs: E) -> Result { + fn try_prelu(self, rhs: E) -> Result { let dev = self.device.clone(); let scale = dev.tensor(rhs).retaped::().broadcast_like(self.shape()); let scaled = self.with_empty_tape().try_mul(scale)?; diff --git a/dfdx-core/src/tensor_ops/realize_to.rs b/dfdx-core/src/tensor_ops/realize_to.rs index 455ed62f1..894b2e86e 100644 --- a/dfdx-core/src/tensor_ops/realize_to.rs +++ b/dfdx-core/src/tensor_ops/realize_to.rs @@ -13,7 +13,7 @@ use crate::{shapes::*, tensor::*}; /// Err(old) => println!("Shape could not be realized, returned the original tensor"), /// } /// ``` -pub trait RealizeTo: HasErr + HasShape { +pub trait RealizeTo: Sized + HasShape { /// Realizes the concrete shape of the tensor as another compatable shape, /// or returns the original tensor if the new shape's dimensions are incompatable. fn realize::Shape as Shape>::Concrete>>( diff --git a/dfdx-core/src/tensor_ops/recip/mod.rs b/dfdx-core/src/tensor_ops/recip/mod.rs index 3a1d9157d..359226331 100644 --- a/dfdx-core/src/tensor_ops/recip/mod.rs +++ b/dfdx-core/src/tensor_ops/recip/mod.rs @@ -31,7 +31,7 @@ impl, T: Tape> Tensor self.try_recip().unwrap() } /// See [recip] - pub fn try_recip(self) -> Result { + pub fn try_recip(self) -> Result { try_unary_op(RecipKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/relu/mod.rs b/dfdx-core/src/tensor_ops/relu/mod.rs index 0701741c3..1b8dc8b1e 100644 --- a/dfdx-core/src/tensor_ops/relu/mod.rs +++ b/dfdx-core/src/tensor_ops/relu/mod.rs @@ -34,7 +34,7 @@ impl, T: Tape> Tensor< self.try_relu().unwrap() } /// See [relu] - pub fn try_relu(self) -> Result { + pub fn try_relu(self) -> Result { try_unary_op(ReLUKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/reshape_to/cpu_kernel.rs b/dfdx-core/src/tensor_ops/reshape_to/cpu_kernel.rs index 15a00e75d..31bad33a7 100644 --- a/dfdx-core/src/tensor_ops/reshape_to/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/reshape_to/cpu_kernel.rs @@ -1,7 +1,7 @@ use crate::shapes::{Dtype, Shape}; use crate::tensor::{ cpu::{LendingIterator, NdIndex}, - Cpu, Tensor, ZerosTensor, + Cpu, Error, Tensor, ZerosTensor, }; impl super::ReshapeKernel for Cpu { @@ -9,7 +9,7 @@ impl super::ReshapeKernel for Cpu { &self, dst: &Dst, inp: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut out = self.try_zeros_like(dst)?; let mut inp_iter = inp.iter(); let mut out_iter = out.iter_mut(); @@ -24,7 +24,7 @@ impl super::ReshapeKernel for Cpu { inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut inp_idx = NdIndex::new(inp.shape, inp.strides); let mut out_idx = NdIndex::new(*dst, dst.strides()); while let Some((i, o)) = inp_idx.next().zip(out_idx.next()) { diff --git a/dfdx-core/src/tensor_ops/reshape_to/cuda_kernel.rs b/dfdx-core/src/tensor_ops/reshape_to/cuda_kernel.rs index 607434f36..33612c01e 100644 --- a/dfdx-core/src/tensor_ops/reshape_to/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/reshape_to/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::{ driver::{DeviceSlice, 
LaunchAsync}, @@ -15,7 +15,7 @@ impl super::ReshapeKernel for Cuda { &self, dst: &Dst, inp: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let module = std::format!("reshape_fwd_{}", E::NAME); if !self.dev.has_func(&module, "reshape_fwd") { let src = FWD_KERNEL.replace("$T", E::NAME); @@ -62,7 +62,7 @@ impl super::ReshapeKernel for Cuda { inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let module = std::format!("reshape_bwd_{}", E::NAME); if !self.dev.has_func(&module, "reshape_bwd") { let src = BWD_KERNEL.replace("$T", E::NAME); diff --git a/dfdx-core/src/tensor_ops/reshape_to/mod.rs b/dfdx-core/src/tensor_ops/reshape_to/mod.rs index 04f058313..2d32ef460 100644 --- a/dfdx-core/src/tensor_ops/reshape_to/mod.rs +++ b/dfdx-core/src/tensor_ops/reshape_to/mod.rs @@ -10,14 +10,14 @@ pub trait ReshapeKernel: Storage { &self, dst: &Dst, inp: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, dst: &Dst, inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Changes the shape of a tensor without re-ordering axes. If the tensor is contiguous @@ -47,7 +47,7 @@ pub trait ReshapeKernel: Storage { /// let t: Tensor, f32, _> = dev.zeros(); /// let t: Tensor<(usize, ), f32, _> = t.reshape_like(&(8, )); /// ``` -pub trait ReshapeTo: HasErr + HasShape { +pub trait ReshapeTo: Sized + HasShape { /// Reshapes a tensor to a different compile time shape. fn reshape(self) -> Self::WithShape where @@ -57,7 +57,7 @@ pub trait ReshapeTo: HasErr + HasShape { self.try_reshape().unwrap() } /// Reshapes a tensor to a different compile time shape. - fn try_reshape(self) -> Result, Self::Err> + fn try_reshape(self) -> Result, Error> where Self::Shape: ConstShape, { @@ -75,16 +75,16 @@ pub trait ReshapeTo: HasErr + HasShape { self.try_contiguous().unwrap() } /// See [`ReshapeTo::contiguous`] - fn try_contiguous(self) -> Result, Self::Err> { + fn try_contiguous(self) -> Result, Error> { let shape = *self.shape(); self.try_reshape_like(&shape) } /// Reshapes a tensor to a different runtime shape. 
- fn try_reshape_like(self, dst: &Dst) -> Result, Self::Err>; + fn try_reshape_like(self, dst: &Dst) -> Result, Error>; } impl, T: Tape> ReshapeTo for Tensor { - fn try_reshape_like(self, dst: &Dst) -> Result, Self::Err> { + fn try_reshape_like(self, dst: &Dst) -> Result, Error> { assert_eq!(self.shape().num_elements(), dst.num_elements()); if self.shape.strides() == self.strides { Ok(Tensor { diff --git a/dfdx-core/src/tensor_ops/rmsprop/cpu_kernel.rs b/dfdx-core/src/tensor_ops/rmsprop/cpu_kernel.rs index f7408651c..d6c7f70fd 100644 --- a/dfdx-core/src/tensor_ops/rmsprop/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/rmsprop/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ dtypes::{Dtype, NotMixedPrecision}, - tensor::cpu::Cpu, + tensor::{cpu::Cpu, Error}, }; use super::{RMSpropConfig, RMSpropKernel, WeightDecay}; @@ -15,7 +15,7 @@ impl RMSpropKernel> for Cpu { square_avg: &mut Self::Vec, grad_avg: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let alpha = cfg.alpha as f32; let eps = cfg.eps as f32; let lr = cfg.lr as f32; @@ -80,7 +80,7 @@ impl RMSpropKernel for Cpu square_avg: &mut Self::Vec, grad_avg: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let alpha = E::from_f64(cfg.alpha).unwrap(); let eps = E::from_f64(cfg.eps).unwrap(); let lr = E::from_f64(cfg.lr).unwrap(); diff --git a/dfdx-core/src/tensor_ops/rmsprop/cuda_kernel.rs b/dfdx-core/src/tensor_ops/rmsprop/cuda_kernel.rs index 8491d44c3..ca1842e70 100644 --- a/dfdx-core/src/tensor_ops/rmsprop/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/rmsprop/cuda_kernel.rs @@ -1,7 +1,7 @@ use super::RMSpropConfig; use crate::{ dtypes::*, - tensor::{launch_cfg, Cuda}, + tensor::{launch_cfg, Cuda, Error}, tensor_ops::optim::*, }; @@ -82,7 +82,7 @@ where square_avg: &mut Self::Vec, grad_avg: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FWD) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, &[Self::FWD])?; } diff --git a/dfdx-core/src/tensor_ops/rmsprop/mod.rs b/dfdx-core/src/tensor_ops/rmsprop/mod.rs index 55afb3089..0ccddad39 100644 --- a/dfdx-core/src/tensor_ops/rmsprop/mod.rs +++ b/dfdx-core/src/tensor_ops/rmsprop/mod.rs @@ -55,7 +55,7 @@ pub trait RMSpropKernel: Storage { square_avg: &mut Self::Vec, grad_avg: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl RMSpropConfig { @@ -67,7 +67,7 @@ impl RMSpropConfig { square_avg: &mut D::Vec, grad_avg: &mut D::Vec, grad: &D::Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { param.device.rmsprop_kernel( self, std::sync::Arc::make_mut(&mut param.data), diff --git a/dfdx-core/src/tensor_ops/roll/cpu_kernel.rs b/dfdx-core/src/tensor_ops/roll/cpu_kernel.rs index f1814c701..f77e7d058 100644 --- a/dfdx-core/src/tensor_ops/roll/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/roll/cpu_kernel.rs @@ -10,7 +10,7 @@ impl super::RollKernel for Cpu { &self, op: super::RollOp, inp: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { let dims = inp.shape.concrete(); let strides = inp.shape.strides(); let mut data = self.try_alloc_zeros::(inp.shape.num_elements())?; @@ -39,7 +39,7 @@ impl super::RollKernel for Cpu { inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let dims = inp.shape.concrete(); let strides = inp.shape.strides(); let mut idx = NdIndex::new(inp.shape, inp.strides); diff --git 
a/dfdx-core/src/tensor_ops/roll/cuda_kernel.rs b/dfdx-core/src/tensor_ops/roll/cuda_kernel.rs index aaf3bcf6c..4404ceb7f 100644 --- a/dfdx-core/src/tensor_ops/roll/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/roll/cuda_kernel.rs @@ -32,7 +32,7 @@ where &self, op: super::RollOp, inp: &Tensor, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(Self::FNS[0], Self::FNS[0]) { self.dev.load_ptx(PTX_SRC.into(), Self::FNS[0], Self::FNS)?; } @@ -66,7 +66,7 @@ where inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let numel = inp.shape.num_elements(); let strides = inp.shape.strides(); diff --git a/dfdx-core/src/tensor_ops/roll/mod.rs b/dfdx-core/src/tensor_ops/roll/mod.rs index ad7318792..40a3ac5ec 100644 --- a/dfdx-core/src/tensor_ops/roll/mod.rs +++ b/dfdx-core/src/tensor_ops/roll/mod.rs @@ -19,14 +19,14 @@ pub trait RollKernel: Storage { &self, op: RollOp, inp: &Tensor, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, op: RollOp, inp: &Tensor, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Shifts data along an axis by a specified amount. @@ -47,7 +47,7 @@ pub trait RollKernel: Storage { /// let r = t.roll::>(1); /// assert_eq!(r.array(), [4.0, 1.0, 2.0, 3.0]); /// ``` -pub trait Roll: HasShape + HasErr { +pub trait Roll: Sized + HasShape { /// Shifts data along an axis by a specified amount. fn roll>(self, amount: usize) -> Self where @@ -57,13 +57,16 @@ pub trait Roll: HasShape + HasErr { } /// Shifts data along an axis by a specified amount. - fn try_roll>(self, amount: usize) -> Result + fn try_roll>(self, amount: usize) -> Result where Self::Shape: HasAxes; } impl, T: Tape> Roll for Tensor { - fn try_roll>(self, amount: usize) -> Result + fn try_roll>( + self, + amount: usize, + ) -> Result where S: HasAxes, { diff --git a/dfdx-core/src/tensor_ops/select_and_gather/cpu_kernel.rs b/dfdx-core/src/tensor_ops/select_and_gather/cpu_kernel.rs index 5b7d0ec7d..6acf39105 100644 --- a/dfdx-core/src/tensor_ops/select_and_gather/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/select_and_gather/cpu_kernel.rs @@ -3,7 +3,7 @@ use crate::shapes::{Axes, Dtype, RemoveDimTo, ReplaceDimTo, Shape}; use crate::tensor::{ cpu::{index_to_i, LendingIterator, NdIndex}, - Cpu, Storage, Tensor, ZerosTensor, + Cpu, Error, Storage, Tensor, ZerosTensor, }; impl super::ReplaceDimKernel for Cpu { @@ -11,7 +11,7 @@ impl super::ReplaceDimKernel for Cpu { &self, inp: &Tensor, idx: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReplaceDimTo, { @@ -53,7 +53,7 @@ impl super::ReplaceDimKernel for Cpu { idx: &Tensor, out: &Tensor, grad_out: &>::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReplaceDimTo, { @@ -87,7 +87,7 @@ impl super::RemoveDimKernel for Cpu { &self, inp: &Tensor, idx: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: RemoveDimTo, { @@ -126,7 +126,7 @@ impl super::RemoveDimKernel for Cpu { idx: &Tensor, out: &Tensor, grad_out: &>::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: RemoveDimTo, { diff --git a/dfdx-core/src/tensor_ops/select_and_gather/cuda_kernel.rs b/dfdx-core/src/tensor_ops/select_and_gather/cuda_kernel.rs index 1d4ce5b5b..a7a5133ae 100644 --- a/dfdx-core/src/tensor_ops/select_and_gather/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/select_and_gather/cuda_kernel.rs @@ -2,7 +2,7 @@ use crate::{ dtypes::*, shapes::{RemoveDimTo, ReplaceDimTo, 
Shape}, - tensor::{launch_cfg, Cuda, Storage, Tensor}, + tensor::{launch_cfg, Cuda, Error, Storage, Tensor}, }; use cudarc::driver::{DeviceSlice, LaunchAsync}; @@ -16,7 +16,7 @@ macro_rules! impl_cuda_kernels { &self, inp: &Tensor, idx: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReplaceDimTo, { @@ -65,7 +65,7 @@ macro_rules! impl_cuda_kernels { idx: &Tensor, _: &Tensor, grad_out: &>::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReplaceDimTo, { @@ -101,7 +101,7 @@ macro_rules! impl_cuda_kernels { &self, inp: &Tensor, idx: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: RemoveDimTo, { @@ -153,7 +153,7 @@ macro_rules! impl_cuda_kernels { idx: &Tensor, out: &Tensor, grad_out: &>::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: RemoveDimTo, { diff --git a/dfdx-core/src/tensor_ops/select_and_gather/mod.rs b/dfdx-core/src/tensor_ops/select_and_gather/mod.rs index f6178e9c7..2a56f1acf 100644 --- a/dfdx-core/src/tensor_ops/select_and_gather/mod.rs +++ b/dfdx-core/src/tensor_ops/select_and_gather/mod.rs @@ -12,7 +12,7 @@ pub trait ReplaceDimKernel: Storage + Storage { &self, inp: &Tensor, idx: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReplaceDimTo; fn backward( @@ -22,7 +22,7 @@ pub trait ReplaceDimKernel: Storage + Storage { idx: &Tensor, out: &Tensor, grad_out: &>::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReplaceDimTo; } @@ -32,7 +32,7 @@ pub trait RemoveDimKernel: Storage + Storage { &self, inp: &Tensor, idx: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: RemoveDimTo; fn backward( @@ -42,7 +42,7 @@ pub trait RemoveDimKernel: Storage + Storage { idx: &Tensor, out: &Tensor, grad_out: &>::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: RemoveDimTo; } @@ -73,7 +73,7 @@ pub trait RemoveDimKernel: Storage + Storage { /// let idx: Tensor, usize, _> = dev.tensor([0, 2, 4]); /// let _: Tensor, f32, _> = a.select(idx); ///``` -pub trait SelectTo + Storage>: HasErr + HasShape { +pub trait SelectTo + Storage>: Sized + HasShape { /// Select values given indices. fn select(self, idx: Tensor) -> Self::WithShape where @@ -86,7 +86,7 @@ pub trait SelectTo + Storage>: HasErr + HasShape { fn try_select( self, idx: Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: RemoveDimTo; } @@ -97,7 +97,7 @@ impl, T: Tape> SelectTo fn try_select( self, idx: Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: RemoveDimTo, { @@ -146,7 +146,7 @@ impl, T: Tape> SelectTo /// let idx: Tensor, usize, _> = dev.tensor([[0, 1], [2, 3], [4, 4]]); /// let _: Tensor, f32, _> = a.gather(idx); ///``` -pub trait GatherTo + Storage>: HasErr + HasShape { +pub trait GatherTo + Storage>: Sized + HasShape { /// Gather values given indices. 
fn gather(self, idx: Tensor) -> Self::WithShape where @@ -158,7 +158,7 @@ pub trait GatherTo + Storage>: HasErr + HasShape { fn try_gather( self, idx: Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: ReplaceDimTo; } @@ -169,7 +169,7 @@ impl, T: Tape> GatherTo fn try_gather( self, idx: Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: ReplaceDimTo, { diff --git a/dfdx-core/src/tensor_ops/sgd/cpu_kernel.rs b/dfdx-core/src/tensor_ops/sgd/cpu_kernel.rs index 69c5653c4..2fcae414b 100644 --- a/dfdx-core/src/tensor_ops/sgd/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/sgd/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ dtypes::{Dtype, NotMixedPrecision}, - tensor::cpu::*, + tensor::{cpu::*, Error}, }; use super::{Momentum, SgdConfig, SgdKernel, WeightDecay}; @@ -13,7 +13,7 @@ impl SgdKernel> for Cpu { param: &mut Self::Vec, velocity: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let lr = cfg.lr as f32; for ((p, g), v) in param @@ -62,7 +62,7 @@ impl SgdKernel for Cpu { param: &mut Self::Vec, velocity: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let lr = E::from_f64(cfg.lr).unwrap(); for ((p, mut g), v) in param diff --git a/dfdx-core/src/tensor_ops/sgd/cuda_kernel.rs b/dfdx-core/src/tensor_ops/sgd/cuda_kernel.rs index 6d29812fc..d893a325d 100644 --- a/dfdx-core/src/tensor_ops/sgd/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/sgd/cuda_kernel.rs @@ -2,7 +2,7 @@ use super::SgdConfig; use crate::{ dtypes::*, - tensor::{launch_cfg, Cuda}, + tensor::{launch_cfg, Cuda, Error}, tensor_ops::optim::*, }; @@ -71,7 +71,7 @@ where param: &mut Self::Vec, velocity: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FWD) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, &[Self::FWD])?; } diff --git a/dfdx-core/src/tensor_ops/sgd/mod.rs b/dfdx-core/src/tensor_ops/sgd/mod.rs index 3515f9f57..16737b2c5 100644 --- a/dfdx-core/src/tensor_ops/sgd/mod.rs +++ b/dfdx-core/src/tensor_ops/sgd/mod.rs @@ -5,7 +5,7 @@ mod cuda_kernel; use crate::{ shapes::{Dtype, Shape}, - tensor::{Storage, Tensor}, + tensor::{Error, Storage, Tensor}, }; use super::optim::{Momentum, WeightDecay}; @@ -90,7 +90,7 @@ pub trait SgdKernel: Storage { param: &mut Self::Vec, velocity: &mut Self::Vec, grad: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } impl SgdConfig { @@ -100,7 +100,7 @@ impl SgdConfig { param: &mut Tensor, velocity: &mut D::Vec, grad: &D::Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { param.device.sgd_kernel( self, std::sync::Arc::make_mut(&mut param.data), diff --git a/dfdx-core/src/tensor_ops/sigmoid/mod.rs b/dfdx-core/src/tensor_ops/sigmoid/mod.rs index 316b85564..ff41f64f5 100644 --- a/dfdx-core/src/tensor_ops/sigmoid/mod.rs +++ b/dfdx-core/src/tensor_ops/sigmoid/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tens self.try_sigmoid().unwrap() } /// See [sigmoid] - pub fn try_sigmoid(self) -> Result { + pub fn try_sigmoid(self) -> Result { try_unary_op(SigmoidKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/sin/mod.rs b/dfdx-core/src/tensor_ops/sin/mod.rs index 841d1ebba..035da953c 100644 --- a/dfdx-core/src/tensor_ops/sin/mod.rs +++ b/dfdx-core/src/tensor_ops/sin/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor Result { + pub fn try_sin(self) -> Result { try_unary_op(SinKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/slice/cpu_kernel.rs 
b/dfdx-core/src/tensor_ops/slice/cpu_kernel.rs index c987112cf..25d27ed76 100644 --- a/dfdx-core/src/tensor_ops/slice/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/slice/cpu_kernel.rs @@ -7,7 +7,7 @@ impl SliceKernel for Cpu { &self, inp: &Tensor, slice: &Slice, - ) -> Result, Self::Err> { + ) -> Result, Error> { let dst = inp.shape.slice(slice).unwrap(); let mut out = self.try_zeros_like(&dst)?; @@ -31,7 +31,7 @@ impl SliceKernel for Cpu { grad_inp: &mut Self::Vec, grad_out: &Self::Vec, slice: &Slice, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let dst = inp.shape.slice(slice).unwrap(); let mut inp_idx = NdIndex::new(dst, inp.strides); diff --git a/dfdx-core/src/tensor_ops/slice/cuda_kernel.rs b/dfdx-core/src/tensor_ops/slice/cuda_kernel.rs index 7e2f85d1a..d7548daaa 100644 --- a/dfdx-core/src/tensor_ops/slice/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/slice/cuda_kernel.rs @@ -2,7 +2,7 @@ use crate::{ dtypes::*, prelude::cpu::NdIndex, shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::driver::{CudaSlice, LaunchAsync}; @@ -56,7 +56,7 @@ where &self, inp: &Tensor, slice: &Slice, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(Self::MOD, Self::FNS[0]) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, Self::FNS)?; } @@ -94,7 +94,7 @@ where grad_inp: &mut Self::Vec, grad_out: &Self::Vec, slice: &Slice, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::MOD, Self::FNS[1]) { self.dev.load_ptx(PTX_SRC.into(), Self::MOD, Self::FNS)?; } diff --git a/dfdx-core/src/tensor_ops/slice/mod.rs b/dfdx-core/src/tensor_ops/slice/mod.rs index 9dce18815..db3a6854d 100644 --- a/dfdx-core/src/tensor_ops/slice/mod.rs +++ b/dfdx-core/src/tensor_ops/slice/mod.rs @@ -9,7 +9,7 @@ pub trait SliceKernel: Storage { &self, inp: &Tensor, slice: &Slice, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward, Slice>( &self, @@ -17,7 +17,7 @@ pub trait SliceKernel: Storage { grad_inp: &mut Self::Vec, grad_out: &Self::Vec, slice: &Slice, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } /// Slices all dimensions of a tensor, with the starting and ending indices of each dimension @@ -53,7 +53,10 @@ pub fn slice, E: Unit, D: SliceKernel, T: Tape, Sl impl, T: Tape> Tensor { /// Fallible version of [Tensor::slice] - pub fn try_slice(self, slice: Slice) -> Result, D::Err> + pub fn try_slice( + self, + slice: Slice, + ) -> Result, crate::tensor::Error> where S: SliceShape, Slice: 'static, diff --git a/dfdx-core/src/tensor_ops/softmax.rs b/dfdx-core/src/tensor_ops/softmax.rs index 4390319da..0a6ec8aab 100644 --- a/dfdx-core/src/tensor_ops/softmax.rs +++ b/dfdx-core/src/tensor_ops/softmax.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor { self.try_softmax::().unwrap() } /// See [softmax()] - pub fn try_softmax(self) -> Result + pub fn try_softmax(self) -> Result where S: ReduceShape, { diff --git a/dfdx-core/src/tensor_ops/sqrt/mod.rs b/dfdx-core/src/tensor_ops/sqrt/mod.rs index 6fa472f1c..b4703946e 100644 --- a/dfdx-core/src/tensor_ops/sqrt/mod.rs +++ b/dfdx-core/src/tensor_ops/sqrt/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor< self.try_sqrt().unwrap() } /// See [sqrt] - pub fn try_sqrt(self) -> Result { + pub fn try_sqrt(self) -> Result { try_unary_op(SqrtKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/square/mod.rs b/dfdx-core/src/tensor_ops/square/mod.rs index aadab4f17..ae9ae0da3 100644 --- a/dfdx-core/src/tensor_ops/square/mod.rs +++ b/dfdx-core/src/tensor_ops/square/mod.rs @@ 
-33,7 +33,7 @@ impl, T: Tape> Tenso self.try_square().unwrap() } /// See [square] - pub fn try_square(self) -> Result { + pub fn try_square(self) -> Result { try_unary_op(SquareKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/stack/cpu_kernel.rs b/dfdx-core/src/tensor_ops/stack/cpu_kernel.rs index 49a0174e9..3f09fa923 100644 --- a/dfdx-core/src/tensor_ops/stack/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/stack/cpu_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::*, - tensor::{unique_id, Cpu, Tensor}, + tensor::{unique_id, Cpu, Error, Tensor}, }; use std::vec::Vec; @@ -10,7 +10,7 @@ impl super::StackKernel for Cpu { &self, num: Num, inp: &[Tensor], - ) -> Result, Self::Err> + ) -> Result, Error> where S: super::AddDim, { @@ -52,7 +52,7 @@ impl super::StackKernel for Cpu { &self, mut grad_inp: Vec<&mut Self::Vec>, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let mut offset = 0; for item in grad_inp.drain(..) { for gi in item.iter_mut() { diff --git a/dfdx-core/src/tensor_ops/stack/cuda_kernel.rs b/dfdx-core/src/tensor_ops/stack/cuda_kernel.rs index d6a783fbc..827a2be17 100644 --- a/dfdx-core/src/tensor_ops/stack/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/stack/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::{ driver::{DeviceSlice, LaunchAsync}, @@ -14,7 +14,7 @@ impl super::StackKernel for Cuda { &self, num: Num, inps: &[Tensor], - ) -> Result, Self::Err> + ) -> Result, Error> where S: super::AddDim, { @@ -54,7 +54,7 @@ impl super::StackKernel for Cuda { &self, mut grad_inp: Vec<&mut Self::Vec>, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let module_name = std::format!("stack_bwd_{}", E::NAME); if !self.dev.has_func(&module_name, "stack_bwd") { let src = BWD_KERNEL.replace("$Ty", E::NAME); diff --git a/dfdx-core/src/tensor_ops/stack/mod.rs b/dfdx-core/src/tensor_ops/stack/mod.rs index 3a6f8350e..f8e13d590 100644 --- a/dfdx-core/src/tensor_ops/stack/mod.rs +++ b/dfdx-core/src/tensor_ops/stack/mod.rs @@ -32,14 +32,13 @@ mod cuda_kernel; /// ``` pub trait TryStack: Sized { type Stacked; - type Err: std::fmt::Debug; /// Stack an array or vec of tensors together along a new dimension. 
fn stack(self) -> Self::Stacked { self.try_stack().unwrap() } /// Fallible version of [TryStack::stack] - fn try_stack(self) -> Result; + fn try_stack(self) -> Result; } impl, T, const N: usize> TryStack for [Tensor; N] @@ -48,8 +47,7 @@ where T: Tape, { type Stacked = Tensor; - type Err = D::Err; - fn try_stack(self) -> Result { + fn try_stack(self) -> Result { try_stack(self) } } @@ -60,16 +58,14 @@ where T: Tape, { type Stacked = Tensor; - type Err = D::Err; - fn try_stack(self) -> Result { + fn try_stack(self) -> Result { try_stack(self) } } -impl> TryStack for (A, B) { +impl TryStack for (A, B) { type Stacked = (A::Stacked, B::Stacked); - type Err = A::Err; - fn try_stack(self) -> Result { + fn try_stack(self) -> Result { Ok((self.0.try_stack()?, self.1.try_stack()?)) } } @@ -115,19 +111,15 @@ pub trait StackKernel: Storage { &self, num: Num, inp: &[Tensor], - ) -> Result, Self::Err> + ) -> Result, Error> where S: AddDim; - fn backward( - &self, - grad_inp: Vec<&mut Self::Vec>, - grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + fn backward(&self, grad_inp: Vec<&mut Self::Vec>, grad_out: &Self::Vec) -> Result<(), Error>; } fn try_stack, T, Items>( items: Items, -) -> Result, D::Err> +) -> Result, crate::tensor::Error> where Items: Array>, S: AddDim, diff --git a/dfdx-core/src/tensor_ops/stddev_to.rs b/dfdx-core/src/tensor_ops/stddev_to.rs index 9e116cabb..07beaecd2 100644 --- a/dfdx-core/src/tensor_ops/stddev_to.rs +++ b/dfdx-core/src/tensor_ops/stddev_to.rs @@ -2,7 +2,7 @@ use super::*; use crate::{shapes::*, tensor::*}; /// Reduction along multiple axes using standard deviation. -pub trait StddevTo: HasErr + HasShape { +pub trait StddevTo: Sized + HasShape { /// Standard deviation reduction. /// /// **Pytorch equivalent**: `t.std(Axes, unbiased=False)` @@ -25,7 +25,7 @@ pub trait StddevTo: HasErr + HasShape { fn try_stddev( self, epsilon: impl Into, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: HasAxes + ReduceShapeTo; } @@ -34,7 +34,7 @@ impl, T: Tape> StddevTo for Tensor( self, epsilon: impl Into, - ) -> Result, Self::Err> + ) -> Result, Error> where Self::Shape: HasAxes + ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/sub/mod.rs b/dfdx-core/src/tensor_ops/sub/mod.rs index 1fdd35813..a7b82759d 100644 --- a/dfdx-core/src/tensor_ops/sub/mod.rs +++ b/dfdx-core/src/tensor_ops/sub/mod.rs @@ -47,9 +47,9 @@ where } /// Fallible version of [std::ops::Sub]. 
See [sub] -pub trait TrySub: HasErr { +pub trait TrySub { type Output; - fn try_sub(self, rhs: Rhs) -> Result; + fn try_sub(self, rhs: Rhs) -> Result; } impl, LTape: Tape, R> @@ -58,7 +58,7 @@ where LTape: Merge, { type Output = Self; - fn try_sub(self, rhs: Tensor) -> Result { + fn try_sub(self, rhs: Tensor) -> Result { try_binary_op(BinarySubKernelOp, self, rhs) } } @@ -68,7 +68,7 @@ where D: UnaryKernel, E>, { type Output = Self; - fn try_sub(self, rhs: Rhs) -> Result { + fn try_sub(self, rhs: Rhs) -> Result { let rhs: f64 = rhs.into(); let scalar = E::from_f64(rhs).unwrap(); try_unary_op(ScalarSubKernelOp { scalar }, self) diff --git a/dfdx-core/src/tensor_ops/sum_to/cpu_kernel.rs b/dfdx-core/src/tensor_ops/sum_to/cpu_kernel.rs index 5daa7cced..373622dcf 100644 --- a/dfdx-core/src/tensor_ops/sum_to/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/sum_to/cpu_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::{Dtype, NotMixedPrecision}, shapes::{Axes, HasAxes, ReduceShapeTo, Shape}, - tensor::{Cpu, Tensor, Tensorlike, ZerosTensor}, + tensor::{Cpu, Error, Tensor, Tensorlike, ZerosTensor}, tensor_ops::utilities::reduction_utils::index_for_reductions, }; @@ -11,7 +11,7 @@ impl super::SumKernel> for Cpu { &self, dst: Dst, inp: &Tensor, Self>, - ) -> Result, Self>, Self::Err> + ) -> Result, Self>, Error> where Src: ReduceShapeTo, { @@ -46,7 +46,7 @@ impl super::SumKernel> for Cpu { inp: &impl Tensorlike, Self>, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { @@ -75,7 +75,7 @@ impl super::SumKernel for Cpu { &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo, { @@ -108,7 +108,7 @@ impl super::SumKernel for Cpu { inp: &impl Tensorlike, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/sum_to/cuda_kernel.rs b/dfdx-core/src/tensor_ops/sum_to/cuda_kernel.rs index 05d1dec32..868909411 100644 --- a/dfdx-core/src/tensor_ops/sum_to/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/sum_to/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor, Tensorlike}, + tensor::{launch_cfg, Cuda, Error, Tensor, Tensorlike}, tensor_ops::reduction_utils::*, }; @@ -46,7 +46,7 @@ where &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo, { @@ -99,7 +99,7 @@ where inp: &impl Tensorlike, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/sum_to/mod.rs b/dfdx-core/src/tensor_ops/sum_to/mod.rs index d0dae445b..e99c37101 100644 --- a/dfdx-core/src/tensor_ops/sum_to/mod.rs +++ b/dfdx-core/src/tensor_ops/sum_to/mod.rs @@ -10,7 +10,7 @@ pub trait SumKernel: Storage { &self, dst: Dst, inp: &Tensor, - ) -> Result, Self::Err> + ) -> Result, Error> where Src: ReduceShapeTo; fn backward( @@ -19,13 +19,13 @@ pub trait SumKernel: Storage { inp: &impl Tensorlike, grad_inp: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> + ) -> Result<(), Error> where Src: ReduceShapeTo; } /// Reduction along multiple axes using `sum`. -pub trait SumTo: HasErr + HasShape { +pub trait SumTo: Sized + HasShape { /// Sum reduction. 
**Pytorch equivalent**: `t.sum(Ax)` /// /// Example reducing a single axis: @@ -52,13 +52,13 @@ pub trait SumTo: HasErr + HasShape { self.try_sum().unwrap() } /// Fallible version of [SumTo::sum] - fn try_sum(self) -> Result, Self::Err> + fn try_sum(self) -> Result, Error> where Self::Shape: ReduceShapeTo; } impl, T: Tape> SumTo for Tensor { - fn try_sum(self) -> Result, Self::Err> + fn try_sum(self) -> Result, Error> where Self::Shape: ReduceShapeTo, { diff --git a/dfdx-core/src/tensor_ops/tanh/mod.rs b/dfdx-core/src/tensor_ops/tanh/mod.rs index 7ddf191ba..2c5b96064 100644 --- a/dfdx-core/src/tensor_ops/tanh/mod.rs +++ b/dfdx-core/src/tensor_ops/tanh/mod.rs @@ -33,7 +33,7 @@ impl, T: Tape> Tensor< self.try_tanh().unwrap() } /// See [tanh] - pub fn try_tanh(self) -> Result { + pub fn try_tanh(self) -> Result { try_unary_op(TanhKernelOp, self) } } diff --git a/dfdx-core/src/tensor_ops/to_dtype/cpu_kernel.rs b/dfdx-core/src/tensor_ops/to_dtype/cpu_kernel.rs index a0f948052..e06e7e01e 100644 --- a/dfdx-core/src/tensor_ops/to_dtype/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/to_dtype/cpu_kernel.rs @@ -1,10 +1,10 @@ use num_traits::AsPrimitive; use std::{sync::Arc, vec::Vec}; -use crate::prelude::{cpu::CachableVec, Cpu, Shape, Tensor, Unit}; +use crate::prelude::{cpu::CachableVec, Cpu, Error, Shape, Tensor, Unit}; impl, E2: Unit> super::ToDtypeKernel for Cpu { - fn forward(inp: Tensor) -> Result, Self::Err> { + fn forward(inp: Tensor) -> Result, Error> { let data: &[E1] = inp.data.as_ref(); let data: Vec = data.iter().map(|x| (*x).as_()).collect(); let data = CachableVec { diff --git a/dfdx-core/src/tensor_ops/to_dtype/cuda_kernel.rs b/dfdx-core/src/tensor_ops/to_dtype/cuda_kernel.rs index 5e6d232c6..ed147b0da 100644 --- a/dfdx-core/src/tensor_ops/to_dtype/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/to_dtype/cuda_kernel.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Shape, Unit}, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use cudarc::{ driver::{DeviceSlice, LaunchAsync}, @@ -22,7 +22,7 @@ extern \"C\" __global__ void kernel(const size_t n, const $Src *inp, $Dst *out) }"; impl super::ToDtypeKernel for Cuda { - fn forward(inp: Tensor) -> Result, Self::Err> { + fn forward(inp: Tensor) -> Result, Error> { let module = std::format!("convert_{}_to_{}", E1::NAME, E2::NAME); let cuda = &inp.device; diff --git a/dfdx-core/src/tensor_ops/to_dtype/mod.rs b/dfdx-core/src/tensor_ops/to_dtype/mod.rs index 85b0b5ab0..5f0b7c999 100644 --- a/dfdx-core/src/tensor_ops/to_dtype/mod.rs +++ b/dfdx-core/src/tensor_ops/to_dtype/mod.rs @@ -2,10 +2,10 @@ mod cpu_kernel; #[cfg(feature = "cuda")] mod cuda_kernel; -use crate::prelude::{Shape, Storage, Tensor, Unit}; +use crate::prelude::{Error, Shape, Storage, Tensor, Unit}; pub trait ToDtypeKernel: Storage + Storage { - fn forward(inp: Tensor) -> Result, Self::Err>; + fn forward(inp: Tensor) -> Result, Error>; } /// Copies the elements of a tensor, converting its data to a different dtype. 
@@ -29,7 +29,7 @@ pub fn to_dtype>( } impl> Tensor { - pub fn try_to_dtype(self) -> Result, D::Err> + pub fn try_to_dtype(self) -> Result, crate::tensor::Error> where D: ToDtypeKernel, { diff --git a/dfdx-core/src/tensor_ops/tri.rs b/dfdx-core/src/tensor_ops/tri.rs index 3a03d1da9..c1f8fa451 100644 --- a/dfdx-core/src/tensor_ops/tri.rs +++ b/dfdx-core/src/tensor_ops/tri.rs @@ -1,5 +1,5 @@ use crate::shapes::{Dtype, Shape}; -use crate::tensor::{HasErr, Tape, Tensor, TriangleTensor}; +use crate::tensor::{Error, Tape, Tensor, TriangleTensor}; use super::TryMul; @@ -11,7 +11,7 @@ pub fn lower_tri, T: Tape>( diagonal: impl Into>, ) -> Tensor where - Tensor: TryMul, Output = Tensor> + HasErr, + Tensor: TryMul, Output = Tensor>, { t.lower_tri(diagonal) } @@ -24,20 +24,17 @@ pub fn upper_tri, T: Tape>( diagonal: impl Into>, ) -> Tensor where - Tensor: TryMul, Output = Tensor> + HasErr, + Tensor: TryMul, Output = Tensor>, { t.upper_tri(diagonal) } impl, T: Tape> Tensor where - Self: TryMul, Output = Self> + HasErr, + Self: TryMul, Output = Self>, { /// See [lower_tri] - pub fn try_lower_tri( - self, - diagonal: impl Into>, - ) -> Result::Err> { + pub fn try_lower_tri(self, diagonal: impl Into>) -> Result { let out = self .device .try_lower_tri_like(&self.shape, E::ONE, diagonal)?; @@ -50,10 +47,7 @@ where } /// See [upper_tri] - pub fn try_upper_tri( - self, - diagonal: impl Into>, - ) -> Result::Err> { + pub fn try_upper_tri(self, diagonal: impl Into>) -> Result { let out = self .device .try_upper_tri_like(&self.shape, E::ONE, diagonal)?; diff --git a/dfdx-core/src/tensor_ops/upscale2d/cpu_kernel.rs b/dfdx-core/src/tensor_ops/upscale2d/cpu_kernel.rs index 82083e513..efbff95ef 100644 --- a/dfdx-core/src/tensor_ops/upscale2d/cpu_kernel.rs +++ b/dfdx-core/src/tensor_ops/upscale2d/cpu_kernel.rs @@ -1,5 +1,5 @@ use crate::shapes::*; -use crate::tensor::{Cpu, Tensor}; +use crate::tensor::{Cpu, Error, Tensor}; use std::sync::Arc; @@ -23,7 +23,7 @@ impl op: super::Upscale2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let istr = make_4d::(inp.strides); let ostr = make_4d::(out.strides); @@ -56,7 +56,7 @@ impl grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let istr = make_4d::(inp.strides); let ostr = make_4d::(out.strides); @@ -87,7 +87,7 @@ impl super::Upscale2DKernel for Cpu { op: super::Upscale2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let istr = make_4d::(inp.strides); let ostr = make_4d::(out.strides); @@ -138,7 +138,7 @@ impl super::Upscale2DKernel for Cpu { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let istr = make_4d::(inp.strides); let ostr = make_4d::(out.strides); diff --git a/dfdx-core/src/tensor_ops/upscale2d/cuda_kernel.rs b/dfdx-core/src/tensor_ops/upscale2d/cuda_kernel.rs index 038a86ae8..9c98646bd 100644 --- a/dfdx-core/src/tensor_ops/upscale2d/cuda_kernel.rs +++ b/dfdx-core/src/tensor_ops/upscale2d/cuda_kernel.rs @@ -1,7 +1,7 @@ use crate::{ dtypes::*, shapes::*, - tensor::{launch_cfg, Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Error, Tensor}, }; use std::sync::Arc; @@ -71,7 +71,7 @@ where op: super::Upscale2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { if !self.dev.has_func(Self::FWD, Self::FWD) { self.dev .load_ptx(PTX_SRC.into(), Self::FWD, &[Self::FWD, Self::BWD])?; @@ -96,7 +96,7 @@ 
where grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let strides = self.dev.htod_copy(make_4d::(inp.strides).into())?; let bwd_fn = self.dev.get_func(Self::FWD, Self::BWD).unwrap(); let cfg = launch_cfg::<128>(out.shape().num_elements() as u32); diff --git a/dfdx-core/src/tensor_ops/upscale2d/mod.rs b/dfdx-core/src/tensor_ops/upscale2d/mod.rs index 5cd849ddc..d0adce479 100644 --- a/dfdx-core/src/tensor_ops/upscale2d/mod.rs +++ b/dfdx-core/src/tensor_ops/upscale2d/mod.rs @@ -5,7 +5,7 @@ mod cuda_kernel; use crate::{ shapes::*, - tensor::{HasErr, PutTape, SplitTape, Storage, Tape, Tensor, ZerosTensor}, + tensor::{Error, PutTape, SplitTape, Storage, Tape, Tensor, ZerosTensor}, }; #[repr(C)] @@ -65,7 +65,7 @@ pub trait Upscale2DKernel: Storage { op: Upscale2DOp, inp: &Tensor, out: &mut Tensor, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; fn backward( &self, @@ -74,17 +74,17 @@ pub trait Upscale2DKernel: Storage { grad_inp: &mut Self::Vec, out: &Tensor, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } -pub trait GenericUpscale2D: HasErr { +pub trait GenericUpscale2D { type Output; fn generic_upscale2d_like( self, method: M, height: OH, width: OW, - ) -> Result, Self::Err>; + ) -> Result, Error>; } /// Upscales an image to a new shape. Valid methods of upscaling are: @@ -107,7 +107,7 @@ pub trait GenericUpscale2D: HasErr { /// let t: Tensor, f32, _> = dev.zeros(); /// let y: Tensor<(Const<3>, usize, usize), f32, _> = t.upscale2d_like(NearestNeighbor, 64, 64); /// ``` -pub trait TryUpscale2D { +pub trait TryUpscale2D: Sized { /// Upscale to compile time known dimensions. fn upscale2d( self, @@ -122,7 +122,7 @@ pub trait TryUpscale2D { fn try_upscale2d( self, method: M, - ) -> Result<>::Output, Const>, Self::Err> + ) -> Result<>::Output, Const>, Error> where Self: GenericUpscale2D, { @@ -146,7 +146,7 @@ pub trait TryUpscale2D { method: M, height: OH, width: OW, - ) -> Result<>::Output, Self::Err> + ) -> Result<>::Output, Error> where Self: GenericUpscale2D, { @@ -172,7 +172,7 @@ impl< _method: M, out_height: OH, out_width: OW, - ) -> Result, Self::Err> { + ) -> Result, Error> { let in_height = self.shape.1; let in_width = self.shape.2; @@ -216,7 +216,7 @@ impl< _method: M, out_height: OH, out_width: OW, - ) -> Result, Self::Err> { + ) -> Result, Error> { let in_height = self.shape.2; let in_width = self.shape.3; diff --git a/dfdx-core/src/tensor_ops/utilities/backward.rs b/dfdx-core/src/tensor_ops/utilities/backward.rs index 7dc01ccf7..f78981468 100644 --- a/dfdx-core/src/tensor_ops/utilities/backward.rs +++ b/dfdx-core/src/tensor_ops/utilities/backward.rs @@ -4,19 +4,19 @@ use crate::tensor::*; /// Runs backprop algorithm with all operations contained in the tape that `t` has. /// /// This function takes ownership of `self` and returns [Gradients]. 
-pub trait Backward>: HasErr { +pub trait Backward>: Sized { /// Runs backprop fn backward(self) -> Gradients { self.try_backward().unwrap() } /// Fallible version of [Backward::backward] - fn try_backward(self) -> Result, Self::Err>; + fn try_backward(self) -> Result, Error>; } impl> Backward for Tensor> { - fn try_backward(self) -> Result, Self::Err> { + fn try_backward(self) -> Result, Error> { let (t, mut tape) = self.split_tape(); let t_ghost = t.ghost(); tape.add_backward_op(move |grads| { @@ -33,7 +33,7 @@ impl> Backward impl> Backward for Tensor>>> { - fn try_backward(self) -> Result, Self::Err> { + fn try_backward(self) -> Result, Error> { let (t, tape) = self.split_tape(); let t_ghost = t.ghost(); let mut tape = tape.lock().unwrap(); diff --git a/dfdx-core/src/tensor_ops/utilities/cpu_kernels.rs b/dfdx-core/src/tensor_ops/utilities/cpu_kernels.rs index 56718a72c..0848c1edc 100644 --- a/dfdx-core/src/tensor_ops/utilities/cpu_kernels.rs +++ b/dfdx-core/src/tensor_ops/utilities/cpu_kernels.rs @@ -5,7 +5,7 @@ use crate::{ shapes::{Dtype, Shape}, tensor::{ cpu::{Cpu, LendingIterator, NdIndex}, - unique_id, Tensor, Tensorlike, ZerosTensor, + unique_id, Error, Tensor, Tensorlike, ZerosTensor, }, }; @@ -51,7 +51,7 @@ impl> UnaryKernel for Cpu { &self, op: Op, inp: Cow>, - ) -> Result, Self::Err> { + ) -> Result, Error> { let mut out = match inp { Cow::Borrowed(inp) => { // allocate a new data buffer @@ -84,7 +84,7 @@ impl> UnaryKernel for Cpu { grad_inp: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { match (inp.data(), out.data()) { (None, None) => { let df = op.const_df(); @@ -115,7 +115,7 @@ impl> BinaryKernel for Cpu { op: Op, lhs: Cow>, rhs: Cow>, - ) -> Result, Self::Err> { + ) -> Result, Error> { match (lhs, rhs) { (Cow::Borrowed(lhs), Cow::Borrowed(rhs)) => { let mut out = self.try_zeros_like(&lhs.shape)?; @@ -169,7 +169,7 @@ impl> BinaryKernel for Cpu { rhs: &impl Tensorlike, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { match (lhs.data(), rhs.data()) { (Some(lhs_buf), Some(rhs_buf)) => { let mut lhs_idx = NdIndex::new(*lhs.shape(), lhs.strides()); diff --git a/dfdx-core/src/tensor_ops/utilities/cuda_kernels.rs b/dfdx-core/src/tensor_ops/utilities/cuda_kernels.rs index 9ce4df1d6..b472b0253 100644 --- a/dfdx-core/src/tensor_ops/utilities/cuda_kernels.rs +++ b/dfdx-core/src/tensor_ops/utilities/cuda_kernels.rs @@ -67,7 +67,7 @@ impl + DeviceRepr> UnaryKernel for Cuda &self, op: K, inp: Cow>, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(K::MODULE_NAME, K::FWD_FN_NAME) { self.dev .load_ptx(K::PTX_SRC.into(), K::MODULE_NAME, &K::ALL_FN_NAMES)?; @@ -103,7 +103,7 @@ impl + DeviceRepr> UnaryKernel for Cuda grad_inp: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let bwd_fn = self.dev.get_func(K::MODULE_NAME, K::BWD_FN_NAME).unwrap(); match (inp.data(), out.data()) { (None, None) => { @@ -219,7 +219,7 @@ impl + DeviceRepr + Clone> BinaryKernel op: K, lhs: Cow>, rhs: Cow>, - ) -> Result, Self::Err> { + ) -> Result, Error> { if !self.dev.has_func(K::MODULE_NAME, K::FWD_FN_NAME) { self.dev .load_ptx(K::PTX_SRC.into(), K::MODULE_NAME, &K::ALL_FN_NAMES)?; @@ -326,7 +326,7 @@ impl + DeviceRepr + Clone> BinaryKernel rhs: &impl Tensorlike, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err> { + ) -> Result<(), Error> { let bwd_lhs_fn = self 
.dev .get_func(K::MODULE_NAME, K::BWD_LHS_FN_NAME) diff --git a/dfdx-core/src/tensor_ops/utilities/ops.rs b/dfdx-core/src/tensor_ops/utilities/ops.rs index 15a8250d6..e8a3dcfef 100644 --- a/dfdx-core/src/tensor_ops/utilities/ops.rs +++ b/dfdx-core/src/tensor_ops/utilities/ops.rs @@ -1,6 +1,6 @@ use crate::{ shapes::{Dtype, HasShape, Shape}, - tensor::{Merge, PutTape, SplitTape, Storage, Tape, Tensor, Tensorlike}, + tensor::*, }; use std::borrow::Cow; @@ -11,7 +11,7 @@ pub trait UnaryKernel: Storage { &self, op: Op, inp: Cow>, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, op: Op, @@ -19,7 +19,7 @@ pub trait UnaryKernel: Storage { grad_inp: &mut Self::Vec, out: &impl Tensorlike, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } pub trait BinaryKernel: Storage { @@ -29,7 +29,7 @@ pub trait BinaryKernel: Storage { op: Op, lhs: Cow>, rhs: Cow>, - ) -> Result, Self::Err>; + ) -> Result, Error>; fn backward( &self, op: Op, @@ -38,7 +38,7 @@ pub trait BinaryKernel: Storage { rhs: &impl Tensorlike, grad_rhs: &mut Self::Vec, grad_out: &Self::Vec, - ) -> Result<(), Self::Err>; + ) -> Result<(), Error>; } pub(crate) fn try_unary_op< @@ -50,7 +50,7 @@ pub(crate) fn try_unary_op< >( op: Op, inp: Tensor, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { let (inp, mut tape) = inp.split_tape(); let inp_ghost = inp.ghost(); let dev = inp.device.clone(); @@ -99,7 +99,7 @@ pub(crate) fn try_binary_op< op: Op, lhs: Tensor, rhs: Tensor, -) -> Result, D::Err> { +) -> Result, crate::tensor::Error> { assert_eq!(lhs.shape(), rhs.shape()); let (lhs, ltape) = lhs.split_tape(); let (rhs, rtape) = rhs.split_tape(); diff --git a/dfdx-core/src/tensor_ops/var_to.rs b/dfdx-core/src/tensor_ops/var_to.rs index 1a593655c..0a7ef618b 100644 --- a/dfdx-core/src/tensor_ops/var_to.rs +++ b/dfdx-core/src/tensor_ops/var_to.rs @@ -2,7 +2,7 @@ use super::*; use crate::{shapes::*, tensor::*}; /// Reduction alogn multiple axes using variance -pub trait VarTo: HasErr + HasShape { +pub trait VarTo: Sized + HasShape { /// Result [Tensor] has smaller number of dimensions. /// /// **Pytorch equivalent**: `t.var(Axes, unbiased=False)` @@ -22,13 +22,13 @@ pub trait VarTo: HasErr + HasShape { self.try_var().unwrap() } /// Fallible version of [VarTo::var] - fn try_var(self) -> Result, Self::Err> + fn try_var(self) -> Result, Error> where Self::Shape: HasAxes + ReduceShapeTo; } impl, T: Tape> VarTo for Tensor { - fn try_var(self) -> Result, Self::Err> + fn try_var(self) -> Result, Error> where Self::Shape: HasAxes + ReduceShapeTo, { diff --git a/dfdx-derives/src/lib.rs b/dfdx-derives/src/lib.rs index f258c679c..60da4982e 100644 --- a/dfdx-derives/src/lib.rs +++ b/dfdx-derives/src/lib.rs @@ -31,8 +31,7 @@ macro_rules! has_attr { /// pub struct Abs; /// impl, T: Tape> Module> for Abs { /// type Output = Tensor; -/// type Error = D::Err; -/// fn try_forward(&self, x: Tensor) -> Result { +/// fn try_forward(&self, x: Tensor) -> Result { /// x.try_abs() /// } /// } @@ -50,8 +49,7 @@ macro_rules! has_attr { /// for Reshape /// { /// type Output = Tensor; -/// type Error = D::Err; -/// fn try_forward(&self, x: Tensor) -> Result { +/// fn try_forward(&self, x: Tensor) -> Result { /// x.try_reshape_like(&self.0) /// } /// } @@ -77,8 +75,7 @@ macro_rules! 
has_attr { /// X: TryAdd, /// { /// type Output = X; -/// type Error = D::Err; -/// fn try_forward(&self, x: X) -> Result { +/// fn try_forward(&self, x: X) -> Result { /// self.matmul.try_forward(x.with_empty_tape())?.try_add(x) /// } /// } @@ -206,7 +203,7 @@ pub fn custom_module(input: proc_macro::TokenStream) -> proc_macro::TokenStream } impl #build_impl dfdx_core::nn_traits::ResetParams for #builder_name #built_ty #built_where { - fn try_reset_params(&mut self) -> Result<(), Dev::Err> { + fn try_reset_params(&mut self) -> Result<(), dfdx_core::tensor::Error> { Ok(()) } } @@ -217,13 +214,13 @@ pub fn custom_module(input: proc_macro::TokenStream) -> proc_macro::TokenStream optimizer: &mut Optim, gradients: &dfdx_core::tensor::Gradients, missing_tensors: &mut Vec, - ) -> Result<(), Dev::Err> { + ) -> Result<(), dfdx_core::tensor::Error> { Ok(()) } } impl #build_impl dfdx_core::nn_traits::ZeroGrads for #builder_name #built_ty #built_where { - fn try_zero_grads(&self, grads: &mut dfdx_core::tensor::Gradients) -> Result<(), Dev::Err> { + fn try_zero_grads(&self, grads: &mut dfdx_core::tensor::Gradients) -> Result<(), dfdx_core::tensor::Error> { Ok(()) } } @@ -261,7 +258,7 @@ pub fn custom_module(input: proc_macro::TokenStream) -> proc_macro::TokenStream quote! { impl #build_impl dfdx_core::nn_traits::BuildOnDevice for #builder_name #builder_ty #built_where { type Built = #built_name #built_ty; - fn try_build_on_device(&self, device: &Dev) -> Result { + fn try_build_on_device(&self, device: &Dev) -> Result { let built = #built_name { #(#recurse)* }; Ok(built) } @@ -280,7 +277,7 @@ pub fn custom_module(input: proc_macro::TokenStream) -> proc_macro::TokenStream quote! { impl #build_impl dfdx_core::nn_traits::BuildOnDevice for #builder_name #builder_ty #built_where { type Built = #built_name #built_ty; - fn try_build_on_device(&self, device: &Dev) -> Result { + fn try_build_on_device(&self, device: &Dev) -> Result { let built = #built_name(#(#recurse)*); Ok(built) } @@ -291,7 +288,7 @@ pub fn custom_module(input: proc_macro::TokenStream) -> proc_macro::TokenStream quote! { impl #build_impl dfdx_core::nn_traits::BuildOnDevice for #builder_name #builder_ty #built_where { type Built = #built_name #built_ty; - fn try_build_on_device(&self, device: &Dev) -> Result { + fn try_build_on_device(&self, device: &Dev) -> Result { Ok(#built_name) } } @@ -422,7 +419,7 @@ pub fn sequential(input: proc_macro::TokenStream) -> proc_macro::TokenStream { quote! { impl #built_impl dfdx_core::nn_traits::BuildOnDevice for #builder_name #builder_ty #built_where { type Built = #built_name #built_ty; - fn try_build_on_device(&self, device: &Dev) -> Result { + fn try_build_on_device(&self, device: &Dev) -> Result { let built = #built_name { #(#recurse)* }; @@ -439,7 +436,7 @@ pub fn sequential(input: proc_macro::TokenStream) -> proc_macro::TokenStream { quote! { impl #built_impl dfdx_core::nn_traits::BuildOnDevice for #builder_name #builder_ty #built_where { type Built = #built_name #built_ty; - fn try_build_on_device(&self, device: &Dev) -> Result { + fn try_build_on_device(&self, device: &Dev) -> Result { #built_name( #(#recurse)* ) @@ -455,12 +452,8 @@ pub fn sequential(input: proc_macro::TokenStream) -> proc_macro::TokenStream { // Get's the output type of the sequential. Also adds Module bounds to the where clause. 
let mut last_ty = quote!(Input); - let err = quote!(::Err); let output_ty = { let where_clause = module_generics.make_where_clause(); - where_clause - .predicates - .push(parse_quote!(Input: dfdx_core::prelude::HasErr)); match &input.data { Data::Struct(ref obj) => match obj.fields { Fields::Named(ref fields) => { @@ -471,7 +464,7 @@ pub fn sequential(input: proc_macro::TokenStream) -> proc_macro::TokenStream { .push(parse_quote!(#ty: dfdx_core::nn_traits::BuildOnDevice)); where_clause .predicates - .push(parse_quote!(<#ty as dfdx_core::nn_traits::BuildOnDevice>::Built: dfdx_core::nn_traits::Module<#last_ty, Error = #err>)); + .push(parse_quote!(<#ty as dfdx_core::nn_traits::BuildOnDevice>::Built: dfdx_core::nn_traits::Module<#last_ty>)); last_ty = parse_quote!(<<#ty as dfdx_core::nn_traits::BuildOnDevice>::Built as dfdx_core::nn_traits::Module<#last_ty>>::Output); }); } @@ -483,7 +476,7 @@ pub fn sequential(input: proc_macro::TokenStream) -> proc_macro::TokenStream { .push(parse_quote!(#ty: dfdx_core::nn_traits::BuildOnDevice)); where_clause .predicates - .push(parse_quote!(<#ty as dfdx_core::nn_traits::BuildOnDevice>::Built: dfdx_core::nn_traits::Module<#last_ty, Error = #err>)); + .push(parse_quote!(<#ty as dfdx_core::nn_traits::BuildOnDevice>::Built: dfdx_core::nn_traits::Module<#last_ty>)); last_ty = parse_quote!(<<#ty as dfdx_core::nn_traits::BuildOnDevice>::Built as dfdx_core::nn_traits::Module<#last_ty>>::Output); }); } @@ -523,8 +516,7 @@ pub fn sequential(input: proc_macro::TokenStream) -> proc_macro::TokenStream { quote! { impl #module_impl dfdx_core::nn_traits::Module for #built_name #built_ty #module_where { type Output = #output_ty; - type Error = #err; - fn try_forward(&self, x: Input) -> Result { + fn try_forward(&self, x: Input) -> Result { #src Ok(x) } @@ -606,7 +598,7 @@ pub fn reset_params(input: proc_macro::TokenStream) -> proc_macro::TokenStream { proc_macro::TokenStream::from(quote! { impl #impl_generics dfdx_core::nn_traits::ResetParams for #name #ty_generics #where_clause { - fn try_reset_params(&mut self) -> Result<(), Dev::Err> { + fn try_reset_params(&mut self) -> Result<(), dfdx_core::tensor::Error> { #resets Ok(()) } @@ -690,7 +682,7 @@ pub fn update_params(input: proc_macro::TokenStream) -> proc_macro::TokenStream optimizer: &mut Optim, gradients: &dfdx_core::tensor::Gradients, missing_tensors: &mut Vec, - ) -> Result<(), Dev::Err> { + ) -> Result<(), dfdx_core::tensor::Error> { #updates Ok(()) } @@ -773,7 +765,7 @@ pub fn zero_grads(input: proc_macro::TokenStream) -> proc_macro::TokenStream { proc_macro::TokenStream::from(quote! { impl #impl_generics dfdx_core::nn_traits::ZeroGrads for #name #ty_generics #where_clause { - fn try_zero_grads(&self, grads: &mut dfdx_core::prelude::Gradients) -> Result<(), Dev::Err> { + fn try_zero_grads(&self, grads: &mut dfdx_core::prelude::Gradients) -> Result<(), dfdx_core::tensor::Error> { #zero_grads Ok(()) } diff --git a/dfdx/examples/advanced-train-loop.rs b/dfdx/examples/advanced-train-loop.rs index 9e130a3fb..5c5ed0ed2 100644 --- a/dfdx/examples/advanced-train-loop.rs +++ b/dfdx/examples/advanced-train-loop.rs @@ -12,7 +12,7 @@ fn classification_train< Lbl, // Our model just needs to implement these two things! ModuleMut for forward // and TensorCollection for optimizer/alloc_grads/zero_grads - Model: Module + ZeroGrads + UpdateParams, + Model: Module + ZeroGrads + UpdateParams, // optimizer, pretty straight forward Opt: Optimizer, // our data will just be any iterator over these items. easy! 
@@ -22,7 +22,7 @@ fn classification_train< Criterion: FnMut(Model::Output, Lbl) -> Loss, // the Loss needs to be able to call backward, and we also use // this generic as an output - Loss: Backward + AsArray, + Loss: Backward + AsArray, // Dtype & Device to tie everything together E: Dtype, D: Device, @@ -32,7 +32,7 @@ fn classification_train< mut criterion: Criterion, data: Data, batch_accum: usize, -) -> Result<(), D::Err> { +) -> Result<(), Error> { let mut grads = model.try_alloc_grads()?; for (i, (inp, lbl)) in data.enumerate() { let y = model.try_forward_mut(inp.traced(grads))?; diff --git a/dfdx/src/nn/layers/abs.rs b/dfdx/src/nn/layers/abs.rs index 46f0932ea..f9398e6d3 100644 --- a/dfdx/src/nn/layers/abs.rs +++ b/dfdx/src/nn/layers/abs.rs @@ -7,8 +7,7 @@ impl, T: Tape> crate::nn::Module; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_abs() } } diff --git a/dfdx/src/nn/layers/add_into.rs b/dfdx/src/nn/layers/add_into.rs index e1d46629f..983103396 100644 --- a/dfdx/src/nn/layers/add_into.rs +++ b/dfdx/src/nn/layers/add_into.rs @@ -31,7 +31,7 @@ pub struct AddInto( impl, T: BuildOnDevice> BuildOnDevice for AddInto { type Built = AddInto; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let t = self.0.try_build_on_device(device)?; Ok(AddInto(t)) } @@ -45,17 +45,16 @@ macro_rules! sum { macro_rules! add_into_impls { ($([$Mod:tt $ModVar:tt $Inp:tt $InpVar:tt]),+) => { impl< - Out: TryAdd, + Out: TryAdd, Ai, $($Inp, )+ A: Module, - $($Mod: Module<$Inp, Output = Out, Error = A::Error>, )+ + $($Mod: Module<$Inp, Output = Out>, )+ > Module<(Ai, $($Inp, )+)> for AddInto<(A, $($Mod, )+)> { type Output = Out; - type Error = A::Error; #[allow(clippy::needless_question_mark)] - fn try_forward(&self, x: (Ai, $($Inp, )+)) -> Result { + fn try_forward(&self, x: (Ai, $($Inp, )+)) -> Result { let (a, $($ModVar, )+) = &self.0; let (a_i, $($InpVar, )+) = x; let a_i = a.try_forward(a_i)?; @@ -63,7 +62,7 @@ macro_rules! 
add_into_impls { Ok(sum!(a_i, $($InpVar),*)) } #[allow(clippy::needless_question_mark)] - fn try_forward_mut(&mut self, x: (Ai, $($Inp, )+)) -> Result { + fn try_forward_mut(&mut self, x: (Ai, $($Inp, )+)) -> Result { let (a, $($ModVar, )+) = &mut self.0; let (a_i, $($InpVar, )+) = x; let a_i = a.try_forward_mut(a_i)?; diff --git a/dfdx/src/nn/layers/batch_norm1d.rs b/dfdx/src/nn/layers/batch_norm1d.rs index e0cf2da07..e6186365d 100644 --- a/dfdx/src/nn/layers/batch_norm1d.rs +++ b/dfdx/src/nn/layers/batch_norm1d.rs @@ -42,7 +42,7 @@ pub type BatchNorm1DConstConfig = BatchNorm1DConfig>; impl> BuildOnDevice for BatchNorm1DConfig { type Built = BatchNorm1D; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(BatchNorm1D { scale: device.try_ones_like(&(self.0,))?, bias: device.try_zeros_like(&(self.0,))?, @@ -82,7 +82,7 @@ pub struct BatchNorm1D> { } impl> crate::nn::ResetParams for BatchNorm1D { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { self.scale.try_fill_with_ones()?; self.bias.try_fill_with_zeros()?; self.running_mean.try_fill_with_zeros()?; @@ -94,12 +94,11 @@ impl, T: Tape> crate::nn::Module> for BatchNorm1D { type Output = Tensor<(B, C), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(B, C), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C), E, D, T>) -> Result { assert!(!T::OWNS_TAPE); self.infer_fwd(x) } - fn try_forward_mut(&mut self, x: Tensor<(B, C), E, D, T>) -> Result { + fn try_forward_mut(&mut self, x: Tensor<(B, C), E, D, T>) -> Result { assert!(T::OWNS_TAPE); self.train_fwd(x) } @@ -109,15 +108,11 @@ impl, T: Tape> crate::nn::Module> for BatchNorm1D { type Output = Tensor<(B, C, L), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(B, C, L), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C, L), E, D, T>) -> Result { assert!(!T::OWNS_TAPE); self.infer_fwd(x) } - fn try_forward_mut( - &mut self, - x: Tensor<(B, C, L), E, D, T>, - ) -> Result { + fn try_forward_mut(&mut self, x: Tensor<(B, C, L), E, D, T>) -> Result { assert!(T::OWNS_TAPE); self.train_fwd(x) } @@ -128,7 +123,7 @@ impl> BatchNorm1D { fn train_fwd, Ax: Axes>( &mut self, x: Tensor, - ) -> Result, D::Err> + ) -> Result, crate::tensor::Error> where S: HasAxes + ReduceShapeTo<(C,), Ax>, { @@ -175,7 +170,7 @@ impl> BatchNorm1D { pub fn infer_fwd, Ax: Axes>( &self, x: Tensor, - ) -> Result, D::Err> + ) -> Result, crate::tensor::Error> where (C,): BroadcastShapeTo, { diff --git a/dfdx/src/nn/layers/batch_norm2d.rs b/dfdx/src/nn/layers/batch_norm2d.rs index 40ca6243b..c6f592d38 100644 --- a/dfdx/src/nn/layers/batch_norm2d.rs +++ b/dfdx/src/nn/layers/batch_norm2d.rs @@ -44,7 +44,7 @@ pub type BatchNorm2DConstConfig = BatchNorm2DConfig>; impl> crate::nn::BuildOnDevice for BatchNorm2DConfig { type Built = BatchNorm2D; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(BatchNorm2D { scale: device.try_ones_like(&(self.0,))?, bias: device.try_zeros_like(&(self.0,))?, @@ -76,7 +76,7 @@ pub struct BatchNorm2D> { } impl> crate::nn::ResetParams for BatchNorm2D { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { self.scale.try_fill_with_ones()?; self.bias.try_fill_with_zeros()?; self.running_mean.try_fill_with_zeros()?; @@ -88,15 +88,11 @@ impl, T: Tape> crate::nn::Module> for 
BatchNorm2D { type Output = Tensor<(C, H, W), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(C, H, W), E, D, T>) -> Result { assert!(!T::OWNS_TAPE); self.infer_fwd(x) } - fn try_forward_mut( - &mut self, - x: Tensor<(C, H, W), E, D, T>, - ) -> Result { + fn try_forward_mut(&mut self, x: Tensor<(C, H, W), E, D, T>) -> Result { assert!(T::OWNS_TAPE); self.train_fwd(x) } @@ -106,18 +102,14 @@ impl, T: Tape> crate::nn::Module> for BatchNorm2D { type Output = Tensor<(Batch, C, H, W), E, D, T>; - type Error = D::Err; - fn try_forward( - &self, - x: Tensor<(Batch, C, H, W), E, D, T>, - ) -> Result { + fn try_forward(&self, x: Tensor<(Batch, C, H, W), E, D, T>) -> Result { assert!(!T::OWNS_TAPE); self.infer_fwd(x) } fn try_forward_mut( &mut self, x: Tensor<(Batch, C, H, W), E, D, T>, - ) -> Result { + ) -> Result { assert!(T::OWNS_TAPE); self.train_fwd(x) } @@ -128,7 +120,7 @@ impl> BatchNorm2D { fn train_fwd, Ax: Axes>( &mut self, x: Tensor, - ) -> Result, D::Err> + ) -> Result, crate::tensor::Error> where S: HasAxes + ReduceShapeTo<(C,), Ax>, { @@ -175,7 +167,7 @@ impl> BatchNorm2D { pub fn infer_fwd, Ax: Axes>( &self, x: Tensor, - ) -> Result, D::Err> + ) -> Result, crate::tensor::Error> where (C,): BroadcastShapeTo, { diff --git a/dfdx/src/nn/layers/bias1d.rs b/dfdx/src/nn/layers/bias1d.rs index feeed96fd..eb68a67bb 100644 --- a/dfdx/src/nn/layers/bias1d.rs +++ b/dfdx/src/nn/layers/bias1d.rs @@ -28,7 +28,7 @@ pub type Bias1DConstConfig = Bias1DConfig>; impl> BuildOnDevice for Bias1DConfig { type Built = Bias1D; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(Bias1D { bias: device.try_zeros_like(&(self.0,))?, }) @@ -44,7 +44,7 @@ pub struct Bias1D> { } impl> ResetParams for Bias1D { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { self.bias.try_fill_with_zeros() } } @@ -53,8 +53,7 @@ impl, T: Tape> Module for Bias1D { type Output = Tensor<(I,), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(I,), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(I,), E, D, T>) -> Result { x.try_add(self.bias.clone()) } } @@ -63,8 +62,7 @@ impl, T: Tape> Module { type Output = Tensor<(Batch, I), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(Batch, I), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(Batch, I), E, D, T>) -> Result { self.bias.retaped::().broadcast_like(&x).try_add(x) } } @@ -73,11 +71,7 @@ impl, T: Tape> Module> for Bias1D { type Output = Tensor<(Batch, Seq, I), E, D, T>; - type Error = D::Err; - fn try_forward( - &self, - x: Tensor<(Batch, Seq, I), E, D, T>, - ) -> Result { + fn try_forward(&self, x: Tensor<(Batch, Seq, I), E, D, T>) -> Result { self.bias.retaped::().broadcast_like(&x).try_add(x) } } diff --git a/dfdx/src/nn/layers/bias2d.rs b/dfdx/src/nn/layers/bias2d.rs index 497ceb196..1b6ac42ac 100644 --- a/dfdx/src/nn/layers/bias2d.rs +++ b/dfdx/src/nn/layers/bias2d.rs @@ -28,7 +28,7 @@ pub type Bias2DConstConfig = Bias2DConfig>; impl> BuildOnDevice for Bias2DConfig { type Built = Bias2D; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(Bias2D { bias: device.try_zeros_like(&(self.0,))?, }) @@ -44,7 +44,7 @@ pub struct Bias2D> { } impl> ResetParams for Bias2D { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn 
try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { self.bias.try_fill_with_zeros() } } @@ -53,8 +53,7 @@ impl, T: Tape> Module> for Bias2D { type Output = Tensor<(C, H, W), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(C, H, W), E, D, T>) -> Result { self.bias.retaped::().broadcast_like(&x).try_add(x) } } @@ -63,8 +62,7 @@ impl, T: Tape> Module> for Bias2D { type Output = Tensor<(B, C, H, W), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(B, C, H, W), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C, H, W), E, D, T>) -> Result { self.bias.retaped::().broadcast_like(&x).try_add(x) } } diff --git a/dfdx/src/nn/layers/conv1d.rs b/dfdx/src/nn/layers/conv1d.rs index ba69e7672..793b326a9 100644 --- a/dfdx/src/nn/layers/conv1d.rs +++ b/dfdx/src/nn/layers/conv1d.rs @@ -62,7 +62,7 @@ where >::Output: Dim, { type Built = Conv1D; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { assert_eq!(self.in_chan.size() % self.groups.size(), 0); assert_eq!(self.out_chan.size() % self.groups.size(), 0); let i_over_g = self.in_chan / self.groups; @@ -119,7 +119,7 @@ where E: Dtype + num_traits::Float + rand_distr::uniform::SampleUniform, D: Device, { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { let (_, i_over_g, k) = self.weight.shape(); let scale = (1.0 / (k.size() * i_over_g.size()) as f64).sqrt(); let b = E::from_f64(scale).unwrap(); @@ -143,14 +143,7 @@ where L, G, >>::Convolved; - type Error = <(Img, Tensor<(O, >::Output, K), E, D>) as TryConv1D< - S, - P, - L, - G, - >>::Error; - - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { (x, self.weight.clone()).try_conv1d(self.stride, self.padding, self.dilation, self.groups) } } diff --git a/dfdx/src/nn/layers/conv2d.rs b/dfdx/src/nn/layers/conv2d.rs index 28a2552d6..9e65f41ff 100644 --- a/dfdx/src/nn/layers/conv2d.rs +++ b/dfdx/src/nn/layers/conv2d.rs @@ -78,7 +78,7 @@ where >::Output: Dim, { type Built = Conv2D; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { assert_eq!(self.in_chan.size() % self.groups.size(), 0); assert_eq!(self.out_chan.size() % self.groups.size(), 0); let i_over_g = self.in_chan / self.groups; @@ -141,7 +141,7 @@ where E: Dtype + num_traits::Float + rand_distr::uniform::SampleUniform, D: Device, { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { let (_, i_over_g, k, _) = self.weight.shape(); let scale = E::from_f64(1.0 / (k.size() * k.size() * i_over_g.size()) as f64).unwrap(); let b = scale.sqrt(); @@ -166,12 +166,7 @@ where Img, Tensor<(O, >::Output, K, K), E, D>, ) as TryConv2D>::Convolved; - type Error = <( - Img, - Tensor<(O, >::Output, K, K), E, D>, - ) as TryConv2D>::Error; - - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { (x, self.weight.clone()).try_conv2d(self.stride, self.padding, self.dilation, self.groups) } } diff --git a/dfdx/src/nn/layers/conv_trans2d.rs b/dfdx/src/nn/layers/conv_trans2d.rs index bfc2bf822..b4730a924 100644 --- a/dfdx/src/nn/layers/conv_trans2d.rs +++ b/dfdx/src/nn/layers/conv_trans2d.rs @@ -60,7 +60,7 @@ where >::Output: Dim, { type Built = ConvTrans2D; - fn try_build_on_device(&self, device: &D) -> 
Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { assert_eq!(self.in_chan.size() % self.groups.size(), 0); assert_eq!(self.out_chan.size() % self.groups.size(), 0); let o_over_g = self.out_chan / self.groups; @@ -119,7 +119,7 @@ where E: Dtype + num_traits::Float + rand_distr::uniform::SampleUniform, D: Device, { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { let (_, o_over_g, k, _) = self.weight.shape(); let b = (1.0 / (k.size() * k.size() * o_over_g.size()) as f64).sqrt(); let b = E::from_f64(b).unwrap(); @@ -144,12 +144,7 @@ where Img, Tensor<(I, >::Output, K, K), E, D>, ) as TryConvTrans2D>::Convolved; - type Error = <( - Img, - Tensor<(I, >::Output, K, K), E, D>, - ) as TryConvTrans2D>::Error; - - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { (x, self.weight.clone()).try_convtrans2d( self.stride, self.padding, diff --git a/dfdx/src/nn/layers/cos.rs b/dfdx/src/nn/layers/cos.rs index c0aa5fa5c..d9abf4e26 100644 --- a/dfdx/src/nn/layers/cos.rs +++ b/dfdx/src/nn/layers/cos.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Cos; impl, T: Tape> Module> for Cos { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_cos() } } diff --git a/dfdx/src/nn/layers/dropout.rs b/dfdx/src/nn/layers/dropout.rs index 741623ec3..a446ff912 100644 --- a/dfdx/src/nn/layers/dropout.rs +++ b/dfdx/src/nn/layers/dropout.rs @@ -23,10 +23,9 @@ impl, T: Tape> Module { type Output = Tensor; - type Error = D::Err; /// Does nothing - fn try_forward(&self, input: Tensor) -> Result { + fn try_forward(&self, input: Tensor) -> Result { assert!( !T::OWNS_TAPE, "DropoutOneIn::try_forward input must not be traced." @@ -35,7 +34,7 @@ impl, T: Tape> Module) -> Result { + fn try_forward_mut(&mut self, x: Tensor) -> Result { assert!( T::OWNS_TAPE, "DropoutOneIn::try_forward_mut input must be traced." @@ -70,10 +69,9 @@ impl Default for Dropout { impl, T: Tape> Module> for Dropout { type Output = Tensor; - type Error = D::Err; /// Does nothing - fn try_forward(&self, input: Tensor) -> Result { + fn try_forward(&self, input: Tensor) -> Result { assert!( !T::OWNS_TAPE, "Dropout::try_forward input must not be traced." @@ -82,7 +80,7 @@ impl, T: Tape> Module> } /// Applies dropout to the input tensor. - fn try_forward_mut(&mut self, x: Tensor) -> Result { + fn try_forward_mut(&mut self, x: Tensor) -> Result { assert!( T::OWNS_TAPE, "Dropout::try_forward_mut input must be traced." 
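
Every layer hunk above repeats the same two-line change: the `type Error = D::Err;` associated type is deleted and `try_forward` / `try_forward_mut` now return `Result<Self::Output, Error>` using the single `dfdx::tensor::Error` enum. A minimal sketch of a custom element-wise layer written against the new signature follows; the struct name `MyAbs` and the reconstructed generic bounds are illustrative assumptions (the exact type parameters are not reproduced in the hunks above), only the shape of the impl comes from this patch.

use dfdx::prelude::*;

// Hypothetical re-implementation of the `Abs` layer from the derive-macro docs,
// written against the post-#875 Module signature.
#[derive(Default, Debug, Clone, Copy)]
pub struct MyAbs;

impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>> for MyAbs {
    type Output = Tensor<S, E, D, T>;
    // No `type Error = D::Err;` any more: every fallible forward pass returns
    // the unified `dfdx::tensor::Error`.
    fn try_forward(&self, x: Tensor<S, E, D, T>) -> Result<Self::Output, Error> {
        x.try_abs()
    }
    // `try_forward_mut` is left to its default, which forwards to `try_forward`.
}

Downstream code that previously named `M::Error` or bounded modules with `Error = D::Err` can drop those bounds entirely, which is what the `sequential` derive change above does.
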
diff --git a/dfdx/src/nn/layers/embedding.rs b/dfdx/src/nn/layers/embedding.rs index 9f7767a48..2979c9d70 100644 --- a/dfdx/src/nn/layers/embedding.rs +++ b/dfdx/src/nn/layers/embedding.rs @@ -43,7 +43,7 @@ pub type EmbeddingConstConfig = impl> BuildOnDevice for EmbeddingConfig { type Built = Embedding; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(Embedding { weight: device.try_zeros_like(&(self.vocab, self.model))?, }) @@ -62,7 +62,7 @@ impl> ResetParams for Embedding, { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { self.weight.try_fill_with_distr(rand_distr::StandardNormal) } } @@ -71,9 +71,11 @@ impl, T: Tape> Module> for Embedding { type Output = Tensor<(Seq, M), E, D, T>; - type Error = D::Err; - fn try_forward(&self, input: Tensor<(Seq,), usize, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(Seq,), usize, D, T>, + ) -> Result { let (input, tape) = input.split_tape(); self.weight.clone().put_tape(tape).try_gather(input) } @@ -83,12 +85,11 @@ impl, T: Tape Module> for Embedding { type Output = Tensor<(Batch, Seq, M), E, D, T>; - type Error = D::Err; fn try_forward( &self, input: Tensor<(Batch, Seq), usize, D, T>, - ) -> Result { + ) -> Result { let (input, tape) = input.split_tape(); self.weight.clone().put_tape(tape).try_gather(input) } diff --git a/dfdx/src/nn/layers/exp.rs b/dfdx/src/nn/layers/exp.rs index a34c9b6fa..fd8b02492 100644 --- a/dfdx/src/nn/layers/exp.rs +++ b/dfdx/src/nn/layers/exp.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Exp; impl, T: Tape> Module> for Exp { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_exp() } } diff --git a/dfdx/src/nn/layers/flatten2d.rs b/dfdx/src/nn/layers/flatten2d.rs index a47b18875..bc414e3d9 100644 --- a/dfdx/src/nn/layers/flatten2d.rs +++ b/dfdx/src/nn/layers/flatten2d.rs @@ -14,9 +14,11 @@ where <>::Output as Mul>::Output: Dim, { type Output = Tensor<(<>::Output as Mul>::Output,), E, D, T>; - type Error = D::Err; - fn try_forward(&self, input: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(C, H, W), E, D, T>, + ) -> Result { let (c, h, w) = *input.shape(); let dst = (c * h * w,); input.try_reshape_like(&dst) @@ -31,12 +33,11 @@ where <>::Output as Mul>::Output: Dim, { type Output = Tensor<(Batch, <>::Output as Mul>::Output), E, D, T>; - type Error = D::Err; fn try_forward( &self, input: Tensor<(Batch, C, H, W), E, D, T>, - ) -> Result { + ) -> Result { let (batch, c, h, w) = *input.shape(); let dst = (batch, c * h * w); input.try_reshape_like(&dst) diff --git a/dfdx/src/nn/layers/gelu.rs b/dfdx/src/nn/layers/gelu.rs index 673094ead..4fa2188d0 100644 --- a/dfdx/src/nn/layers/gelu.rs +++ b/dfdx/src/nn/layers/gelu.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct FastGeLU; impl, T: Tape> Module> for FastGeLU { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_fast_gelu() } } @@ -16,8 +15,7 @@ impl, T: Tape> Module> pub struct AccurateGeLU; impl, T: Tape> Module> for AccurateGeLU { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_accurate_gelu() } } diff --git a/dfdx/src/nn/layers/generalized_add.rs b/dfdx/src/nn/layers/generalized_add.rs index 
6ab0b8886..fb60b9c37 100644 --- a/dfdx/src/nn/layers/generalized_add.rs +++ b/dfdx/src/nn/layers/generalized_add.rs @@ -34,27 +34,25 @@ impl, T: BuildOnDevice, U: BuildOnDevice> Bui for GeneralizedAdd { type Built = GeneralizedAdd; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let t = self.t.try_build_on_device(device)?; let u = self.u.try_build_on_device(device)?; Ok(GeneralizedAdd { t, u }) } } -impl, U: Module> Module - for GeneralizedAdd +impl, U: Module> Module for GeneralizedAdd where - T::Output: TryAdd, + T::Output: TryAdd, { type Output = >::Output; - type Error = T::Error; - fn try_forward(&self, x: X) -> Result { + fn try_forward(&self, x: X) -> Result { let t = self.t.try_forward(x.with_empty_tape())?; let u = self.u.try_forward(x)?; t.try_add(u) } - fn try_forward_mut(&mut self, x: X) -> Result { + fn try_forward_mut(&mut self, x: X) -> Result { let t = self.t.try_forward_mut(x.with_empty_tape())?; let u = self.u.try_forward_mut(x)?; t.try_add(u) diff --git a/dfdx/src/nn/layers/generalized_mul.rs b/dfdx/src/nn/layers/generalized_mul.rs index a253d5f08..b6b5b5bae 100644 --- a/dfdx/src/nn/layers/generalized_mul.rs +++ b/dfdx/src/nn/layers/generalized_mul.rs @@ -33,27 +33,25 @@ impl, T: BuildOnDevice, U: BuildOnDevice> Bui for GeneralizedMul { type Built = GeneralizedMul; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let t = self.t.try_build_on_device(device)?; let u = self.u.try_build_on_device(device)?; Ok(GeneralizedMul { t, u }) } } -impl, U: Module> Module - for GeneralizedMul +impl, U: Module> Module for GeneralizedMul where - T::Output: TryMul, + T::Output: TryMul, { type Output = >::Output; - type Error = T::Error; - fn try_forward(&self, x: X) -> Result { + fn try_forward(&self, x: X) -> Result { let t = self.t.try_forward(x.with_empty_tape())?; let u = self.u.try_forward(x)?; t.try_mul(u) } - fn try_forward_mut(&mut self, x: X) -> Result { + fn try_forward_mut(&mut self, x: X) -> Result { let t = self.t.try_forward_mut(x.with_empty_tape())?; let u = self.u.try_forward_mut(x)?; t.try_mul(u) diff --git a/dfdx/src/nn/layers/layer_norm1d.rs b/dfdx/src/nn/layers/layer_norm1d.rs index 1bdb1a381..a1381534a 100644 --- a/dfdx/src/nn/layers/layer_norm1d.rs +++ b/dfdx/src/nn/layers/layer_norm1d.rs @@ -28,7 +28,7 @@ pub type LayerNorm1DConstConfig = LayerNorm1DConfig>; impl> BuildOnDevice for LayerNorm1DConfig { type Built = LayerNorm1D; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(LayerNorm1D { gamma: device.try_ones_like(&(self.0,))?, beta: device.try_zeros_like(&(self.0,))?, @@ -51,7 +51,7 @@ pub struct LayerNorm1D> { } impl> ResetParams for LayerNorm1D { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { self.gamma.try_fill_with_ones()?; self.beta.try_fill_with_zeros() } @@ -61,8 +61,7 @@ impl, T: Tape> Module for LayerNorm1D { type Output = Tensor<(M,), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(M,), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(M,), E, D, T>) -> Result { x.try_normalize(self.epsilon)? .try_mul(self.gamma.clone())? 
.try_add(self.beta.clone()) @@ -73,8 +72,7 @@ impl, T: Tape> Module { type Output = Tensor<(Batch, M), E, D, T>; - type Error = D::Err; - fn try_forward(&self, x: Tensor<(Batch, M), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(Batch, M), E, D, T>) -> Result { let x = x.try_normalize::>(self.epsilon)?; let x = self.gamma.retaped::().broadcast_like(&x).try_mul(x)?; self.beta.retaped::().broadcast_like(&x).try_add(x) @@ -85,11 +83,7 @@ impl, T: Tape> Module> for LayerNorm1D { type Output = Tensor<(Batch, Seq, M), E, D, T>; - type Error = D::Err; - fn try_forward( - &self, - x: Tensor<(Batch, Seq, M), E, D, T>, - ) -> Result { + fn try_forward(&self, x: Tensor<(Batch, Seq, M), E, D, T>) -> Result { let x = x.try_normalize::>(self.epsilon)?; let x = self.gamma.retaped::().broadcast_like(&x).try_mul(x)?; self.beta.retaped::().broadcast_like(&x).try_add(x) diff --git a/dfdx/src/nn/layers/leaky_relu.rs b/dfdx/src/nn/layers/leaky_relu.rs index 27ee7c214..a4ab83aaf 100644 --- a/dfdx/src/nn/layers/leaky_relu.rs +++ b/dfdx/src/nn/layers/leaky_relu.rs @@ -12,8 +12,7 @@ impl Default for LeakyReLU { impl, T: Tape> Module> for LeakyReLU { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_prelu(E::from_f64(self.0).unwrap()) } } diff --git a/dfdx/src/nn/layers/linear.rs b/dfdx/src/nn/layers/linear.rs index 74d6c711c..ce63b63af 100644 --- a/dfdx/src/nn/layers/linear.rs +++ b/dfdx/src/nn/layers/linear.rs @@ -38,7 +38,7 @@ pub type LinearConstConfig = LinearConfig> BuildOnDevice for LinearConfig { type Built = Linear; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(Linear { weight: device.try_zeros_like(&(self.out, self.inp))?, bias: device.try_zeros_like(&(self.out,))?, @@ -61,7 +61,7 @@ impl> ResetParams for Linear where E: Dtype + num_traits::Float + rand_distr::uniform::SampleUniform, { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { let (_o, i) = self.weight.shape(); let b = E::from_f64(1.0 / (i.size() as f64).sqrt()).unwrap(); self.weight.try_fill_with_distr(Uniform::new(-b, b))?; @@ -72,15 +72,13 @@ where impl, T: Tape> Module> for Linear where - Tensor: TryMatMul, Err = D::Err>, - Bias1D: - Module< as TryMatMul>>::Output, Error = D::Err>, + Tensor: TryMatMul>, + Bias1D: Module< as TryMatMul>>::Output>, { type Output = as Module< as TryMatMul>>::Output, >>::Output; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { let weight = self.weight.retaped::().try_permute()?; let bias = Bias1D { bias: self.bias.clone(), diff --git a/dfdx/src/nn/layers/ln.rs b/dfdx/src/nn/layers/ln.rs index 8d7124845..a493a9fc7 100644 --- a/dfdx/src/nn/layers/ln.rs +++ b/dfdx/src/nn/layers/ln.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Ln; impl, T: Tape> Module> for Ln { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_ln() } } diff --git a/dfdx/src/nn/layers/log_softmax.rs b/dfdx/src/nn/layers/log_softmax.rs index 47c58e783..c99a20fb5 100644 --- a/dfdx/src/nn/layers/log_softmax.rs +++ b/dfdx/src/nn/layers/log_softmax.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct LogSoftmax; impl, T: Tape> Module> for LogSoftmax { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + 
fn try_forward(&self, x: Tensor) -> Result { x.try_log_softmax() } } diff --git a/dfdx/src/nn/layers/matmul.rs b/dfdx/src/nn/layers/matmul.rs index 7b1de6694..86a3fa52f 100644 --- a/dfdx/src/nn/layers/matmul.rs +++ b/dfdx/src/nn/layers/matmul.rs @@ -28,7 +28,7 @@ pub type MatMulConstConfig = MatMulConfig> BuildOnDevice for MatMulConfig { type Built = MatMul; - fn try_build_on_device(&self, device: &D) -> Result { + fn try_build_on_device(&self, device: &D) -> Result { Ok(MatMul { weight: device.try_zeros_like(&(self.out, self.inp))?, }) @@ -47,7 +47,7 @@ impl> ResetParams for MatMul where E: Dtype + num_traits::Float + rand_distr::uniform::SampleUniform, { - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), Error> { let (_o, i) = self.weight.shape(); let scale = E::from_f64(1.0 / (i.size() as f64).sqrt()).unwrap(); self.weight.try_fill_with_distr(Uniform::new(-scale, scale)) @@ -57,11 +57,10 @@ where impl, T: Tape> Module> for MatMul where - Tensor: TryMatMul, Err = D::Err>, + Tensor: TryMatMul>, { type Output = as TryMatMul>>::Output; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_matmul(self.weight.retaped::().try_permute()?) } } diff --git a/dfdx/src/nn/layers/multi_head_attention.rs b/dfdx/src/nn/layers/multi_head_attention.rs index 0ccde7b9f..1232b4337 100644 --- a/dfdx/src/nn/layers/multi_head_attention.rs +++ b/dfdx/src/nn/layers/multi_head_attention.rs @@ -64,7 +64,6 @@ where T: Tape, { type Output = Tensor<(S1, M), E, D, T>; - type Error = D::Err; /// Encoder-Decoder style self attention where one set of tensors is used for values and keys, and another is used for queries fn try_forward( @@ -74,7 +73,7 @@ where Tensor<(S2, M), E, D>, Tensor<(S2, M), E, D>, ), - ) -> Result { + ) -> Result { assert_eq!(k.shape().0, v.shape().0); let (s1, m) = *q.shape(); let s2 = k.shape().0; @@ -101,7 +100,6 @@ where T: Tape, { type Output = Tensor<(B, S1, M), E, D, T>; - type Error = D::Err; /// Batched Encoder-Decoder style self attention where one set of tensors is used for values and keys, and another is used for queries fn try_forward( @@ -111,7 +109,7 @@ where Tensor<(B, S2, M), E, D>, Tensor<(B, S2, M), E, D>, ), - ) -> Result { + ) -> Result { assert_eq!(q.shape().0, k.shape().0); assert_eq!(q.shape().0, v.shape().0); assert_eq!(k.shape().1, v.shape().1); @@ -153,12 +151,11 @@ where E: Dtype, D: Device, Src: SplitTape, - Self: Module<(Src, Src::NoTape, Src::NoTape), Output = Src, Error = D::Err>, + Self: Module<(Src, Src::NoTape, Src::NoTape), Output = Src>, { type Output = Src; - type Error = D::Err; - fn try_forward(&self, src: Src) -> Result { + fn try_forward(&self, src: Src) -> Result { let (src, tape) = src.split_tape(); self.try_forward((src.clone().put_tape(tape), src.clone(), src)) } diff --git a/dfdx/src/nn/layers/pool_2d_avg.rs b/dfdx/src/nn/layers/pool_2d_avg.rs index 39e2d33da..6a32e86df 100644 --- a/dfdx/src/nn/layers/pool_2d_avg.rs +++ b/dfdx/src/nn/layers/pool_2d_avg.rs @@ -32,9 +32,8 @@ impl> Module for AvgPool2D { type Output = Img::Pooled; - type Error = Img::Error; - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { x.try_pool2d( crate::tensor_ops::Pool2DKind::Avg, self.kernel_size, diff --git a/dfdx/src/nn/layers/pool_2d_max.rs b/dfdx/src/nn/layers/pool_2d_max.rs index 09fd1ca4e..8a9ac78fd 100644 --- a/dfdx/src/nn/layers/pool_2d_max.rs +++ b/dfdx/src/nn/layers/pool_2d_max.rs @@ -32,9 +32,7 @@ impl> Module 
for MaxPool2D { type Output = Img::Pooled; - type Error = Img::Error; - - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { x.try_pool2d( crate::tensor_ops::Pool2DKind::Max, self.kernel_size, diff --git a/dfdx/src/nn/layers/pool_2d_min.rs b/dfdx/src/nn/layers/pool_2d_min.rs index af0726a80..18a194e63 100644 --- a/dfdx/src/nn/layers/pool_2d_min.rs +++ b/dfdx/src/nn/layers/pool_2d_min.rs @@ -32,9 +32,7 @@ impl> Module for MinPool2D { type Output = Img::Pooled; - type Error = Img::Error; - - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { x.try_pool2d( crate::tensor_ops::Pool2DKind::Min, self.kernel_size, diff --git a/dfdx/src/nn/layers/pool_global_avg.rs b/dfdx/src/nn/layers/pool_global_avg.rs index 599e55f91..cad457535 100644 --- a/dfdx/src/nn/layers/pool_global_avg.rs +++ b/dfdx/src/nn/layers/pool_global_avg.rs @@ -23,9 +23,10 @@ impl, T: Tape> Module> for AvgPoolGlobal { type Output = Tensor<(C,), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, input: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(C, H, W), E, D, T>, + ) -> Result { input.try_mean() } } @@ -34,9 +35,10 @@ impl, T: Tape> Module> for AvgPoolGlobal { type Output = Tensor<(B, C), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, input: Tensor<(B, C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(B, C, H, W), E, D, T>, + ) -> Result { input.try_mean() } } diff --git a/dfdx/src/nn/layers/pool_global_max.rs b/dfdx/src/nn/layers/pool_global_max.rs index e0eea18b6..1bf6de3d4 100644 --- a/dfdx/src/nn/layers/pool_global_max.rs +++ b/dfdx/src/nn/layers/pool_global_max.rs @@ -23,9 +23,10 @@ impl, T: Tape> Module> for MaxPoolGlobal { type Output = Tensor<(C,), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, input: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(C, H, W), E, D, T>, + ) -> Result { input.try_max() } } @@ -34,9 +35,10 @@ impl, T: Tape> Module> for MaxPoolGlobal { type Output = Tensor<(B, C), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, input: Tensor<(B, C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(B, C, H, W), E, D, T>, + ) -> Result { input.try_max() } } diff --git a/dfdx/src/nn/layers/pool_global_min.rs b/dfdx/src/nn/layers/pool_global_min.rs index 71f80ced8..e002dd554 100644 --- a/dfdx/src/nn/layers/pool_global_min.rs +++ b/dfdx/src/nn/layers/pool_global_min.rs @@ -23,9 +23,10 @@ impl, T: Tape> Module> for MinPoolGlobal { type Output = Tensor<(C,), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, input: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(C, H, W), E, D, T>, + ) -> Result { input.try_min() } } @@ -34,9 +35,10 @@ impl, T: Tape> Module> for MinPoolGlobal { type Output = Tensor<(B, C), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, input: Tensor<(B, C, H, W), E, D, T>) -> Result { + fn try_forward( + &self, + input: Tensor<(B, C, H, W), E, D, T>, + ) -> Result { input.try_min() } } diff --git a/dfdx/src/nn/layers/prelu.rs b/dfdx/src/nn/layers/prelu.rs index ca5eab481..6af121519 100644 --- a/dfdx/src/nn/layers/prelu.rs +++ b/dfdx/src/nn/layers/prelu.rs @@ -12,7 +12,7 @@ impl Default for PReLUConfig { impl> BuildOnDevice for PReLUConfig { type Built = PReLU; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let a = 
device.try_tensor(E::from_f64(self.0).unwrap())?; Ok(PReLU { a }) } @@ -28,16 +28,14 @@ pub struct PReLU> { impl> ResetParams for PReLU { /// Does nothing. - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { Ok(()) } } impl, T: Tape> Module> for PReLU { type Output = Tensor; - type Error = D::Err; - - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { let a = self.a.retaped::().broadcast_like(&x); x.try_prelu(a) } diff --git a/dfdx/src/nn/layers/prelu1d.rs b/dfdx/src/nn/layers/prelu1d.rs index e4c362523..b1a405c11 100644 --- a/dfdx/src/nn/layers/prelu1d.rs +++ b/dfdx/src/nn/layers/prelu1d.rs @@ -18,7 +18,7 @@ impl Default for PReLU1DConfig { impl> BuildOnDevice for PReLU1DConfig { type Built = PReLU1D; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let a = device.try_ones_like(&(self.c,))?.try_mul(self.a)?; Ok(PReLU1D { a }) } @@ -34,7 +34,7 @@ pub struct PReLU1D> { impl> ResetParams for PReLU1D { /// Does nothing. - fn try_reset_params(&mut self) -> Result<(), D::Err> { + fn try_reset_params(&mut self) -> Result<(), crate::tensor::Error> { Ok(()) } } @@ -43,9 +43,7 @@ impl, T: Tape> Module for PReLU1D { type Output = Tensor<(C,), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, x: Tensor<(C,), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(C,), E, D, T>) -> Result { x.try_prelu(self.a.clone()) } } @@ -54,9 +52,7 @@ impl, T: Tape> Module { type Output = Tensor<(B, C), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, x: Tensor<(B, C), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C), E, D, T>) -> Result { let a = self.a.retaped::().broadcast_like(&x); x.try_prelu(a) } @@ -66,9 +62,7 @@ impl, T: Tape> Module> for PReLU1D { type Output = Tensor<(B, C, H), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, x: Tensor<(B, C, H), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C, H), E, D, T>) -> Result { let a = self.a.retaped::().broadcast_like(&x); x.try_prelu(a) } @@ -78,9 +72,7 @@ impl, T: Tape> Module> for PReLU1D { type Output = Tensor<(B, C, H, W), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, x: Tensor<(B, C, H, W), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C, H, W), E, D, T>) -> Result { let a = self.a.retaped::().broadcast_like(&x); x.try_prelu(a) } diff --git a/dfdx/src/nn/layers/relu.rs b/dfdx/src/nn/layers/relu.rs index fee9cded8..d753ff7a9 100644 --- a/dfdx/src/nn/layers/relu.rs +++ b/dfdx/src/nn/layers/relu.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct ReLU; impl, T: Tape> Module> for ReLU { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_relu() } } diff --git a/dfdx/src/nn/layers/reshape.rs b/dfdx/src/nn/layers/reshape.rs index 83be2344a..83d977bab 100644 --- a/dfdx/src/nn/layers/reshape.rs +++ b/dfdx/src/nn/layers/reshape.rs @@ -18,8 +18,7 @@ impl, T: Tape> Module { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_reshape_like(&self.0) } } diff --git a/dfdx/src/nn/layers/residual_add.rs b/dfdx/src/nn/layers/residual_add.rs index d9c707167..5da249a9f 100644 --- a/dfdx/src/nn/layers/residual_add.rs +++ b/dfdx/src/nn/layers/residual_add.rs @@ -27,10 +27,9 @@ pub struct ResidualAdd( pub T, ); -// TODO derive 
this impl, T: BuildOnDevice> BuildOnDevice for ResidualAdd { type Built = ResidualAdd; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let t = self.0.try_build_on_device(device)?; Ok(ResidualAdd(t)) } @@ -38,15 +37,14 @@ impl, T: BuildOnDevice> BuildOnDevice for Res impl> Module for ResidualAdd where - T::Output: TryAdd, + T::Output: TryAdd, { type Output = >::Output; - type Error = T::Error; - fn try_forward(&self, x: X) -> Result { + fn try_forward(&self, x: X) -> Result { let t = self.0.try_forward(x.with_empty_tape())?; t.try_add(x) } - fn try_forward_mut(&mut self, x: X) -> Result { + fn try_forward_mut(&mut self, x: X) -> Result { let t = self.0.try_forward_mut(x.with_empty_tape())?; t.try_add(x) } diff --git a/dfdx/src/nn/layers/residual_mul.rs b/dfdx/src/nn/layers/residual_mul.rs index 428c1c214..c55787ffa 100644 --- a/dfdx/src/nn/layers/residual_mul.rs +++ b/dfdx/src/nn/layers/residual_mul.rs @@ -26,10 +26,9 @@ pub struct ResidualMul( pub T, ); -// TODO derive this impl, T: BuildOnDevice> BuildOnDevice for ResidualMul { type Built = ResidualMul; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let t = self.0.try_build_on_device(device)?; Ok(ResidualMul(t)) } @@ -37,15 +36,14 @@ impl, T: BuildOnDevice> BuildOnDevice for Res impl> Module for ResidualMul where - T::Output: TryMul, + T::Output: TryMul, { type Output = >::Output; - type Error = T::Error; - fn try_forward(&self, x: X) -> Result { + fn try_forward(&self, x: X) -> Result { let t = self.0.try_forward(x.with_empty_tape())?; t.try_mul(x) } - fn try_forward_mut(&mut self, x: X) -> Result { + fn try_forward_mut(&mut self, x: X) -> Result { let t = self.0.try_forward_mut(x.with_empty_tape())?; t.try_mul(x) } diff --git a/dfdx/src/nn/layers/sigmoid.rs b/dfdx/src/nn/layers/sigmoid.rs index 56b09d74a..3eef17462 100644 --- a/dfdx/src/nn/layers/sigmoid.rs +++ b/dfdx/src/nn/layers/sigmoid.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Sigmoid; impl, T: Tape> Module> for Sigmoid { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_sigmoid() } } diff --git a/dfdx/src/nn/layers/sin.rs b/dfdx/src/nn/layers/sin.rs index 57074fd1c..eae38f486 100644 --- a/dfdx/src/nn/layers/sin.rs +++ b/dfdx/src/nn/layers/sin.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Sin; impl, T: Tape> Module> for Sin { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_sin() } } diff --git a/dfdx/src/nn/layers/softmax.rs b/dfdx/src/nn/layers/softmax.rs index 18a28544a..145a054b2 100644 --- a/dfdx/src/nn/layers/softmax.rs +++ b/dfdx/src/nn/layers/softmax.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Softmax; impl, T: Tape> Module> for Softmax { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_softmax() } } diff --git a/dfdx/src/nn/layers/split_into.rs b/dfdx/src/nn/layers/split_into.rs index 1e66d68fe..de905a0df 100644 --- a/dfdx/src/nn/layers/split_into.rs +++ b/dfdx/src/nn/layers/split_into.rs @@ -33,7 +33,7 @@ pub struct SplitInto( impl, T: BuildOnDevice> BuildOnDevice for SplitInto { type Built = SplitInto; - fn try_build_on_device(&self, device: &D) -> Result::Err> { + fn try_build_on_device(&self, device: &D) -> Result { let t = 
self.0.try_build_on_device(device)?; Ok(SplitInto(t)) } @@ -44,29 +44,26 @@ macro_rules! tuple_impls { impl< Input: WithEmptyTape, $head: Module, - $($tails : Module,)+ + $($tails : Module,)+ > Module for SplitInto<($head, $($tails,)+)> { type Output = ( $head::Output, $($tails::Output),+ ); - type Error = $head::Error; #[allow(non_snake_case)] - fn try_forward(&self, x: Input) -> Result { + fn try_forward(&self, x: Input) -> Result { let ($head, $($tails,)+) = &self.0; let ($($tails,)+) = ($($tails.try_forward(x.with_empty_tape())?,)+); let $head = $head.try_forward(x)?; - Ok(($head, $($tails,)+)) } #[allow(non_snake_case)] - fn try_forward_mut(&mut self, x: Input) -> Result { + fn try_forward_mut(&mut self, x: Input) -> Result { let ($head, $($tails,)+) = &mut self.0; let ($($tails,)+) = ($($tails.try_forward_mut(x.with_empty_tape())?,)+); let $head = $head.try_forward_mut(x)?; - Ok(($head, $($tails,)+)) } } diff --git a/dfdx/src/nn/layers/sqrt.rs b/dfdx/src/nn/layers/sqrt.rs index cb9246bf8..4621b7092 100644 --- a/dfdx/src/nn/layers/sqrt.rs +++ b/dfdx/src/nn/layers/sqrt.rs @@ -6,8 +6,7 @@ pub struct Sqrt; impl, T: Tape> Module> for Sqrt { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_sqrt() } } diff --git a/dfdx/src/nn/layers/square.rs b/dfdx/src/nn/layers/square.rs index b9e030d39..5b2ac5d51 100644 --- a/dfdx/src/nn/layers/square.rs +++ b/dfdx/src/nn/layers/square.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Square; impl, T: Tape> Module> for Square { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_square() } } diff --git a/dfdx/src/nn/layers/tanh.rs b/dfdx/src/nn/layers/tanh.rs index bba87199e..6507f5807 100644 --- a/dfdx/src/nn/layers/tanh.rs +++ b/dfdx/src/nn/layers/tanh.rs @@ -5,8 +5,7 @@ use crate::prelude::*; pub struct Tanh; impl, T: Tape> Module> for Tanh { type Output = Tensor; - type Error = D::Err; - fn try_forward(&self, x: Tensor) -> Result { + fn try_forward(&self, x: Tensor) -> Result { x.try_tanh() } } diff --git a/dfdx/src/nn/layers/transformer.rs b/dfdx/src/nn/layers/transformer.rs index e4a985a12..fa7ab76a3 100644 --- a/dfdx/src/nn/layers/transformer.rs +++ b/dfdx/src/nn/layers/transformer.rs @@ -100,17 +100,15 @@ impl DecoderBlockConfig { impl, Tgt, Mem> Module<(Tgt, Mem)> for DecoderBlock where - Tgt: WithEmptyTape + SplitTape + TryAdd + HasErr, + Tgt: WithEmptyTape + SplitTape + TryAdd, Mem: Clone, - ResidualAdd>: Module, - MultiHeadAttention: Module<(Tgt, Mem, Mem), Output = Tgt, Error = D::Err>, - LayerNorm1D: Module, - ResidualAdd>: Module, + ResidualAdd>: Module, + MultiHeadAttention: Module<(Tgt, Mem, Mem), Output = Tgt>, + LayerNorm1D: Module, + ResidualAdd>: Module, { type Output = Tgt; - type Error = D::Err; - - fn try_forward(&self, (tgt, mem): (Tgt, Mem)) -> Result { + fn try_forward(&self, (tgt, mem): (Tgt, Mem)) -> Result { let x = self.self_attn.try_forward(tgt)?; let x = self.norm1.try_forward(x)?; @@ -179,17 +177,14 @@ impl TransformerConfig { impl, Src: SplitTape, Tgt: PutTape> Module<(Src, Tgt)> for Transformer where - Vec>: Module, + Vec>: Module, DecoderBlock: Module< (>::Output, Src::NoTape), Output = >::Output, - Error = D::Err, >, { type Output = >::Output; - type Error = D::Err; - - fn try_forward(&self, (src, tgt): (Src, Tgt)) -> Result { + fn try_forward(&self, (src, tgt): (Src, Tgt)) -> Result { let (mem, tape) = 
self.encoder.try_forward(src)?.split_tape(); let mut tgt = tgt.put_tape(tape); for block in self.decoder.iter() { diff --git a/dfdx/src/nn/layers/upscale2d.rs b/dfdx/src/nn/layers/upscale2d.rs index d8bdd31d7..33f28cd64 100644 --- a/dfdx/src/nn/layers/upscale2d.rs +++ b/dfdx/src/nn/layers/upscale2d.rs @@ -18,9 +18,7 @@ impl> Module for Upscale2D { type Output = Img::Output; - type Error = Img::Err; - - fn try_forward(&self, x: Img) -> Result { + fn try_forward(&self, x: Img) -> Result { x.generic_upscale2d_like(self.method, self.out_height, self.out_width) } } @@ -45,9 +43,7 @@ where D: Device + Upscale2DKernel, { type Output = Tensor<(C, H::Output, W::Output), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, x: Tensor<(C, H, W), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(C, H, W), E, D, T>) -> Result { let (_c, h, w) = *x.shape(); let h = h * self.height_factor; let w = w * self.width_factor; @@ -66,9 +62,7 @@ where T: 'static + Tape, { type Output = Tensor<(B, C, H::Output, W::Output), E, D, T>; - type Error = D::Err; - - fn try_forward(&self, x: Tensor<(B, C, H, W), E, D, T>) -> Result { + fn try_forward(&self, x: Tensor<(B, C, H, W), E, D, T>) -> Result { let (_b, _c, h, w) = *x.shape(); let h = h * self.height_factor; let w = w * self.width_factor; diff --git a/dfdx/src/nn/optim/adam.rs b/dfdx/src/nn/optim/adam.rs index 8fb4cc383..ff1d86b5f 100644 --- a/dfdx/src/nn/optim/adam.rs +++ b/dfdx/src/nn/optim/adam.rs @@ -2,7 +2,7 @@ use std::marker::PhantomData; use crate::{ shapes::{Dtype, Shape}, - tensor::{Gradients, Storage, Tensor, Tensorlike, UniqueId}, + tensor::{Error, Gradients, Storage, Tensor, Tensorlike, UniqueId}, tensor_ops::{AdamConfig, Device}, }; @@ -55,7 +55,7 @@ impl> crate::nn::Optimizer for Adam t: &mut Tensor, gradients: &Gradients, missing_params: &mut Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { let g = gradients.get_ref_checked(t); match g { None => missing_params.push(t.id()), @@ -68,11 +68,7 @@ impl> crate::nn::Optimizer for Adam Ok(()) } - fn update( - &mut self, - module: &mut M, - gradients: &Gradients, - ) -> Result<(), crate::nn::OptimizerUpdateError<::Err>> + fn update(&mut self, module: &mut M, gradients: &Gradients) -> Result<(), Error> where M: crate::nn::UpdateParams, { @@ -80,15 +76,11 @@ impl> crate::nn::Optimizer for Adam // NOTE: the rest of this is identical to default implementation of update. 
let mut missing_tensors = Vec::new(); - module - .try_update_params(self, gradients, &mut missing_tensors) - .map_err(crate::nn::OptimizerUpdateError::DeviceError)?; + module.try_update_params(self, gradients, &mut missing_tensors)?; if missing_tensors.is_empty() { Ok(()) } else { - Err(crate::nn::OptimizerUpdateError::UnusedTensors( - missing_tensors, - )) + Err(Error::UnusedTensors(missing_tensors)) } } } diff --git a/dfdx/src/nn/optim/mod.rs b/dfdx/src/nn/optim/mod.rs index 790b6fceb..e80e549ee 100644 --- a/dfdx/src/nn/optim/mod.rs +++ b/dfdx/src/nn/optim/mod.rs @@ -39,5 +39,5 @@ pub use adam::Adam; pub use rmsprop::RMSprop; pub use sgd::Sgd; // re-exports -pub use super::{Optimizer, OptimizerUpdateError}; +pub use super::Optimizer; pub use crate::tensor_ops::{AdamConfig, Momentum, RMSpropConfig, SgdConfig, WeightDecay}; diff --git a/dfdx/src/nn/optim/rmsprop.rs b/dfdx/src/nn/optim/rmsprop.rs index af7d6493b..92db198a3 100644 --- a/dfdx/src/nn/optim/rmsprop.rs +++ b/dfdx/src/nn/optim/rmsprop.rs @@ -2,7 +2,7 @@ use std::marker::PhantomData; use crate::{ shapes::{Dtype, Shape}, - tensor::{Gradients, Storage, Tensor, Tensorlike, UniqueId}, + tensor::{Error, Gradients, Storage, Tensor, Tensorlike, UniqueId}, tensor_ops::{Device, RMSpropConfig}, }; @@ -66,7 +66,7 @@ impl> crate::nn::Optimizer for RMSprop, gradients: &Gradients, missing_params: &mut Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), Error> { let g = gradients.get_ref_checked(t); match g { None => missing_params.push(t.id()), @@ -85,25 +85,17 @@ impl> crate::nn::Optimizer for RMSprop, - ) -> Result<(), crate::nn::OptimizerUpdateError<::Err>> + fn update(&mut self, module: &mut M, gradients: &Gradients) -> Result<(), Error> where M: crate::nn::UpdateParams, { // NOTE: the rest of this is identical to default implementation of update. let mut missing_tensors = Vec::new(); - module - .try_update_params(self, gradients, &mut missing_tensors) - .map_err(crate::nn::OptimizerUpdateError::DeviceError)?; + module.try_update_params(self, gradients, &mut missing_tensors)?; let r = if missing_tensors.is_empty() { Ok(()) } else { - Err(crate::nn::OptimizerUpdateError::UnusedTensors( - missing_tensors, - )) + Err(Error::UnusedTensors(missing_tensors)) }; self.step += 1; r diff --git a/dfdx/src/nn/optim/sgd.rs b/dfdx/src/nn/optim/sgd.rs index 33d00c7a0..9cad6e0e9 100644 --- a/dfdx/src/nn/optim/sgd.rs +++ b/dfdx/src/nn/optim/sgd.rs @@ -51,7 +51,7 @@ impl> crate::nn::Optimizer for Sgd { t: &mut Tensor, gradients: &Gradients, missing_params: &mut Vec, - ) -> Result<(), D::Err> { + ) -> Result<(), crate::tensor::Error> { let g = gradients.get_ref_checked(t); match g { None => missing_params.push(t.id()),
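
The optimizer hunks close the loop on the same unification: `Optimizer::update` and `try_update_params` now return `Result<(), Error>`, and the old `OptimizerUpdateError::UnusedTensors` wrapper becomes the `Error::UnusedTensors(..)` variant constructed above. A rough sketch of how a call site can react to that variant under the new API; the helper name and the `eprintln!` reporting are illustrative assumptions, only the variant itself is taken from the hunks above.

use dfdx::tensor::Error;

// Hypothetical helper around an optimizer update: unused-gradient reporting is
// now just another variant of the single error enum.
fn report_update(res: Result<(), Error>) -> Result<(), Error> {
    match res {
        Ok(()) => Ok(()),
        // Formerly Err(OptimizerUpdateError::UnusedTensors(ids)).
        Err(Error::UnusedTensors(ids)) => {
            eprintln!("{} tensors never received gradients", ids.len());
            Ok(())
        }
        // Device, allocation, and cublas/cudnn failures arrive through the same enum.
        Err(e) => Err(e),
    }
}

In a loop shaped like the advanced-train-loop example this removes the old two-error plumbing: the whole train step can stay `-> Result<(), Error>` and use `?` on `try_forward_mut`, `try_backward`, and `update` alike.
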