Commit 815a19e

Adding documentation to more classes and removing some duplicate classes that I had created under different names.

1 parent: 78c895b

136 files changed: +14584 −442 lines


src/ActivationFunctions/ActivationFunctionBase.cs (+86)
@@ -1,31 +1,112 @@
namespace AiDotNet.ActivationFunctions;

/// <summary>
/// Base class for all activation functions used in neural networks.
/// </summary>
/// <typeparam name="T">The numeric type used for calculations (e.g., float, double).</typeparam>
/// <remarks>
/// <para>
/// For Beginners: Activation functions are mathematical operations that determine the output
/// of a neural network node. They introduce non-linearity into the network, allowing it to
/// learn complex patterns. Think of them as decision-makers that determine how strongly a
/// neuron "fires" based on its inputs.
///
/// Common activation functions include:
/// - Sigmoid: Outputs values between 0 and 1 (like probabilities)
/// - ReLU: Returns 0 for negative inputs, or the input value for positive inputs
/// - Tanh: Similar to sigmoid but outputs values between -1 and 1
///
/// The "derivative" methods are used during training to determine how to adjust the network's
/// weights to improve its accuracy.
/// </para>
/// </remarks>
public abstract class ActivationFunctionBase<T> : IActivationFunction<T>, IVectorActivationFunction<T>
{
    /// <summary>
    /// Provides mathematical operations for the numeric type T.
    /// </summary>
    protected static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();

    /// <summary>
    /// Determines if the activation function supports operations on individual scalar values.
    /// </summary>
    /// <returns>True if scalar operations are supported; otherwise, false.</returns>
    protected abstract bool SupportsScalarOperations();

    /// <summary>
    /// Applies the activation function to a single input value.
    /// </summary>
    /// <param name="input">The input value.</param>
    /// <returns>The activated output value.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: This method transforms a single number using the activation function.
    /// The default implementation is the identity function (returns the input unchanged).
    /// Derived classes will override this with specific activation functions like sigmoid or ReLU.
    /// </para>
    /// </remarks>
    public virtual T Activate(T input)
    {
        return input; // Default to identity function
    }

    /// <summary>
    /// Calculates the derivative of the activation function for a single input value.
    /// </summary>
    /// <param name="input">The input value.</param>
    /// <returns>The derivative value at the input point.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: The derivative measures how much the activation function's output changes
    /// when its input changes slightly. This is essential for training neural networks through
    /// backpropagation. The default implementation returns 1, meaning the output changes at the
    /// same rate as the input.
    /// </para>
    /// </remarks>
    public virtual T Derivative(T input)
    {
        return NumOps.One; // Default to constant derivative of 1
    }

    /// <summary>
    /// Applies the activation function to each element in a vector.
    /// </summary>
    /// <param name="input">The input vector.</param>
    /// <returns>A new vector with the activation function applied to each element.</returns>
    public virtual Vector<T> Activate(Vector<T> input)
    {
        return input.Transform(Activate);
    }

    /// <summary>
    /// Calculates the derivative matrix for a vector input.
    /// </summary>
    /// <param name="input">The input vector.</param>
    /// <returns>A diagonal matrix containing derivatives for each input element.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: This creates a special matrix where the diagonal contains the derivatives
    /// for each input value. This matrix is used during backpropagation to efficiently calculate
    /// how errors propagate through the network.
    /// </para>
    /// </remarks>
    public virtual Matrix<T> Derivative(Vector<T> input)
    {
        return Matrix<T>.CreateDiagonal(input.Transform(Derivative));
    }

    /// <summary>
    /// Applies the activation function to each element in a tensor.
    /// </summary>
    /// <param name="input">The input tensor.</param>
    /// <returns>A new tensor with the activation function applied to each element.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: A tensor is a multi-dimensional array that can represent complex data
    /// structures like images (3D tensors) or video (4D tensors). This method applies the
    /// activation function to every single value in the tensor.
    /// </para>
    /// </remarks>
    public virtual Tensor<T> Activate(Tensor<T> input)
    {
        Tensor<T> output = new Tensor<T>(input.Shape);

@@ -37,6 +118,11 @@ public virtual Tensor<T> Activate(Tensor<T> input)
        return output;
    }

    /// <summary>
    /// Calculates the derivative for each element in a tensor.
    /// </summary>
    /// <param name="input">The input tensor.</param>
    /// <returns>A new tensor containing derivatives for each input element.</returns>
    public virtual Tensor<T> Derivative(Tensor<T> input)
    {
        Tensor<T> output = new Tensor<T>(input.Shape);
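The vector and matrix overloads above simply lift the scalar Activate and Derivative element-wise. As a rough, standalone illustration of that behaviour (using plain double[] and a made-up DiagonalJacobianSketch class instead of the library's Vector<T>/Matrix<T> types), a minimal sketch:

using System;

class DiagonalJacobianSketch
{
    // Stand-ins for the default scalar overloads: identity activation, constant derivative of 1.
    static double Activate(double x) => x;
    static double Derivative(double x) => 1.0;

    static void Main()
    {
        double[] input = { -2.0, 0.5, 3.0 };

        // Element-wise activation, mirroring input.Transform(Activate).
        double[] activated = Array.ConvertAll(input, Activate);

        // Diagonal derivative matrix, mirroring Matrix<T>.CreateDiagonal(input.Transform(Derivative)).
        int n = input.Length;
        double[,] jacobian = new double[n, n];
        for (int i = 0; i < n; i++)
            jacobian[i, i] = Derivative(input[i]);

        Console.WriteLine(string.Join(", ", activated)); // -2, 0.5, 3 (identity)
        Console.WriteLine($"Jacobian diagonal: {jacobian[0, 0]}, {jacobian[1, 1]}, {jacobian[2, 2]}"); // 1, 1, 1
    }
}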

src/ActivationFunctions/BentIdentityActivation.cs (+56 −1)
@@ -1,9 +1,47 @@
namespace AiDotNet.ActivationFunctions;

/// <summary>
/// Implements the Bent Identity activation function for neural networks.
/// </summary>
/// <typeparam name="T">The numeric type used for calculations (e.g., float, double).</typeparam>
/// <remarks>
/// <para>
/// For Beginners: The Bent Identity activation function is a smoother alternative to the ReLU function.
/// It behaves similarly to a linear function for positive inputs but has a gentle curve for negative inputs.
/// This helps prevent the "dying neuron" problem that can occur with ReLU, where neurons can get stuck
/// outputting zero.
///
/// The mathematical formula is: f(x) = ((√(x² + 1) - 1) / 2) + x
///
/// Key properties:
/// - Always produces a non-zero gradient, helping with training
/// - Approximates linear behavior for large positive values
/// - Provides a smooth transition around zero
/// - Has no upper or lower bounds (unlike sigmoid or tanh)
/// </para>
/// </remarks>
public class BentIdentityActivation<T> : ActivationFunctionBase<T>
{
    /// <summary>
    /// Indicates that this activation function supports operations on individual scalar values.
    /// </summary>
    /// <returns>Always returns true as Bent Identity can be applied to scalar values.</returns>
    protected override bool SupportsScalarOperations() => true;

    /// <summary>
    /// Applies the Bent Identity activation function to a single input value.
    /// </summary>
    /// <param name="input">The input value.</param>
    /// <returns>The activated output value using the Bent Identity function.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: This method transforms an input value using the formula:
    /// f(x) = ((√(x² + 1) - 1) / 2) + x
    ///
    /// The function adds a non-linear component to the identity function (x),
    /// making it bend slightly while maintaining good gradient properties.
    /// </para>
    /// </remarks>
    public override T Activate(T input)
    {
        // f(x) = (sqrt(x^2 + 1) - 1) / 2 + x
@@ -14,6 +52,23 @@ public override T Activate(T input)
        return NumOps.Add(firstTerm, input);
    }

    /// <summary>
    /// Calculates the derivative of the Bent Identity function for a single input value.
    /// </summary>
    /// <param name="input">The input value.</param>
    /// <returns>The derivative value at the input point.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: The derivative measures how much the Bent Identity function's output changes
    /// when its input changes slightly. This is used during neural network training to determine
    /// how to adjust weights.
    ///
    /// The derivative formula is: f'(x) = x / (2 * √(x² + 1)) + 1
    ///
    /// An important property is that this derivative is always positive (it stays between 0.5 and 1.5),
    /// which helps prevent the vanishing gradient problem during training.
    /// </para>
    /// </remarks>
    public override T Derivative(T input)
    {
        // f'(x) = x / (2 * sqrt(x^2 + 1)) + 1
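As a quick sanity check of the two formulas documented above, here is a minimal standalone sketch (plain double and System.Math rather than the generic T/NumOps plumbing; the class name is made up) that also compares the analytic derivative against a central finite difference:

using System;

class BentIdentitySketch
{
    // f(x) = ((sqrt(x^2 + 1) - 1) / 2) + x
    static double Activate(double x) => (Math.Sqrt(x * x + 1) - 1) / 2 + x;

    // f'(x) = x / (2 * sqrt(x^2 + 1)) + 1
    static double Derivative(double x) => x / (2 * Math.Sqrt(x * x + 1)) + 1;

    static void Main()
    {
        foreach (double x in new[] { -3.0, -0.5, 0.0, 2.0 })
        {
            double h = 1e-6;
            // Central difference approximation of the derivative.
            double numeric = (Activate(x + h) - Activate(x - h)) / (2 * h);
            Console.WriteLine($"x={x,5}: f(x)={Activate(x):F4}  f'(x)={Derivative(x):F4}  numeric≈{numeric:F4}");
        }
        // On these samples the derivative stays inside (0.5, 1.5), never reaching zero.
    }
}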

src/ActivationFunctions/BipolarSigmoidActivation.cs (−34)

This file was deleted.

src/ActivationFunctions/CELUActivation.cs (+81)
@@ -1,16 +1,78 @@
namespace AiDotNet.ActivationFunctions;

/// <summary>
/// Implements the Continuously Differentiable Exponential Linear Unit (CELU) activation function for neural networks.
/// </summary>
/// <typeparam name="T">The numeric type used for calculations (e.g., float, double).</typeparam>
/// <remarks>
/// <para>
/// For Beginners: The CELU activation function is an improved version of the popular ReLU function.
/// While ReLU simply turns negative values to zero (which can cause "dead neurons"), CELU replaces
/// negative values with a smooth exponential curve that approaches a negative limit.
///
/// Key benefits of CELU:
/// - For positive inputs, it behaves exactly like ReLU (returns the input value)
/// - For negative inputs, it returns a negative value that smoothly approaches -α
/// - This smooth transition helps prevent "dead neurons" during training
/// - The α parameter controls how quickly the function approaches its negative limit
///
/// CELU is particularly useful in deep neural networks where maintaining gradient flow
/// through all neurons is important for effective learning.
/// </para>
/// </remarks>
public class CELUActivation<T> : ActivationFunctionBase<T>
{
    /// <summary>
    /// The alpha parameter that controls the negative saturation value of the function.
    /// </summary>
    private readonly T _alpha;

    /// <summary>
    /// Initializes a new instance of the CELUActivation class with the specified alpha parameter.
    /// </summary>
    /// <param name="alpha">The alpha parameter that controls the negative saturation value. Default is 1.0.</param>
    /// <remarks>
    /// <para>
    /// For Beginners: The alpha parameter determines how steeply the function curves for negative inputs
    /// and what negative value it will approach as inputs become more negative.
    ///
    /// - A larger alpha (e.g., 2.0) means the function can reach more negative values
    /// - A smaller alpha (e.g., 0.5) limits the function to less negative values
    ///
    /// The default value of 1.0 works well for most applications, but you might adjust it if:
    /// - Your network is learning too slowly (try increasing alpha)
    /// - Your network is becoming unstable during training (try decreasing alpha)
    /// </para>
    /// </remarks>
    public CELUActivation(double alpha = 1.0)
    {
        _alpha = NumOps.FromDouble(alpha);
    }

    /// <summary>
    /// Indicates that this activation function supports operations on individual scalar values.
    /// </summary>
    /// <returns>Always returns true as CELU can be applied to scalar values.</returns>
    protected override bool SupportsScalarOperations() => true;

    /// <summary>
    /// Applies the CELU activation function to a single input value.
    /// </summary>
    /// <param name="input">The input value.</param>
    /// <returns>The activated output value using the CELU function.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: This method transforms an input value using the formula:
    /// f(x) = max(0, x) + min(0, α * (exp(x/α) - 1))
    ///
    /// In simpler terms:
    /// - For positive inputs (x ≥ 0): the output is just x (like ReLU)
    /// - For negative inputs (x &lt; 0): the output follows a smooth curve that approaches -α
    ///
    /// This combination gives CELU the benefits of ReLU for positive values while avoiding
    /// the "dead neuron" problem for negative values.
    /// </para>
    /// </remarks>
    public override T Activate(T input)
    {
        // CELU: max(0, x) + min(0, α * (exp(x/α) - 1))
@@ -23,6 +85,25 @@ public override T Activate(T input)
        );
    }

    /// <summary>
    /// Calculates the derivative of the CELU function for a single input value.
    /// </summary>
    /// <param name="input">The input value.</param>
    /// <returns>The derivative value at the input point.</returns>
    /// <remarks>
    /// <para>
    /// For Beginners: The derivative measures how much the CELU function's output changes
    /// when its input changes slightly. This is used during neural network training to determine
    /// how to adjust weights.
    ///
    /// The derivative of CELU has these properties:
    /// - For positive inputs (x ≥ 0): the derivative is 1 (constant slope)
    /// - For negative inputs (x &lt; 0): the derivative is exp(x/α) (gradually decreasing)
    ///
    /// Unlike ReLU, the derivative is never exactly zero, which helps prevent neurons from
    /// becoming completely inactive ("dead") during training.
    /// </para>
    /// </remarks>
    public override T Derivative(T input)
    {
        // Derivative of CELU:
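The CELU formula and its piecewise derivative documented above can be sketched in a few lines of plain C# (doubles only, with α passed per call rather than stored in the class's _alpha field; the class name is hypothetical):

using System;

class CELUSketch
{
    // f(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1))
    static double Activate(double x, double alpha = 1.0) =>
        Math.Max(0.0, x) + Math.Min(0.0, alpha * (Math.Exp(x / alpha) - 1));

    // f'(x) = 1 for x >= 0, exp(x / alpha) for x < 0
    static double Derivative(double x, double alpha = 1.0) =>
        x >= 0 ? 1.0 : Math.Exp(x / alpha);

    static void Main()
    {
        foreach (double x in new[] { -5.0, -1.0, 0.0, 2.0 })
            Console.WriteLine($"x={x,5}: CELU(x)={Activate(x):F4}  CELU'(x)={Derivative(x):F4}");
        // Large negative inputs approach -alpha (-1.0 here) instead of being clipped to 0 as in ReLU,
        // and the derivative stays positive everywhere.
    }
}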

0 commit comments
