From ea7ca1401a976fbed7e735c515e0511fdbb92d81 Mon Sep 17 00:00:00 2001
From: tostenzel
Date: Fri, 5 Jan 2024 04:44:04 +0100
Subject: [PATCH] Write or improve docstrings and comments

---
 applications/learn_mnist.py                |  2 +
 .../tensor_broadcasted_binary_mlops.py     | 49 +++++++++++++
 edugrad/_tensor/tensor_combine_segment.py  | 73 +++++++++++++++++--
 edugrad/dtypes.py                          |  6 ++
 edugrad/helpers.py                         |  2 +
 edugrad/tensor.py                          | 33 +++++++--
 tests/gradcheck.py                         | 56 ++++++++++++--
 7 files changed, 201 insertions(+), 20 deletions(-)

diff --git a/applications/learn_mnist.py b/applications/learn_mnist.py
index ce6692b..d87a9a2 100755
--- a/applications/learn_mnist.py
+++ b/applications/learn_mnist.py
@@ -1,3 +1,5 @@
+"""Train a classifier to recognize hand-written digits on gray-scale images and evaluate the results."""
+
 import os
 import gzip
 
diff --git a/edugrad/_tensor/tensor_broadcasted_binary_mlops.py b/edugrad/_tensor/tensor_broadcasted_binary_mlops.py
index 40a8cde..be8a83f 100644
--- a/edugrad/_tensor/tensor_broadcasted_binary_mlops.py
+++ b/edugrad/_tensor/tensor_broadcasted_binary_mlops.py
@@ -1,3 +1,8 @@
+"""
+This module implements broadcasted binary operations for Tensors, providing
+element-wise arithmetic operations that support broadcasting for tensors of different shapes.
+
+"""
 from __future__ import annotations
 
 import math
@@ -10,27 +15,51 @@
 def _broadcasted(tensor: Tensor, y: Tensor | float, reverse: bool = False) -> tuple[Tensor, Tensor]:
+    """Prepares two tensors for broadcasting to a common shape.
+
+    Args:
+        tensor (Tensor): The first tensor.
+        y (Tensor | float): The second tensor or a scalar value.
+        reverse (bool): If True, swaps the tensors before broadcasting.
+
+    Returns:
+        tuple[Tensor, Tensor]: A tuple of two tensors broadcasted to a common shape.
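+
+    Example (illustrative sketch; the shapes follow the rules implemented below):
+        a = Tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])  # shape (2, 3)
+        b = Tensor([10.0, 20.0, 30.0])                  # shape (3,)
+        x, y = a._broadcasted(b)
+        # b is reshaped to (1, 3) and then expanded, so x.shape == y.shape == (2, 3)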
+    """
     from edugrad.tensor import Tensor
 
     x: Tensor = tensor
+    # If y is not a tensor, convert it to a tensor with the same dtype as the input tensor.
+    # If the input tensor is empty, return a tensor full of the scalar value y.
     if not isinstance(y, Tensor):
         if 0 in x.shape:
             return x, x.full_like(y)
         y = Tensor(y, requires_grad=False, dtype=tensor.dtype if tensor.dtype != dtypes.bool else dtypes.float32)
+
+    # Swap tensors if reverse is True.
     if reverse:
         x, y = y, x
+
+    # Directly return tensors if they are already the same shape.
     if (xshape := x.shape) == (yshape := y.shape):
         return (x, y)
 
+    # Adjust shapes to make them broadcastable. This is done by prepending 1's to the shape
+    # of the shorter tensor until both shapes have the same length.
     shape_delta = len(xshape) - len(yshape)
     if shape_delta > 0:
         y = y.reshape((1,) * shape_delta + yshape)
     elif shape_delta < 0:
         x = x.reshape((1,) * -shape_delta + xshape)
+
+    # Check if tensors are now the same shape. If yes, return them.
     if (xshape := x.shape) == (yshape := y.shape):
         return (x, y)
 
+    # Determine the final shape after broadcasting. This is the element-wise maximum
+    # of the shapes of the two tensors.
     shape_ret = tuple([max(x, y) for x, y in zip(xshape, yshape)])
+
+    # Expand tensors to the final broadcasted shape.
     if xshape != shape_ret:
         x = x.expand(shape_ret)
     if yshape != shape_ret:
@@ -39,6 +68,17 @@ def _broadcasted(tensor: Tensor, y: Tensor | float, reverse: bool = False) -> tu
 def _to_float(tensor: Tensor, x: Tensor | float):
+    """
+    Converts the input to float32 if it is not already a Tensor and
+    if it is suitable for certain operations where float32 dtype is required.
+
+    Args:
+        tensor (Tensor): The reference tensor to check compatibility.
+        x (Tensor | float): The tensor or scalar to be converted.
+
+    Returns:
+        The converted tensor or the original scalar.
+    """
     from edugrad.tensor import Tensor
 
     return (
@@ -52,6 +92,7 @@ def _to_float(tensor: Tensor, x: Tensor | float):
 def add(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
+    """Adds two tensors or a tensor and a scalar."""
     from edugrad.tensor import Tensor
 
     x = tensor._to_float(x)
@@ -59,6 +100,7 @@
 def sub(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
+    """Subtracts two tensors or a tensor and a scalar."""
     from edugrad.tensor import Tensor
 
     x = tensor._to_float(x)
@@ -70,6 +112,7 @@ def sub(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
 def mul(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
+    """Multiplies two tensors or a tensor and a scalar."""
     from edugrad.tensor import Tensor
 
     x = tensor._to_float(x)
@@ -81,6 +124,7 @@ def mul(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
 def div(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
+    """Divides two tensors or a tensor and a scalar."""
     from edugrad.tensor import Tensor
 
     x = tensor._to_float(x)
@@ -92,6 +136,7 @@ def div(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
 def pow(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
+    """Raises a tensor to the power of another tensor or a scalar."""
     from edugrad.tensor import Tensor
 
     x = tensor._to_float(x)
@@ -140,18 +185,22 @@ def pow(tensor: Tensor, x: Tensor | float, reverse=False) -> Tensor:
 def matmul(tensor: Tensor, x: Tensor, reverse=False) -> Tensor:
+    """Performs matrix multiplication."""
     return x.dot(tensor) if reverse else tensor.dot(x)
 
 
 def maximum(tensor: Tensor, x: Tensor | float) -> Tensor:
+    """Computes the element-wise maximum of two tensors."""
     return (tensor < x).detach().where(x, (tensor > x).detach().where(tensor, (tensor + x) / 2))
 
 
 def minimum(tensor: Tensor, x: Tensor | float) -> Tensor:
+    """Computes the element-wise minimum of two tensors."""
     return -((-tensor).maximum(-x))
 
 
 def where(tensor: Tensor, input_: Tensor | float, other: Tensor | float):
+    """Selects elements from two tensors based on a condition tensor."""
     x_, y = tensor._broadcasted(input_)
     x, z = x_._broadcasted(other)
     return function.Where.apply(x, *y._broadcasted(z))
diff --git a/edugrad/_tensor/tensor_combine_segment.py b/edugrad/_tensor/tensor_combine_segment.py
index 32eb2e2..fb0bdba 100644
--- a/edugrad/_tensor/tensor_combine_segment.py
+++ b/edugrad/_tensor/tensor_combine_segment.py
@@ -1,3 +1,5 @@
+"""Contains tensor operations like concatenation, stacking, repeating, and chunking."""
+
 from __future__ import annotations
 import math
 from functools import reduce
@@ -5,43 +7,102 @@
 from edugrad.helpers import all_int
 
 
+def cat(tensor: Tensor, *args: Tensor, dim: int) -> Tensor:
+    """Concatenates the given tensors along a specified dimension.
+
+    Args:
+        tensor (Tensor): The first tensor to concatenate.
+        *args (Tensor): Additional tensors to concatenate.
+        dim (int): The dimension along which to concatenate.
-def cat(tensor, *args, dim) -> Tensor:
+    Returns:
+        Tensor: A new tensor resulting from concatenating the given tensors.
+    """
     from edugrad.tensor import Tensor
 
+    # Adjust the dimension if negative.
     dim = (dim + len(tensor.shape)) if dim < 0 else dim
+
+    # Ensure all tensors have compatible shapes for concatenation.
     assert all(
         len(y.shape) == len(tensor.shape) and all(y.shape[i] == s for i, s in enumerate(tensor.shape) if i != dim)
         for y in args
     )
+
+    # Prepare arguments for concatenation.
     catargs = [tensor, *args]
+
+    # Assert that tensors are not zero-dimensional.
     assert all(t.shape for t in catargs), "zero-dimensional tensor cannot be concatenated"
+
+    # Calculate shapes and cumulative shapes for slicing.
     shapes = [s.shape[dim] for s in catargs]
     shape_cumsum = [0, *accumulate(shapes)]
     slc = [[(0, 0) for _ in tensor.shape] for _ in catargs]
+
+    # Adjust slices for each tensor.
     for shp, k, s in zip(shapes, shape_cumsum[:-1], slc):
         s[dim] = (k, shape_cumsum[-1] - k - shp)
+
+    # Concatenate by padding and adding tensors.
     return reduce(Tensor.__add__, [arg.pad(tuple(s)) for arg, s in zip(catargs, slc)])
 
 
-@staticmethod
-def stack(tensors, dim) -> Tensor:
+def stack(tensors: list[Tensor], dim: int) -> Tensor:
+    """Stacks a list of tensors along a new dimension.
+
+    Args:
+        tensors (list[Tensor]): The list of tensors to stack.
+        dim (int): The dimension along which to stack.
+
+    Returns:
+        Tensor: A new tensor resulting from stacking the given tensors.
+    """
+
+    from edugrad.tensor import Tensor
+
+    # Unsqueeze the first tensor and prepare the rest.
     first = tensors[0].unsqueeze(dim)
     unsqueezed_tensors = [tensor.unsqueeze(dim) for tensor in tensors[1:]]
-    # checks for shapes and number of Falsedimensions delegated to cat
+
+    # Delegate checks for shapes and number of dimensions to cat.
     return first.cat(*unsqueezed_tensors, dim=dim)
 
 
-def repeat(tensor: Tensor, repeats) -> Tensor:
+def repeat(tensor: Tensor, repeats: list[int]) -> Tensor:
+    """Repeats a tensor along specified dimensions.
+
+    Args:
+        tensor (Tensor): The tensor to repeat.
+        repeats (list[int]): The number of repetitions for each dimension.
+
+    Returns:
+        Tensor: A new tensor with repeated values.
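+
+    Example (illustrative; assumes the bound Tensor.repeat method accepts a sequence of repeats):
+        t = Tensor([[1, 2, 3], [4, 5, 6]])  # shape (2, 3)
+        r = t.repeat((2, 2))                # reshape to (1, 2, 1, 3), expand to (2, 2, 2, 3), reshape to (4, 6)
+        # r has shape (4, 6): the original block tiled twice along each dimension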
+    """
     base_shape = (1,) * (len(repeats) - tensor.ndim) + tensor.shape
     new_shape = [x for b in base_shape for x in (1, b)]
     expand_shape = [x for rs in zip(repeats, base_shape) for x in rs]
     final_shape = [r * s for r, s in zip(repeats, base_shape)]
+
+    # Repeat the tensor by reshaping, expanding, and reshaping again.
     return tensor.reshape(new_shape).expand(expand_shape).reshape(final_shape)
 
 
 def chunk(tensor: Tensor, num: int, dim: int) -> list[Tensor]:
+    """Splits a tensor into a specified number of chunks along a given dimension.
+
+    Args:
+        tensor (Tensor): The tensor to split.
+        num (int): The number of chunks to create.
+        dim (int): The dimension along which to split the tensor.
+
+    Returns:
+        list[Tensor]: A list of tensors representing the chunks.
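+
+    Example (illustrative; assumes Tensor.arange is available):
+        t = Tensor.arange(10)      # shape (10,)
+        parts = t.chunk(3, dim=0)  # step = ceil(10 / 3) = 4
+        # parts have shapes (4,), (4,) and (2,)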
+ +""" from typing import ClassVar, Dict, Optional, Final import numpy as np from dataclasses import dataclass diff --git a/edugrad/helpers.py b/edugrad/helpers.py index 025e2e3..4e4bff6 100644 --- a/edugrad/helpers.py +++ b/edugrad/helpers.py @@ -1,3 +1,5 @@ +"""Contains helper functions and DEBUG integer for verbose debugging used throughout the package.""" + from typing import Union, Tuple, Iterator, Any import os import functools diff --git a/edugrad/tensor.py b/edugrad/tensor.py index 6a1b2b7..3d7a40b 100644 --- a/edugrad/tensor.py +++ b/edugrad/tensor.py @@ -1,4 +1,4 @@ -"""Contain the tensor class that can be used for building neural networks with forward and backward pass. +"""Contains the tensor class that can be used for building neural networks with forward and backward pass. The module contains the "high-level ops". These are syntax sugar and built on top of the "mid-level ops" containing the the functions with forward and backward passes in Function.function which is build on top of the "low-level ops" @@ -120,16 +120,25 @@ def dtype(self) -> DType: # ------------------------------------------------------------------------------------------------------------------ # data handlers - def assign(self, x) -> Tensor: - # TODO: this is a hack for writing to DISK - if x.__class__ is not Tensor: + def assign(self, x: Any) -> Tensor: + """Assigns the value of another tensor or array to the current tensor. + + This method is a workaround for writing to disk and is used for in-place modification of tensor data. + """ + if not isinstance(x, Tensor): + # Convert x to a Tensor if it's not already one x = Tensor(x, dtype=self.dtype) + assert self.shape == x.shape, f"assign shape mismatch {self.shape} != {x.shape}" - assert not x.requires_grad # tensor requires_grad is okay? 
+        # Validate input dimensions and sample count
         assert 1 <= self.ndim <= 2 and num_samples > 0, f"{self.ndim=} must be 1 or 2 dim, {num_samples=} must be positive"
         assert replacement or num_samples == 1, "no replacement only supports num_samples = 1"
+
+        # If tensor is 1D, add a new dimension at the beginning
         weight = self.unsqueeze(0) if self.ndim == 1 else self
+
+        # Compute the cumulative distribution function (CDF) for the weights
         cdf = (cw := weight.cumsum(1)) / cw[:, -1].unsqueeze(1)
+
+        # Generate uniform random samples
         unif_samples = Tensor.rand(num_samples, cdf.shape[0], 1)
+
+        # Determine indices based on CDF
         indices = (unif_samples.expand((-1, -1, cdf.shape[1])) >= cdf).sum(2).permute((1, 0))
+
+        # If the original tensor was 1D, squeeze the resulting indices tensor
         return (indices.squeeze(0) if self.ndim == 1 else indices).cast(dtypes.int32)
 
     # ------------------------------------------------------------------------------------------------------------------
diff --git a/tests/gradcheck.py b/tests/gradcheck.py
index 3f207c0..a19a002 100644
--- a/tests/gradcheck.py
+++ b/tests/gradcheck.py
@@ -1,16 +1,36 @@
+"""Contains functions for computing the Jacobian and performing gradient checks."""
+
 import numpy as np
 from edugrad.tensor import Tensor
+from typing import Callable, List, Union
+
+def mask_like(like: np.ndarray, mask_inx: Union[int, List[int]], mask_value: float = 1.0) -> np.ndarray:
+    """Creates a mask array that is like the input array but with specified values masked.
-def mask_like(like, mask_inx, mask_value=1.0):
+    Args:
+        like (array): The array to mimic in terms of shape.
+        mask_inx (int or array-like): Indices to mask.
+        mask_value (float, optional): The value to set at the masked indices. Defaults to 1.0.
+
+    Returns:
+        array: Masked array with the same shape as `like`.
+    """
     mask = np.zeros_like(like).reshape(-1)
     mask[mask_inx] = mask_value
     return mask.reshape(like.shape)
 
 
+def jacobian(func: Callable, input: Tensor):
+    """Computes the Jacobian matrix for a function at a given input.
-def jacobian(func, input):
-    output = func(input)
+    Args:
+        func: The function for which to compute the Jacobian.
+        input: The input tensor at which to evaluate the Jacobian.
+
+    Returns:
+        array: Jacobian matrix evaluated at the given input.
+    """
+    output = func(input)
     ji = input.numpy().reshape(-1).shape[-1]
     jo = output.numpy().reshape(-1).shape[-1]
     J = np.zeros((jo, ji), dtype=np.float32)
@@ -19,8 +39,8 @@ def jacobian(func, input):
         input.grad = None
         output = func(input)
 
-        # edugrad doesn't support slicing, tiny-hack to select
-        # the needed scalar an backpropagate only through it
+        # edugrad doesn't support slicing, workaround to select
+        # the needed scalar and backpropagate only through it.
         o_scalar = Tensor(mask_like(output.numpy(), o, 1.0)).mul(output).sum()
         o_scalar.backward()
@@ -28,8 +48,17 @@ def jacobian(func, input):
             J[o, i] = grad
     return J
 
 
+def numerical_jacobian(func: Callable, input: Tensor, eps: float = 1e-3):
+    """Computes an approximation of the Jacobian matrix using finite differences.
+
+    Args:
+        func: The function for which to approximate the Jacobian.
+        input: The input tensor at which to approximate the Jacobian.
+        eps: The epsilon for finite differences. Defaults to 1e-3.
-def numerical_jacobian(func, input, eps=1e-3):
+    Returns:
+        array: Approximated Jacobian matrix.
+    """
     output = func(input)
 
     ji = input.numpy().reshape(-1).shape[-1]
@@ -47,8 +76,19 @@ def numerical_jacobian(func, input, eps=1e-3):
         NJ[:, i] = grad_approx
     return NJ
 
 
+def gradcheck(func: Callable, input: Tensor, eps: float = 1e-3, atol: float = 1e-3, rtol: float = 1e-3):
+    """Performs a gradient check by comparing the Jacobian to its numerical approximation.
+
+    Args:
+        func: The function for which to perform the gradient check.
+        input: The input tensor for the function.
+        eps: Epsilon for finite differences in the numerical Jacobian. Defaults to 1e-3.
+        atol: Absolute tolerance for np.allclose. Defaults to 1e-3.
+        rtol: Relative tolerance for np.allclose. Defaults to 1e-3.
-def gradcheck(func, input, eps=1e-3, atol=1e-3, rtol=1e-3):
+    Returns:
+        bool: True if the computed Jacobian is close to its numerical approximation.
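+
+    Example (illustrative; the input must require gradients so that backward() populates input.grad):
+        x = Tensor([[0.5, -1.0], [2.0, 0.1]], requires_grad=True)
+        assert gradcheck(lambda t: (t * t).sum(), x)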
+ """ + output = func(input) ji = input.numpy().reshape(-1).shape[-1] jo = output.numpy().reshape(-1).shape[-1] J = np.zeros((jo, ji), dtype=np.float32) @@ -19,8 +39,8 @@ def jacobian(func, input): input.grad = None output = func(input) - # edugrad doesn't support slicing, tiny-hack to select - # the needed scalar an backpropagate only through it + # edugrad doesn't support slicing, workaround to select + # the needed scalar and backpropagate only through it. o_scalar = Tensor(mask_like(output.numpy(), o, 1.0)).mul(output).sum() o_scalar.backward() @@ -28,8 +48,17 @@ def jacobian(func, input): J[o, i] = grad return J +def numerical_jacobian(func: Callable, input: Tensor, eps:float=1e-3): + """Computes an approximation of the Jacobian matrix using finite differences. + + Args: + func: The function for which to approximate the Jacobian. + input: The input tensor at which to approximate the Jacobian. + eps: The epsilon for finite differences. Defaults to 1e-3. -def numerical_jacobian(func, input, eps=1e-3): + Returns: + array: Approximated Jacobian matrix. + """ output = func(input) ji = input.numpy().reshape(-1).shape[-1] @@ -47,8 +76,19 @@ def numerical_jacobian(func, input, eps=1e-3): NJ[:, i] = grad_approx return NJ +def gradcheck(func: Callable, input: Tensor, eps: float=1e-3, atol: float=1e-3, rtol: float=1e-3): + """Performs a gradient check by comparing the Jacobian to its numerical approximation. + + Args: + func: The function for which to perform the gradient check. + input : The input tensor for the function. + eps: Epsilon for finite differences in numerical Jacobian. Defaults to 1e-3. + atol: Absolute tolerance for np.allclose. Defaults to 1e-3. + rtol: Relative tolerance for np.allclose. Defaults to 1e-3. -def gradcheck(func, input, eps=1e-3, atol=1e-3, rtol=1e-3): + Returns: + bool: True if the computed Jacobian is close to its numerical approximation. + """ NJ = numerical_jacobian(func, input, eps) J = jacobian(func, input) - return np.allclose(J, NJ, atol=atol, rtol=rtol) \ No newline at end of file + return np.allclose(J, NJ, atol=atol, rtol=rtol)