
Edge case where broadcast with scalar returns float grad (#373)
* add prepare_op alias

* clean up documented examples

* ensure backprop through broadcast resulting in scalar produces 0D array, not float

* add write your own operation docs
rsokl authored Apr 1, 2021
1 parent 9137ee8 commit f91410e
Showing 8 changed files with 123 additions and 22 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -16,8 +16,9 @@ MyGrad is a lightweight library that adds automatic differentiation to NumPy –
>>> import numpy as np

>>> x = mg.tensor([1., 2., 3.])  # like numpy.array, but supports backprop!
->>> np.sum(x * x).backward()     # works natively with numpy functions!
->>> x.grad
+>>> f = np.sum(x * x)   # tensors work with numpy functions!
+>>> f.backward()        # triggers automatic differentiation
+>>> x.grad              # stores [df/dx0, df/dx1, df/dx2]
array([2., 4., 6.])
```
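
A quick aside that is not part of the diff: for ``x = [1., 2., 3.]`` the quantity being differentiated is ``f = x0**2 + x1**2 + x2**2``, so ``df/dx_i = 2 * x_i``, which is exactly the ``[2., 4., 6.]`` shown above. This can be checked numerically:

```python
>>> import mygrad as mg
>>> import numpy as np
>>> x = mg.tensor([1., 2., 3.])
>>> f = np.sum(x * x)
>>> f.backward()
>>> np.allclose(x.grad, 2 * x.data)  # df/dx_i = 2 * x_i
True
```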

16 changes: 16 additions & 0 deletions docs/source/changes.rst
@@ -78,6 +78,22 @@ You can now control the dimensionality of a tensor and whether or not a tensor c
+-------------------------------------------------------+-------------------------------------------------------+-------------------------------------------------+


Support for dtype, where, and out in ufuncs
-------------------------------------------

MyGrad now implements its ufuncs with support for specifying ``dtype``, boolean masks (``where``), and
in-place targets (``out``). The additional ufunc methods, such as ``mygrad.add.reduce``, are not yet implemented.

+------------------------------------------------------------------+
| MyGrad 2.0                                                       |
+==================================================================+
| .. code:: python                                                 |
|                                                                  |
|    >>> mg.add([1, 2], [0, 2], where=[True, False], dtype=float)  |
|    Tensor([3., 1.])                                              |
+------------------------------------------------------------------+
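
An editorial illustration, not a snippet from this changelog: the in-place ``out`` support described above should allow a ufunc to write its result into an existing tensor. The call below assumes ``mygrad.zeros`` and a tensor-valued ``out`` argument behave like their NumPy counterparts.

.. code:: pycon

    >>> target = mg.zeros(2)
    >>> mg.add([1., 2.], [3., 4.], out=target)  # result written in-place into ``target``
    Tensor([4., 6.])
    >>> target
    Tensor([4., 6.])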


Augmented Updates on Tensors Now Match NumPy's Behavior
-------------------------------------------------------

5 changes: 3 additions & 2 deletions docs/source/index.rst
@@ -13,8 +13,9 @@ MyGrad is a lightweight library that adds automatic differentiation to NumPy –
>>> import numpy as np
>>> x = mg.tensor([1., 2., 3.])  # like numpy.array, but supports backprop!
->>> np.sum(x * x).backward()     # works natively with numpy functions!
->>> x.grad
+>>> f = np.sum(x * x)   # tensors work with numpy functions!
+>>> f.backward()        # triggers automatic differentiation
+>>> x.grad              # stores [df/dx0, df/dx1, df/dx2]
array([2., 4., 6.])
5 changes: 3 additions & 2 deletions docs/source/intro.rst
@@ -10,8 +10,9 @@ MyGrad is a lightweight library that adds automatic differentiation to NumPy –
>>> import numpy as np
>>> x = mg.tensor([1., 2., 3.])  # like numpy.array, but supports backprop!
->>> np.sum(x * x).backward()     # works natively with numpy functions!
->>> x.grad
+>>> f = np.sum(x * x)   # tensors work with numpy functions!
+>>> f.backward()        # triggers automatic differentiation
+>>> x.grad              # stores [df/dx0, df/dx1, df/dx2]
array([2., 4., 6.])
97 changes: 82 additions & 15 deletions docs/source/operation.rst
@@ -1,22 +1,89 @@
-MyGrad's Operation Class
-************************
-
-Base class for all tensor operations that support back-propagation
-of gradients.
-
-Consider the Operation-instance ``f``. A forward-pass through ``f`` is defined
-via ``f.__call__``. Thus, given tensors ``a`` and ``b``, a computational
-graph is defined ``f.__call__(a, b) -> c``, where the "creator" of tensor ``c``
-is recorded as ``f``.::
-
-      (tensor: a) --+
-                    |-> [operation: f(a, b)] --> (tensor: c)
-      (tensor: b) --+
-
-Thus back-propagating through ``c`` will instruct ``f`` to back-propagate
-the gradient to its inputs, which are recorded as ``a`` and ``b``. Each
-node then back-propagates to any Operation-instance that is recorded
-as its creator, and so on.
+Writing Your Own Operations
+***************************
+
+Let's write our own "multiply" operation.
+
+.. code:: python
+
+    import numpy as np
+
+    import mygrad as mg
+    from mygrad import prepare_op
+    from mygrad.operation_base import Operation
+    from mygrad.typing import ArrayLike
+
+
+    # All operations should inherit from Operation, or one of its subclasses
+    class CustomMultiply(Operation):
+        """Performs f(x, y) = x * y"""
+
+        def __call__(self, x: mg.Tensor, y: mg.Tensor) -> np.ndarray:
+            # This method defines the "forward pass" of the operation.
+            # It must bind the variable tensors to the op and compute
+            # the output of the operation as a numpy array.
+
+            # All tensors must be bound as a tuple to the `variables`
+            # instance variable.
+            self.variables = (x, y)
+
+            # The forward pass should be performed using numpy arrays,
+            # not the tensors themselves.
+            x_arr = x.data
+            y_arr = y.data
+            return x_arr * y_arr
+
+        def backward_var(self, grad, index, **kwargs):
+            """Given ``grad = dℒ/df``, computes ``∂ℒ/∂x`` and ``∂ℒ/∂y``.
+
+            ``ℒ`` is assumed to be the terminal node from which
+            ``ℒ.backward()`` was called.
+
+            Parameters
+            ----------
+            grad : numpy.ndarray
+                The back-propagated total derivative with respect to the
+                present operation: dℒ/df. This will have the same shape as
+                f, the result of the forward pass.
+
+            index : Literal[0, 1]
+                The index-location of ``var`` in ``self.variables``
+
+            Returns
+            -------
+            numpy.ndarray
+                ∂ℒ/∂x_{i}
+
+            Raises
+            ------
+            SkipGradient"""
+            x, y = self.variables
+            x_arr = x.data
+            y_arr = y.data
+
+            if index == 0:  # backprop through x
+                return grad * y_arr  # ∂ℒ/∂x = (∂ℒ/∂f)(∂f/∂x)
+            elif index == 1:  # backprop through y
+                return grad * x_arr  # ∂ℒ/∂y = (∂ℒ/∂f)(∂f/∂y)
+
+
+    # Our function stitches together our operation class with the
+    # operation arguments via `mygrad.prepare_op`
+    def custom_multiply(x: ArrayLike, y: ArrayLike) -> mg.Tensor:
+        # `prepare_op` will take care of casting `x` and `y` to tensors
+        # if they are not already tensors.
+        return prepare_op(CustomMultiply, x, y)
+
+We can now use our differentiable function! It will automatically be compatible
+with broadcasting; our operation need not account for broadcasting in either the
+forward pass or the backward pass.
+
+.. code:: pycon
+
+    >>> x = mg.tensor(2.0)
+    >>> y = mg.tensor([1.0, 2.0, 3.0])
+    >>> custom_multiply(x, y).backward()
+    >>> x.grad, y.grad
+    (array(6.), array([2., 2., 2.]))

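Aside (not part of this diff): a quick, informal way to validate a custom operation is to compare its gradients against MyGrad's built-in counterpart. The check below assumes ``Tensor.sum`` mirrors ``numpy.ndarray.sum``, as MyGrad's other NumPy-mirrored methods do.

.. code:: pycon

    >>> a = mg.tensor([1.5, -2.0, 3.0])
    >>> b = mg.tensor([4.0, 0.5, -1.0])
    >>> custom_multiply(a, b).sum().backward()
    >>> a2, b2 = mg.tensor([1.5, -2.0, 3.0]), mg.tensor([4.0, 0.5, -1.0])
    >>> (a2 * b2).sum().backward()  # same computation via built-in multiplication
    >>> np.allclose(a.grad, a2.grad) and np.allclose(b.grad, b2.grad)
    True
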
Documentation for mygrad.Operation
----------------------------------
1 change: 1 addition & 0 deletions src/mygrad/__init__.py
@@ -41,3 +41,4 @@


setattr(Tensor, "clip", clip)
prepare_op = Tensor._op
7 changes: 6 additions & 1 deletion src/mygrad/operation_base.py
@@ -93,7 +93,12 @@ def grad_post_process_fn(
        # the extra function call by doing the shape check upfront
        if grad.shape == var_shape:
            return grad
-       return reduce_broadcast(grad, var_shape)
+       out = reduce_broadcast(grad, var_shape)
+
+       if out.ndim == 0:
+           # sum-reduction to a scalar produces a float
+           out = np.array(out, copy=False)
+       return out

    @abstractmethod
    def __call__(self, *input_vars: "Tensor", **kwargs) -> np.ndarray:
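For context on the change above (editorial, not part of the commit): ``reduce_broadcast`` sum-reduces a broadcast gradient back down to the shape of the input variable, and summing a NumPy array over every axis yields a NumPy scalar rather than a 0-D ``ndarray``. The added ``np.array(out, copy=False)`` wraps that scalar back into a 0-D array; under NumPy 1.x semantics, ``copy=False`` means "copy only if necessary". A minimal illustration in plain NumPy:

```python
import numpy as np

# Incoming dL/df for f = x * y, where a scalar x was broadcast against y of shape (3,)
grad = np.array([1.0, 2.0, 3.0])

summed = grad.sum()                    # reduce over the broadcast axes -> NumPy scalar
print(isinstance(summed, np.ndarray))  # False -- this is the "float grad" edge case

fixed = np.array(summed, copy=False)              # the fix: re-wrap as a 0-D ndarray
print(isinstance(fixed, np.ndarray), fixed.ndim)  # True 0
```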
9 changes: 9 additions & 0 deletions tests/tensor_base/test_backward.py
@@ -9,6 +9,15 @@
from tests.custom_strategies import tensors


def test_scalar_broadcasting_produces_narray_grad():
    x = mg.tensor(2.0)
    y = mg.tensor([1.0, 2.0, 3.0])

    (x * y).backward()
    assert isinstance(x.grad, np.ndarray)
    assert isinstance(y.grad, np.ndarray)


def test_simple_behavior():
    tensor = mg.Tensor([1.0, 2.0])

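A possible companion regression test (an editorial sketch, not part of this commit) would cover the mirrored case, where the scalar sits on the other side of the broadcast:

```python
import numpy as np
import mygrad as mg


def test_scalar_on_right_also_produces_ndarray_grad():
    # hypothetical companion to the test added above; not in this commit
    x = mg.tensor([1.0, 2.0, 3.0])
    y = mg.tensor(2.0)

    (x * y).backward()
    assert isinstance(x.grad, np.ndarray)
    assert isinstance(y.grad, np.ndarray)
    assert y.grad.ndim == 0  # the broadcast-reduced gradient is a 0-D array, not a float
```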
