From a02f499dabb80d283da4c02d87f07c1e5160c211 Mon Sep 17 00:00:00 2001
From: Eric Shi
Date: Wed, 6 Nov 2024 18:34:22 -0800
Subject: [PATCH] Fix code generation of in-place multiply and divide

In-place multiplication and division are not differentiable, so code
generation now falls back to emitting a regular assignment statement and
only prints the non-differentiability warning when `wp.config.verbose`
is enabled. Document the behavior and split the in-place array tests
into differentiable and non-differentiable variants.
---
 CHANGELOG.md                       |   1 +
 docs/modules/differentiability.rst |  42 ++++++++
 warp/codegen.py                    |   5 +-
 warp/tests/test_array.py           | 134 ++++++++++++++++++-----------
 4 files changed, 133 insertions(+), 49 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5695299d..7e9f9b7a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@
 - Fix an incorrect user function being sometimes resolved when multiple overloads are available with array parameters with different `dtype` values.
 - Fix error being raised when static and dynamic for-loops are written in sequence with the same iteration variable names ([GH-331](https://github.com/NVIDIA/warp/issues/331)).
 - Fix an issue with the `Texture Write` node, used in the Mandelbrot Omniverse sample, sometimes erroring out in multi-GPU environments.
+- Fix code generation of in-place multiplication and division operations (regression introduced in a69d061) ([GH-342](https://github.com/NVIDIA/warp/issues/342)).
 
 ## [1.4.1] - 2024-10-15
 
diff --git a/docs/modules/differentiability.rst b/docs/modules/differentiability.rst
index d3db5c53..a6a3a15a 100644
--- a/docs/modules/differentiability.rst
+++ b/docs/modules/differentiability.rst
@@ -893,7 +893,49 @@
 Warp uses a source-code transformation approach to auto-differentiation.
 In this approach, the backwards pass must keep a record of intermediate values computed during the forward pass.
 This imposes some restrictions on what kernels can do if they are to remain differentiable.
 
+In-Place Math Operations
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+In-place addition and subtraction can be used in kernels participating in the backward pass, e.g.:
+
+.. code-block:: python
+
+    @wp.kernel
+    def inplace(a: wp.array(dtype=float), b: wp.array(dtype=float)):
+        i = wp.tid()
+
+        a[i] -= b[i]
+
+
+    a = wp.full(10, value=10.0, dtype=float, requires_grad=True)
+    b = wp.full(10, value=2.0, dtype=float, requires_grad=True)
+
+    with wp.Tape() as tape:
+        wp.launch(inplace, a.shape, inputs=[a, b])
+
+    tape.backward(grads={a: wp.ones_like(a)})
+
+    print(a.grad)  # [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+    print(b.grad)  # [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
+
+In-place multiplication and division are *not* supported, and using them produces incorrect results in the backward pass.
+A warning will be emitted during code generation when ``wp.config.verbose`` is set to ``True``.
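+
+For example, the first kernel below is an illustrative sketch of the unsupported pattern (it mirrors the new ``inplace_mul_1d`` test): its forward result is correct, but its gradients are not. Writing the product to a separate output array instead, as in the second kernel, keeps the operation differentiable:
+
+.. code-block:: python
+
+    @wp.kernel
+    def scale_inplace(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+        i = wp.tid()
+        x[i] *= y[i]  # forward result is correct, but gradients are not
+
+    @wp.kernel
+    def scale(x: wp.array(dtype=float), y: wp.array(dtype=float), out: wp.array(dtype=float)):
+        i = wp.tid()
+        out[i] = x[i] * y[i]  # inputs are preserved for the backward pass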
+
 Dynamic Loops
 ^^^^^^^^^^^^^
 Currently, dynamic loops are not replayed or unrolled in the backward pass, meaning intermediate values that are
diff --git a/warp/codegen.py b/warp/codegen.py
index 51c98c72..5a800888 100644
--- a/warp/codegen.py
+++ b/warp/codegen.py
@@ -2576,7 +2576,10 @@ def make_new_assign_statement():
                 if warp.config.verify_autograd_array_access:
                     target.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
             else:
-                print(f"Warning: in-place op {node.op} is not differentiable")
+                if warp.config.verbose:
+                    print(f"Warning: in-place op {node.op} is not differentiable")
+                make_new_assign_statement()
+                return
 
             # TODO
         elif type_is_vector(target_type) or type_is_quaternion(target_type) or type_is_matrix(target_type):
diff --git a/warp/tests/test_array.py b/warp/tests/test_array.py
index 3ffddb71..66ad8f12 100644
--- a/warp/tests/test_array.py
+++ b/warp/tests/test_array.py
@@ -2361,64 +2361,75 @@ def first_row_plus_one(x: wp.array2d(dtype=float)):
     assert_np_equal(arr_warp.numpy(), np.array([[2, 1, 1], [1, 0, 0], [1, 0, 0]]))
 
 
-def test_array_inplace_ops(test, device):
-    @wp.kernel
-    def inplace_add_1d(x: wp.array(dtype=float), y: wp.array(dtype=float)):
-        i = wp.tid()
-        x[i] += y[i]
+@wp.kernel
+def inplace_add_1d(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+    i = wp.tid()
+    x[i] += y[i]
 
-    @wp.kernel
-    def inplace_add_2d(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
-        i, j = wp.tid()
-        x[i, j] += y[i, j]
+
+@wp.kernel
+def inplace_add_2d(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+    i, j = wp.tid()
+    x[i, j] += y[i, j]
 
-    @wp.kernel
-    def inplace_add_3d(x: wp.array3d(dtype=float), y: wp.array3d(dtype=float)):
-        i, j, k = wp.tid()
-        x[i, j, k] += y[i, j, k]
+
+@wp.kernel
+def inplace_add_3d(x: wp.array3d(dtype=float), y: wp.array3d(dtype=float)):
+    i, j, k = wp.tid()
+    x[i, j, k] += y[i, j, k]
 
-    @wp.kernel
-    def inplace_add_4d(x: wp.array4d(dtype=float), y: wp.array4d(dtype=float)):
-        i, j, k, l = wp.tid()
-        x[i, j, k, l] += y[i, j, k, l]
+
+@wp.kernel
+def inplace_add_4d(x: wp.array4d(dtype=float), y: wp.array4d(dtype=float)):
+    i, j, k, l = wp.tid()
+    x[i, j, k, l] += y[i, j, k, l]
 
-    @wp.kernel
-    def inplace_sub_1d(x: wp.array(dtype=float), y: wp.array(dtype=float)):
-        i = wp.tid()
-        x[i] -= y[i]
+
+@wp.kernel
+def inplace_sub_1d(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+    i = wp.tid()
+    x[i] -= y[i]
 
-    @wp.kernel
-    def inplace_sub_2d(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
-        i, j = wp.tid()
-        x[i, j] -= y[i, j]
-
-    @wp.kernel
-    def inplace_sub_3d(x: wp.array3d(dtype=float), y: wp.array3d(dtype=float)):
-        i, j, k = wp.tid()
-        x[i, j, k] -= y[i, j, k]
-
-    @wp.kernel
-    def inplace_sub_4d(x: wp.array4d(dtype=float), y: wp.array4d(dtype=float)):
-        i, j, k, l = wp.tid()
-        x[i, j, k, l] -= y[i, j, k, l]
-
-    @wp.kernel
-    def inplace_add_vecs(x: wp.array(dtype=wp.vec3), y: wp.array(dtype=wp.vec3)):
-        i = wp.tid()
-        x[i] += y[i]
-
-    @wp.kernel
-    def inplace_add_mats(x: wp.array(dtype=wp.mat33), y: wp.array(dtype=wp.mat33)):
-        i = wp.tid()
-        x[i] += y[i]
-
-    @wp.kernel
-    def inplace_add_rhs(x: wp.array(dtype=float), y: wp.array(dtype=float), z: wp.array(dtype=float)):
-        i = wp.tid()
-        a = y[i]
-        a += x[i]
-        wp.atomic_add(z, 0, a)
+
+@wp.kernel
+def inplace_sub_2d(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+    i, j = wp.tid()
+    x[i, j] -= y[i, j]
+
+
+@wp.kernel
+def inplace_sub_3d(x: wp.array3d(dtype=float), y: wp.array3d(dtype=float)):
+    i, j, k = wp.tid()
+    x[i, j, k] -= y[i, j, k]
+
+
+@wp.kernel
+def inplace_sub_4d(x: wp.array4d(dtype=float), y: wp.array4d(dtype=float)):
+    i, j, k, l = wp.tid()
+    x[i, j, k, l] -= y[i, j, k, l]
+
+
+@wp.kernel
+def inplace_add_vecs(x: wp.array(dtype=wp.vec3), y: wp.array(dtype=wp.vec3)):
+    i = wp.tid()
+    x[i] += y[i]
+
+
+@wp.kernel
+def inplace_add_mats(x: wp.array(dtype=wp.mat33), y: wp.array(dtype=wp.mat33)):
+    i = wp.tid()
+    x[i] += y[i]
+
+
+@wp.kernel
+def inplace_add_rhs(x: wp.array(dtype=float), y: wp.array(dtype=float), z: wp.array(dtype=float)):
+    i = wp.tid()
+    a = y[i]
+    a += x[i]
+    wp.atomic_add(z, 0, a)
+
+
+def test_array_inplace_diff_ops(test, device):
     N = 3
     x1 = wp.ones(N, dtype=float, requires_grad=True, device=device)
     x2 = wp.ones((N, N), dtype=float, requires_grad=True, device=device)
@@ -2528,6 +2539,32 @@ def inplace_add_rhs(x: wp.array(dtype=float), y: wp.array(dtype=float), z: wp.array(dtype=float)):
     assert_np_equal(y.grad.numpy(), np.ones(1, dtype=float))
 
 
+@wp.kernel
+def inplace_mul_1d(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+    i = wp.tid()
+    x[i] *= y[i]
+
+
+@wp.kernel
+def inplace_div_1d(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+    i = wp.tid()
+    x[i] /= y[i]
+
+
+def test_array_inplace_non_diff_ops(test, device):
+    N = 3
+    x1 = wp.full(N, value=10.0, dtype=float, device=device)
+    y1 = wp.full(N, value=5.0, dtype=float, device=device)
+
+    wp.launch(inplace_mul_1d, N, inputs=[x1, y1], device=device)
+    assert_np_equal(x1.numpy(), np.full(N, fill_value=50.0, dtype=float))
+
+    x1.fill_(10.0)
+    y1.fill_(5.0)
+    wp.launch(inplace_div_1d, N, inputs=[x1, y1], device=device)
+    assert_np_equal(x1.numpy(), np.full(N, fill_value=2.0, dtype=float))
+
+
 @wp.kernel
 def inc_scalar(a: wp.array(dtype=float)):
     tid = wp.tid()
@@ -2750,7 +2787,8 @@ def test_array_new_del(self):
 
 add_function_test(TestArray, "test_array_aliasing_from_numpy", test_array_aliasing_from_numpy, devices=["cpu"])
 add_function_test(TestArray, "test_numpy_array_interface", test_numpy_array_interface, devices=["cpu"])
-add_function_test(TestArray, "test_array_inplace_ops", test_array_inplace_ops, devices=devices)
+add_function_test(TestArray, "test_array_inplace_diff_ops", test_array_inplace_diff_ops, devices=devices)
+add_function_test(TestArray, "test_array_inplace_non_diff_ops", test_array_inplace_non_diff_ops, devices=devices)
 add_function_test(TestArray, "test_direct_from_numpy", test_direct_from_numpy, devices=["cpu"])
 add_function_test(TestArray, "test_kernel_array_from_ptr", test_kernel_array_from_ptr, devices=devices)