Skip to content

Commit 8c84fcf

Browse files
committed
Reuse LU decomposition in Solve
1 parent cff7587 commit 8c84fcf

File tree

10 files changed

+377
-7
lines changed

10 files changed

+377
-7
lines changed

pytensor/compile/mode.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,8 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
490490
"fusion",
491491
"inplace",
492492
"scan_save_mem_prealloc",
493+
"reuse_lu_decomposition_multiple_solves",
494+
"scan_split_non_sequence_lu_decomposition_solve",
493495
],
494496
),
495497
)

pytensor/scan/rewriting.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2561,26 +2561,24 @@ def scan_push_out_dot1(fgraph, node):
25612561
position=1,
25622562
)
25632563

2564-
25652564
scan_seqopt1.register(
25662565
"scan_push_out_non_seq",
25672566
in2out(scan_push_out_non_seq, ignore_newtrees=True),
25682567
"scan_pushout_nonseqs_ops", # For backcompat: so it can be tagged with old name
25692568
"fast_run",
25702569
"scan",
25712570
"scan_pushout",
2572-
position=2,
2571+
position=3,
25732572
)
25742573

2575-
25762574
scan_seqopt1.register(
25772575
"scan_push_out_seq",
25782576
in2out(scan_push_out_seq, ignore_newtrees=True),
25792577
"scan_pushout_seqs_ops", # For backcompat: so it can be tagged with old name
25802578
"fast_run",
25812579
"scan",
25822580
"scan_pushout",
2583-
position=3,
2581+
position=4,
25842582
)
25852583

25862584

@@ -2592,7 +2590,7 @@ def scan_push_out_dot1(fgraph, node):
25922590
"more_mem",
25932591
"scan",
25942592
"scan_pushout",
2595-
position=4,
2593+
position=5,
25962594
)
25972595

25982596

@@ -2605,7 +2603,7 @@ def scan_push_out_dot1(fgraph, node):
26052603
"more_mem",
26062604
"scan",
26072605
"scan_pushout",
2608-
position=5,
2606+
position=6,
26092607
)
26102608

26112609
scan_eqopt2.register(

pytensor/tensor/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def _get_vector_length_Constant(op: Op | Variable, var: Constant) -> int:
114114

115115

116116
# isort: off
117+
import pytensor.tensor._linalg
117118
from pytensor.tensor import linalg
118119
from pytensor.tensor import special
119120
from pytensor.tensor import signal

pytensor/tensor/_linalg/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Register rewrites
2+
import pytensor.tensor._linalg.solve
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Register rewrites in the database
2+
import pytensor.tensor._linalg.solve.rewriting
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
from copy import copy
2+
3+
from pytensor.graph import Constant, graph_inputs
4+
from pytensor.graph.rewriting.basic import copy_stack_trace, in2out, node_rewriter
5+
from pytensor.scan.op import Scan
6+
from pytensor.scan.rewriting import scan_seqopt1
7+
from pytensor.tensor.basic import atleast_Nd
8+
from pytensor.tensor.blockwise import Blockwise
9+
from pytensor.tensor.elemwise import DimShuffle
10+
from pytensor.tensor.rewriting.basic import register_specialize
11+
from pytensor.tensor.rewriting.linalg import is_matrix_transpose
12+
from pytensor.tensor.slinalg import Solve, lu_factor, lu_solve
13+
from pytensor.tensor.variable import TensorVariable
14+
15+
16+
def decompose_A(A, assume_a):
    """Compute a reusable decomposition of `A` for the given structural assumption.

    Only ``assume_a == "gen"`` (general matrix) is currently supported, in
    which case an LU factorization is returned.
    """
    if assume_a != "gen":
        raise NotImplementedError
    return lu_factor(A, check_finite=False)
21+
22+
23+
def solve_lu_decomposed_system(A_decomp, b, b_ndim, assume_a, transposed=False):
    """Solve a linear system from a precomputed decomposition of `A`.

    Only ``assume_a == "gen"`` is currently supported; `A_decomp` is then the
    LU factorization produced by `decompose_A`. When `transposed` is true the
    system is solved against the transpose of the decomposed matrix.
    """
    if assume_a != "gen":
        raise NotImplementedError
    return lu_solve(A_decomp, b, b_ndim=b_ndim, trans=transposed)
28+
29+
30+
_SPLITTABLE_SOLVE_ASSUME_A = {"gen"}
31+
32+
33+
def _split_lu_solve_steps(fgraph, node, *, eager: bool):
    """Replace Solve nodes sharing a common `A` by one decomposition + cheap solves.

    Starting from one Blockwise Solve `node`, find the root `A` it solves
    against, collect every Solve client of `A` (and of `A.T`) with the same
    ``assume_a``, decompose `A` once, and return a replacement dict mapping
    each old solve output to a solve built on the shared decomposition.

    When `eager` is False, a single-use `A` is only rewritten if it is being
    broadcast against `b` inside the Blockwise (the decomposition would
    otherwise be redone per batch entry). Returns ``None`` when no rewrite
    applies.
    """
    if not isinstance(node.op.core_op, Solve):
        return None

    def get_root_A(a: TensorVariable) -> tuple[TensorVariable, bool]:
        # Find the root variable of the first input to Solve
        # If `a` is a left expand_dims or matrix transpose (DimShuffle variants),
        # the root variable is the pre-DimShuffled input.
        # Otherwise, `a` is considered the root variable.
        # We also return whether the root `a` is transposed.
        transposed = False
        if a.owner is not None and isinstance(a.owner.op, DimShuffle):
            if a.owner.op.is_left_expand_dims:
                [a] = a.owner.inputs
            elif is_matrix_transpose(a):
                [a] = a.owner.inputs
                transposed = True
        return a, transposed

    def find_solve_clients(var, assume_a):
        # Collect Blockwise Solve clients that use `var` as their `A` input
        # (client input index 0) and share the same `assume_a`.
        clients = []
        for cl, idx in fgraph.clients[var]:
            if (
                idx == 0
                and isinstance(cl.op, Blockwise)
                and isinstance(cl.op.core_op, Solve)
                and (cl.op.core_op.assume_a == assume_a)
            ):
                clients.append(cl)
            elif isinstance(cl.op, DimShuffle) and cl.op.is_left_expand_dims:
                # If it's a left expand_dims, recurse on the output
                clients.extend(find_solve_clients(cl.outputs[0], assume_a))
        return clients

    assume_a = node.op.core_op.assume_a

    if assume_a not in _SPLITTABLE_SOLVE_ASSUME_A:
        return None

    A, _ = get_root_A(node.inputs[0])

    # Find Solve using A (or left expand_dims of A)
    # TODO: We could handle arbitrary shuffle of the batch dimensions, just need to propagate
    # that to the A_decomp outputs
    A_solve_clients_and_transpose = [
        (client, False) for client in find_solve_clients(A, assume_a)
    ]

    # Find Solves using A.T
    for cl, _ in fgraph.clients[A]:
        if isinstance(cl.op, DimShuffle) and is_matrix_transpose(cl.out):
            A_T = cl.out
            A_solve_clients_and_transpose.extend(
                (client, True) for client in find_solve_clients(A_T, assume_a)
            )

    if not eager and len(A_solve_clients_and_transpose) == 1:
        # If there's a single use don't do it... unless it's being broadcast in a Blockwise (or we're eager)
        # That's a "reuse" inside the inner vectorized loop
        batch_ndim = node.op.batch_ndim(node)
        (client, _) = A_solve_clients_and_transpose[0]
        original_A, b = client.inputs
        # Rewrite only if some batch dim of A is broadcast while b's isn't:
        # then the same A is solved against multiple b's within one Blockwise.
        if not any(
            a_bcast and not b_bcast
            for a_bcast, b_bcast in zip(
                original_A.type.broadcastable[:batch_ndim],
                b.type.broadcastable[:batch_ndim],
                strict=True,
            )
        ):
            return None

    # Decompose A once; every collected Solve client reuses this result.
    A_decomp = decompose_A(A, assume_a=assume_a)

    replacements = {}
    for client, transposed in A_solve_clients_and_transpose:
        _, b = client.inputs
        b_ndim = client.op.core_op.b_ndim
        new_x = solve_lu_decomposed_system(
            A_decomp, b, b_ndim=b_ndim, assume_a=assume_a, transposed=transposed
        )
        [old_x] = client.outputs
        # Restore any left expand_dims that were stripped when finding the
        # root A, and keep the original output dtype.
        new_x = atleast_Nd(new_x, n=old_x.type.ndim).astype(old_x.type.dtype)
        copy_stack_trace(old_x, new_x)
        replacements[old_x] = new_x

    return replacements
120+
121+
122+
@register_specialize
@node_rewriter([Blockwise])
def reuse_lu_decomposition_multiple_solves(fgraph, node):
    """Reuse one LU decomposition across multiple Solves of the same `A`.

    Non-eager variant: `_split_lu_solve_steps` only rewrites when the
    decomposition is actually reused (several solve clients, or a broadcast
    `A` within a single Blockwise).
    """
    replacements = _split_lu_solve_steps(fgraph, node, eager=False)
    return replacements
126+
127+
128+
@node_rewriter([Blockwise])
def eager_split_lu_solve_steps(fgraph, node):
    """Eager variant of `reuse_lu_decomposition_multiple_solves`.

    Splits the decomposition and solve steps even for a single Solve client.
    Used by the Scan rewrite so the decomposition can be pushed out of loops.
    """
    replacements = _split_lu_solve_steps(fgraph, node, eager=True)
    return replacements
131+
132+
133+
@node_rewriter([Scan])
def scan_split_non_sequence_lu_decomposition_solve(fgraph, node):
    """If the A of a Solve within a Scan is a function of non-sequences, split the LU decomposition step.

    The LU decomposition step can then be pushed out of the inner loop by the `scan_pushout_non_sequences` rewrite.

    Returns the outputs of a new Scan op (with a rewritten inner fgraph) when
    at least one inner Solve was split, or ``None`` when nothing changed.
    """
    scan_op: Scan = node.op
    non_sequences = set(scan_op.inner_non_seqs(scan_op.inner_inputs))
    new_scan_fgraph = scan_op.fgraph

    changed = False
    while True:
        # Re-toposort after every successful split, since the rewrite
        # replaces nodes in the inner fgraph.
        for inner_node in new_scan_fgraph.toposort():
            if (
                isinstance(inner_node.op, Blockwise)
                and isinstance(inner_node.op.core_op, Solve)
                and inner_node.op.core_op.assume_a in _SPLITTABLE_SOLVE_ASSUME_A
            ):
                A, b = inner_node.inputs
                # Only split when A depends exclusively on constants and
                # non-sequences, so the decomposition is loop-invariant.
                if all(
                    (isinstance(root_inp, Constant) or (root_inp in non_sequences))
                    for root_inp in graph_inputs([A])
                ):
                    if new_scan_fgraph is scan_op.fgraph:
                        # Clone the first time to avoid mutating the original fgraph
                        new_scan_fgraph, equiv = new_scan_fgraph.clone_get_equiv()
                        non_sequences = {equiv[non_seq] for non_seq in non_sequences}
                        inner_node = equiv[inner_node]

                    replace_dict = eager_split_lu_solve_steps.transform(
                        new_scan_fgraph, inner_node
                    )
                    assert (
                        isinstance(replace_dict, dict) and len(replace_dict) > 0
                    ), "Rewrite failed"
                    new_scan_fgraph.replace_all(replace_dict.items())
                    changed = True
                    break  # Break to start over with a fresh toposort
        else:  # no_break
            break  # Nothing else changed

    if not changed:
        return

    # Return a new scan to indicate that a rewrite was done
    new_scan_op = copy(scan_op)
    new_scan_op.fgraph = new_scan_fgraph
    new_outs = new_scan_op.make_node(*node.inputs).outputs
    copy_stack_trace(node.outputs, new_outs)
    return new_outs
183+
184+
185+
# Register at position=2, i.e. before the scan_pushout rewrites
# (scan_push_out_non_seq at position 3 onwards), so the loop-invariant
# LU decomposition split happens first and can then be pushed out.
scan_seqopt1.register(
    scan_split_non_sequence_lu_decomposition_solve.__name__,
    in2out(scan_split_non_sequence_lu_decomposition_solve, ignore_newtrees=True),
    "fast_run",
    "scan",
    "scan_pushout",
    position=2,
)

pytensor/tensor/rewriting/linalg.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ def is_matrix_transpose(x: TensorVariable) -> bool:
7575
if ndims < 2:
7676
return False
7777
transpose_order = (*range(ndims - 2), ndims - 1, ndims - 2)
78+
79+
# Allow expand_dims on the left of the transpose
80+
if (diff := len(transpose_order) - len(node.op.new_order)) > 0:
81+
transpose_order = (
82+
*(["x"] * diff),
83+
*transpose_order,
84+
)
7885
return node.op.new_order == transpose_order
7986
return False
8087

tests/tensor/linalg/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)