Commit 9299e7b

Add rewrite for Blockwise with Alloc inputs
Also prevent Alloc from being constant-folded when it is used by an Elemwise or Blockwise, to avoid creating uselessly large arrays.
1 parent 7e8e2f6 commit 9299e7b
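
In rough terms, the new rewrite performs a transformation like the following (an illustrative sketch of my own, not taken from the commit; the variable names and shapes are made up):

import pytensor.tensor as pt

x = pt.tensor("x", shape=(5, 3, 7))  # batched input: one batch dim of size 5
y = pt.tensor("y", shape=(7, 2))     # purely core input

# pt.alloc(y, 5, 7, 2) materializes a batched copy of y, so the Blockwise
# matmul (gufunc signature (m,k),(k,n)->(m,n)) receives an Alloc input.
out = x @ pt.alloc(y, 5, 7, 2)

# After the rewrite, the graph computes x @ y directly: matmul broadcasts
# the core y on its own, and the (5, 7, 2) copy of y is never materialized.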

File tree: 2 files changed (+97, -1 lines)


pytensor/tensor/basic.py

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,7 @@
     as_tensor_variable,
     get_vector_length,
 )
+from pytensor.tensor.blockwise import Blockwise
 from pytensor.tensor.elemwise import DimShuffle, Elemwise, scalar_elemwise
 from pytensor.tensor.exceptions import NotScalarConstantError
 from pytensor.tensor.shape import (
@@ -1663,6 +1664,9 @@ def do_constant_folding(self, fgraph, node):
                 # If the output is a constant, it will have to be deepcopied
                 # each time the function is called. So we do not fold.
                 return False
+            # Allow alloc to be lifted out of Elemwise and Blockwise, before constant folding it
+            elif isinstance(client[0].op, (Elemwise, Blockwise)):
+                return None
             elif (
                 # The following ops work inplace of their input id 0.
                 client[1] == 0
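
Why the constant-folding change matters (again a sketch of my own, not code from the commit): eagerly folding an Alloc that feeds an Elemwise would bake a large constant array into the graph, even though the Elemwise can broadcast the underlying value by itself.

import pytensor.tensor as pt

x = pt.vector("x")

# pt.zeros(100_000) builds Alloc(0.0, 100_000). If it were constant-folded
# here, the compiled graph would carry a 100_000-element constant array.
# With the change above, do_constant_folding returns None (falsy, so no
# folding) whenever the client is an Elemwise or Blockwise, leaving the
# Alloc symbolic so rewrites can lift it out of the addition first.
out = x + pt.zeros(100_000)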

pytensor/tensor/rewriting/blockwise.py

Lines changed: 93 additions & 1 deletion
@@ -2,8 +2,9 @@
 from pytensor.graph import node_rewriter
 from pytensor.graph.replace import vectorize_node
 from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
-from pytensor.tensor.basic import Alloc, ARange, shape_padleft
+from pytensor.tensor.basic import Alloc, ARange, alloc, shape_padleft
 from pytensor.tensor.blockwise import Blockwise
+from pytensor.tensor.extra_ops import broadcast_shape_iter
 from pytensor.tensor.math import _matrix_matrix_matmul
 from pytensor.tensor.rewriting.basic import (
     register_canonicalize,
@@ -75,3 +76,94 @@ def local_eager_useless_unbatched_blockwise(fgraph, node):
         )
     ):
         return local_useless_unbatched_blockwise.fn(fgraph, node)
+
+
+@register_specialize("shape_unsafe")
+@node_rewriter([Blockwise])
+def local_blockwise_alloc(fgraph, node):
+    """Push Allocs from the inputs to the output of Blockwise Ops."""
+
+    op: Blockwise = node.op  # type: ignore
+
+    batch_ndim = node.inputs[0].type.ndim - len(op.inputs_sig[0])
+    batch_axes = tuple(range(batch_ndim))
+    new_inputs = []
+    batch_shapes = []
+    can_lift_alloc = False
+    for inp, inp_sig in zip(node.inputs, op.inputs_sig):
+        if all(inp.type.broadcastable[:batch_ndim]):
+            # The input only has dummy batch dims (if it has any)
+            inp = inp.squeeze(batch_axes)
+            new_inputs.append(inp)
+            continue
+
+        core_ndim = len(inp_sig)
+        if inp.owner and isinstance(inp.owner.op, Alloc):
+            value, *shape = inp.owner.inputs
+            value_ndim = value.type.ndim
+            value_batch_ndim = value_ndim - core_ndim
+            if value_batch_ndim:
+                # The original value already has batch dims, let's see if it's just dummy ones
+                if all(value.type.broadcastable[:value_batch_ndim]):
+                    value = value.squeeze(axis=tuple(range(value_batch_ndim)))
+                else:
+                    # The original value has batch dims that are not dummy
+                    # We cannot lift this Alloc
+                    new_inputs.append(inp)
+                    continue
+
+            alloc_ndim = len(shape)
+            if alloc_ndim > core_ndim:
+                # The Alloc adds all the batch dims
+                batch_shape = shape[:batch_ndim]
+                core_shape = shape[batch_ndim:]
+                if any(value.type.broadcastable[batch_ndim:]):
+                    # We still need an Alloc for the core dims
+                    value = alloc(value, *core_shape)
+                new_inputs.append(value)
+                batch_shapes.append(
+                    [
+                        dim if not bcast else 1
+                        for dim, bcast in zip(
+                            batch_shape, inp.type.broadcastable[:batch_ndim]
+                        )
+                    ]
+                )
+                can_lift_alloc = True
+                continue
+
+        # Nothing to do with this input
+        new_inputs.append(inp)
+
+    if not can_lift_alloc:
+        return None
+
+    new_outs = node.op.make_node(*new_inputs).outputs
+
+    # Pushed Allocs are still needed
+    if new_outs[0].type.broadcastable != node.outputs[0].type.broadcastable:
+        out = new_outs[0]
+        batch_ndim = out.type.ndim - len(op.outputs_sig[0])
+        if batch_ndim:
+            # The new output already has batch dims, we need to consider this when broadcasting
+            bcast_shape = tuple(out.shape)[:batch_ndim]
+            batch_shapes.append(
+                [
+                    dim if not bcast else 1
+                    for dim, bcast in zip(
+                        bcast_shape, out.type.broadcastable[:batch_ndim]
+                    )
+                ]
+            )
+        if len(batch_shapes) == 1:
+            [batch_shape] = batch_shapes
+        else:
+            batch_shape = broadcast_shape_iter(batch_shapes, arrays_are_shapes=True)
+        core_shapes = [out.shape[batch_ndim:] for out in new_outs]
+        new_outs = [
+            alloc(new_out, *batch_shape, *core_shape)
+            for new_out, core_shape in zip(new_outs, core_shapes)
+        ]
+
+    copy_stack_trace(node.outputs, new_outs)
+    return new_outs
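
One way to see the new rewrite fire (hypothetical usage added for illustration; the compiled function and shapes are not part of the commit): compile a graph whose Blockwise input is an Alloc and print the optimized graph.

import pytensor
import pytensor.tensor as pt

x = pt.tensor("x", shape=(5, 3, 7))
y = pt.tensor("y", shape=(7, 2))

# The Alloc only adds a batch dimension that the batched matmul can
# broadcast on its own.
out = x @ pt.alloc(y, 5, 7, 2)

fn = pytensor.function([x, y], out)

# With local_blockwise_alloc registered under "specialize" ("shape_unsafe"),
# the optimized graph should feed y to the matmul directly, with no
# (5, 7, 2) Alloc left in between.
pytensor.dprint(fn)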
