Commit d6541fc

[mlir][tensor] Fold padding expand_shape into insert_slice (#93018)

1 parent fa63771
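
In effect, the new pattern lets insert_slice consume the source of an expand_shape that only pads with static unit dimensions. A minimal before/after sketch, adapted from the tests below (value names are illustrative):

  // Before: the expand_shape only adds static size-1 dimensions.
  %e = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
      : tensor<?x?xf32> into tensor<1x?x1x?xf32>
  %r = tensor.insert_slice %e into %d[%x, %y, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
      : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>

  // After: the unit dimensions are already implied by the slice sizes, so the
  // lower-rank source is inserted directly (a rank-reducing insert_slice).
  %r = tensor.insert_slice %t into %d[%x, %y, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
      : tensor<?x?xf32> into tensor<?x?x?x?xf32>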

File tree

2 files changed: +133, -1 lines

mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp

Lines changed: 31 additions & 1 deletion
@@ -79,12 +79,42 @@ struct FoldInsertOfRankReducingInsert : public OpRewritePattern<OpTy> {
     return success();
   }
 };
+
+/// Fold expand_shape which only adds static dimensions of size `1`
+/// into insert_slice.
+template <typename OpTy>
+struct FoldPaddingExpandIntoInsert : public OpRewritePattern<OpTy> {
+  using OpRewritePattern<OpTy>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(OpTy insertSliceOp,
+                                PatternRewriter &rewriter) const override {
+    auto expandShapeOp = insertSliceOp.getSource()
+                             .template getDefiningOp<tensor::ExpandShapeOp>();
+    if (!expandShapeOp)
+      return failure();
+
+    // Only fold away simple expansion where all added dimensions have static
+    // size `1`.
+    SliceVerificationResult res = isRankReducedType(
+        expandShapeOp.getResultType(), expandShapeOp.getSrcType());
+    if (res != SliceVerificationResult::Success)
+      return rewriter.notifyMatchFailure(insertSliceOp,
+                                         "expected rank increasing expansion");
+
+    rewriter.modifyOpInPlace(insertSliceOp, [&]() {
+      insertSliceOp.getSourceMutable().assign(expandShapeOp.getSrc());
+    });
+    return success();
+  }
+};
 } // namespace
 
 void mlir::tensor::populateReassociativeReshapeFoldingPatterns(
     RewritePatternSet &patterns) {
   patterns.add<FoldExpandOfRankReducingExtract,
                FoldInsertOfRankReducingInsert<tensor::InsertSliceOp>,
-               FoldInsertOfRankReducingInsert<tensor::ParallelInsertSliceOp>>(
+               FoldInsertOfRankReducingInsert<tensor::ParallelInsertSliceOp>,
+               FoldPaddingExpandIntoInsert<tensor::InsertSliceOp>,
+               FoldPaddingExpandIntoInsert<tensor::ParallelInsertSliceOp>>(
       patterns.getContext());
 }
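
These patterns are exposed only through the populate function above; a caller such as a test pass would typically run them with MLIR's greedy rewrite driver. A minimal sketch, assuming it lives inside an OperationPass (the pass boilerplate and includes are assumptions, not part of this commit):

  // Assumed includes (not shown in this diff):
  //   #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
  //   #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
  void runOnOperation() /* override, inside a hypothetical OperationPass */ {
    Operation *op = getOperation();
    RewritePatternSet patterns(op->getContext());
    // Registers FoldExpandOfRankReducingExtract, FoldInsertOfRankReducingInsert,
    // and the new FoldPaddingExpandIntoInsert patterns.
    mlir::tensor::populateReassociativeReshapeFoldingPatterns(patterns);
    // Apply the patterns to a fixed point; this fails only if the rewrite
    // does not converge.
    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
      signalPassFailure();
  }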

mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir

Lines changed: 102 additions & 0 deletions
@@ -54,3 +54,105 @@ func.func @rank_reducing_parallel_insert_of_collapse_shape(
   }
   return %1 : tensor<?x?x?x?xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @insert_of_padding_expand_shape(
+// CHECK-SAME:  %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME:  %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:  %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:  %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK:       %[[insert:.*]] = tensor.insert_slice %[[t]] into %[[d]][%[[x]], %[[y]], 0, 0] [1, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
+// CHECK:       return %[[insert]]
+func.func @insert_of_padding_expand_shape(
+    %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index)
+    -> tensor<?x?x?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+  %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+  %0 = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
+      : tensor<?x?xf32> into tensor<1x?x1x?xf32>
+  %1 = tensor.insert_slice %0 into %d[%x, %y, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
+      : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>
+  return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_of_non_padding_expand_shape(
+// CHECK-SAME:  %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME:  %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:  %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:  %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:  %[[sz:[a-zA-Z0-9_]+]]: index
+// CHECK:       %[[expand:.*]] = tensor.expand_shape %[[t]] {{\[}}[0, 1], [2]] output_shape [%[[sz]], %{{.*}}, %{{.*}}] : tensor<?x?xf32> into tensor<?x?x?xf32>
+// CHECK:       %[[insert:.*]] = tensor.insert_slice %[[expand]] into %[[d]][%[[x]], %[[y]], 0, 0] [%[[sz]], 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+// CHECK:       return %[[insert]]
+func.func @insert_of_non_padding_expand_shape(
+    %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index, %sz: index)
+    -> tensor<?x?x?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+  %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+  %0 = tensor.expand_shape %t [[0, 1], [2]] output_shape [%sz, %sz0, %sz1]
+      : tensor<?x?xf32> into tensor<?x?x?xf32>
+  %1 = tensor.insert_slice %0 into %d[%x, %y, 0, 0][%sz, 1, %sz0, %sz1][1, 1, 1, 1]
+      : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+  return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_insert_of_padding_expand_shape(
+// CHECK-SAME:  %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME:  %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:  %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:  %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK:       tensor.parallel_insert_slice %[[t]] into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [1, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
+func.func @parallel_insert_of_padding_expand_shape(
+    %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index)
+    -> tensor<?x?x?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+  %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+  %0 = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
+      : tensor<?x?xf32> into tensor<1x?x1x?xf32>
+  %1 = scf.forall (%i, %j) in (%x, %y) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %0 into %o[%i, %j, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
+        : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>
+    }
+  }
+  return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_insert_of_non_padding_expand_shape(
+// CHECK-SAME:  %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME:  %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:  %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:  %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:  %[[sz:[a-zA-Z0-9_]+]]: index
+// CHECK:       %[[expand:.*]] = tensor.expand_shape %[[t]] {{\[}}[0, 1], [2]] output_shape [%[[sz]], %{{.*}}, %{{.*}}] : tensor<?x?xf32> into tensor<?x?x?xf32>
+// CHECK:       tensor.parallel_insert_slice %[[expand]] into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [%[[sz]], 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+func.func @parallel_insert_of_non_padding_expand_shape(
+    %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index, %sz: index)
+    -> tensor<?x?x?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+  %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+  %0 = tensor.expand_shape %t [[0, 1], [2]] output_shape [%sz, %sz0, %sz1]
+      : tensor<?x?xf32> into tensor<?x?x?xf32>
+  %1 = scf.forall (%i, %j) in (%x, %y) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %0 into %o[%i, %j, 0, 0][%sz, 1, %sz0, %sz1][1, 1, 1, 1]
+        : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+    }
+  }
+  return %1 : tensor<?x?x?x?xf32>
+}
