@@ -863,3 +863,51 @@ util.func public @multi_reduction(%arg0 : tensor<32x16x16384xf32>, %arg1 : tenso
863863// CHECK: %[[GEN2:.+]] = linalg.generic
864864// CHECK-SAME: ins(%[[GEN1]] : tensor<32xf32>)
865865// CHECK: flow.return %[[GEN2]]

// -----

// Tests that a dispatch region containing only a linalg.fill has the
// 4-D operand collapsed to 1-D (11*470*725*224 = 839608000 elements).
util.func public @collapse_single_fill(%arg0: tensor<11x470x725x224xf32>) -> tensor<11x470x725x224xf32> {
  %0 = flow.dispatch.region -> (tensor<11x470x725x224xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %1 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<11x470x725x224xf32>) -> tensor<11x470x725x224xf32>
    flow.return %1 : tensor<11x470x725x224xf32>
  }
  util.return %0 : tensor<11x470x725x224xf32>
}
// CHECK-LABEL: util.func public @collapse_single_fill
//  CHECK-SAME:   %[[ARG0:[0-9a-zA-Z]+]]
//   CHECK-DAG:   %[[COLLAPSE0:.+]] = tensor.collapse_shape %[[ARG0]]
//       CHECK:   flow.dispatch.region
//       CHECK:   %[[FILL:.+]] = linalg.fill
//  CHECK-SAME:     outs(%[[COLLAPSE0]] : tensor<839608000xf32>)
//       CHECK:   flow.return %[[FILL]]


// -----

// Tests that a linalg.fill feeding a contraction-like linalg.generic inside a
// dispatch region has its parallel dims (d0, d1, d2) collapsed together with
// the generic's operands (11*470*725 = 3748250), while the reduction dim (d4)
// and the un-collapsible %arg0 operand are left unchanged.
util.func public @collapse_fill_of_arg(%arg0: tensor<224x32xf32>, %arg1: tensor<11x470x725x224xf32>, %arg2: tensor<11x470x725x32xf32>) -> tensor<11x470x725x224xf32> {
  %0 = flow.dispatch.region -> (tensor<11x470x725x224xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %1 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<11x470x725x224xf32>) -> tensor<11x470x725x224xf32>
    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%arg2, %arg0 : tensor<11x470x725x32xf32>, tensor<224x32xf32>) outs(%1 : tensor<11x470x725x224xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %3 = arith.mulf %in, %in_0 : f32
      %4 = arith.addf %out, %3 : f32
      linalg.yield %4 : f32
    } -> tensor<11x470x725x224xf32>
    flow.return %2 : tensor<11x470x725x224xf32>
  }
  util.return %0 : tensor<11x470x725x224xf32>
}
// CHECK-LABEL: util.func public @collapse_fill_of_arg
//  CHECK-SAME:   %[[ARG0:[0-9a-zA-Z]+]]
//  CHECK-SAME:   %[[ARG1:[0-9a-zA-Z]+]]
//  CHECK-SAME:   %[[ARG2:[0-9a-zA-Z]+]]
//   CHECK-DAG:   %[[COLLAPSE1:.+]] = tensor.collapse_shape %[[ARG1]]
//   CHECK-DAG:   %[[COLLAPSE2:.+]] = tensor.collapse_shape %[[ARG2]]
//       CHECK:   flow.dispatch.region
//       CHECK:   %[[FILL:.+]] = linalg.fill
//  CHECK-SAME:     outs(%[[COLLAPSE1]] : tensor<3748250x224xf32>)
//       CHECK:   %[[GEN0:.+]] = linalg.generic
//  CHECK-SAME:     ins(%[[COLLAPSE2]], %[[ARG0]] : tensor<3748250x32xf32>, tensor<224x32xf32>)
//  CHECK-SAME:     outs(%[[FILL]] : tensor<3748250x224xf32>)
//       CHECK:   flow.return %[[GEN0]]