[Codegen] Use GPUPadLayoutAttr to resolve layouts. (#20565)

hanhanW · web-flow · commit e5ca8aebe5f9 · 2025-04-21T16:59:29.000-07:00
Instead of using the nop layout resolver, the revision explicitly uses the GPUPadLayoutAttr to resolve all the layouts. If the resolver is present in an executable target, it is prioritized. The revision exposes a "hidden" dependency between padding layout resolver and nop layout resolver. Because the current type converter is not decoupled enough from data-tiling usage. It is a step towards #20160 --------- Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -189,6 +189,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
+        "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
         "//compiler/src/iree/compiler/Codegen/Interfaces:BufferizationInterfaces",
         "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface",
diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -224,6 +224,7 @@ iree_cc_library(
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
     iree::compiler::Codegen::Dialect::Codegen::Utils
     iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
+    iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
     iree::compiler::Codegen::Interfaces::BufferizationInterfaces
     iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPadding.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPadding.cpp
@@ -10,11 +10,15 @@
 #include "iree/compiler/Codegen/Common/Passes.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowTypes.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -23,6 +27,8 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 
+#define DEBUG_TYPE "iree-codegen-materialize-encoding-into-padding"
+
 namespace mlir::iree_compiler {
 
 #define GEN_PASS_DEF_MATERIALIZEENCODINGINTOPADDINGPASS
@@ -34,24 +40,36 @@ namespace {
 
 // Returns the pad encoding layout, or nullptr if this is not the only layout or
 // if there's no encoding at all.
-static PadEncodingLayoutAttr getPadLayout(RankedTensorType type) {
-  auto encoding =
-      dyn_cast_or_null<IREE::Encoding::LayoutAttr>(type.getEncoding());
-  if (!encoding) {
+static PadEncodingLayoutAttr getPadLayout(Attribute layoutAttr,
+                                          RankedTensorType type) {
+  if (!type.getEncoding()) {
     return nullptr;
   }
-  ArrayAttr layouts = encoding.getLayouts();
-  if (!layouts || layouts.size() != 1) {
-    return nullptr;
+  auto encoding =
+      dyn_cast_or_null<IREE::Encoding::LayoutAttr>(type.getEncoding());
+  if (encoding) {
+    ArrayAttr layouts = encoding.getLayouts();
+    if (layouts.size() != 1) {
+      return nullptr;
+    }
+    return dyn_cast<PadEncodingLayoutAttr>(*layouts.begin());
   }
-
-  return dyn_cast<PadEncodingLayoutAttr>(*layouts.begin());
+  Attribute resolvedEncoding =
+      cast<IREE::Encoding::EncodingLayoutResolverAttrInterface>(layoutAttr)
+          .getLayout(type);
+  LLVM_DEBUG({
+    llvm::dbgs() << "Unresolved type: " << type << "\n";
+    llvm::dbgs() << "layoutAttr: " << layoutAttr << "\n";
+    llvm::dbgs() << "Resolved into: " << resolvedEncoding << "\n";
+  });
+  return dyn_cast<PadEncodingLayoutAttr>(resolvedEncoding);
 }
 
 // Returns a padded tensor type (without encoding) for tensor types with the pad
 // encoding layout, or the same type for all other tensors.
-static RankedTensorType getPaddedType(RankedTensorType type) {
-  PadEncodingLayoutAttr layout = getPadLayout(type);
+static RankedTensorType getPaddedType(Attribute layoutAttr,
+                                      RankedTensorType type) {
+  PadEncodingLayoutAttr layout = getPadLayout(layoutAttr, type);
   if (!isNonZeroPadding(layout)) {
     return type.dropEncoding();
   }
@@ -67,15 +85,11 @@ static RankedTensorType getPaddedType(RankedTensorType type) {
   return RankedTensorType::get(newShape, type.getElementType());
 }
 
-static bool hasNonZeroPadding(RankedTensorType type) {
-  return isNonZeroPadding(getPadLayout(type));
-}
-
 struct MaterializePadEncodingTypeConverter final
     : MaterializeEncodingTypeConverter {
-  MaterializePadEncodingTypeConverter(MLIRContext *ctx)
-      : MaterializeEncodingTypeConverter(
-            IREE::Codegen::EncodingNopLayoutAttr::get(ctx)) {
+  MaterializePadEncodingTypeConverter(
+      IREE::Codegen::LayoutAttrInterface layoutAttr)
+      : MaterializeEncodingTypeConverter(layoutAttr) {
     addConversion([](RankedTensorType type) -> std::optional<RankedTensorType> {
       // The type converter is designed for `pad_encoding_layout` encoding
       // attribute. By the definition, the final converted type is the same
@@ -85,18 +99,23 @@ struct MaterializePadEncodingTypeConverter final
     addConversion([&](IREE::Flow::DispatchTensorType dispatchTensorType)
                       -> IREE::Flow::DispatchTensorType {
       auto type = dyn_cast<RankedTensorType>(dispatchTensorType.getBoundType());
-      if (!type) {
+      if (!type || !type.getEncoding()) {
         return dispatchTensorType;
       }
       // The incoming bindings have the padded type, if `pad_encoding_layout` is
       // present.
-      if (getPadLayout(type)) {
-        type = getPaddedType(type);
+      if (getPadLayout(getLayoutAttr(), type)) {
+        type = getPaddedType(getLayoutAttr(), type);
       }
       return IREE::Flow::DispatchTensorType::get(dispatchTensorType.getAccess(),
                                                  type);
     });
   }
+
+  bool hasNonZeroPadding(RankedTensorType type) const {
+    PadEncodingLayoutAttr layout = getPadLayout(getLayoutAttr(), type);
+    return layout && !layout.isIdentityLayout();
+  }
 };
 
 /// Pattern to convert `flow.dispatch.tensor.load` operation when
@@ -116,15 +135,15 @@ struct MaterializeFlowDispatchTensorLoadOp final
       return rewriter.notifyMatchFailure(loadOp, "unhandled partial loads");
     }
 
+    auto &typeConverter =
+        *getTypeConverter<MaterializePadEncodingTypeConverter>();
     IREE::Flow::DispatchTensorType sourceType = loadOp.getSourceType();
     auto boundTensorType = cast<RankedTensorType>(sourceType.getBoundType());
-    if (!hasNonZeroPadding(boundTensorType)) {
+    if (!typeConverter.hasNonZeroPadding(boundTensorType)) {
       // Let the Nop pattern handle this.
       return rewriter.notifyMatchFailure(loadOp, "no padding applied");
     }
 
-    auto &typeConverter =
-        *getTypeConverter<MaterializePadEncodingTypeConverter>();
     auto paddedType =
         typeConverter.convertType<RankedTensorType>(boundTensorType);
     assert(paddedType != boundTensorType && "Expected conversion with padding");
@@ -171,15 +190,15 @@ struct MaterializeFlowDispatchTensorStoreOp final
       return rewriter.notifyMatchFailure(storeOp, "unhandled partial stores");
     }
 
+    auto &typeConverter =
+        *getTypeConverter<MaterializePadEncodingTypeConverter>();
     IREE::Flow::DispatchTensorType targetType = storeOp.getTargetType();
     auto boundTensorType = cast<RankedTensorType>(targetType.getBoundType());
-    if (!hasNonZeroPadding(boundTensorType)) {
+    if (!typeConverter.hasNonZeroPadding(boundTensorType)) {
       // Let the Nop pattern handle this.
       return rewriter.notifyMatchFailure(storeOp, "no padding applied");
     }
 
-    auto &typeConverter =
-        *getTypeConverter<MaterializePadEncodingTypeConverter>();
     IREE::Flow::DispatchTensorType newTargetType =
         typeConverter.convertType<IREE::Flow::DispatchTensorType>(targetType);
     RankedTensorType paddedType = newTargetType.asRankedTensorType();
@@ -245,8 +264,9 @@ struct MaterializeEncodingIntoPaddingPass final
     : impl::MaterializeEncodingIntoPaddingPassBase<
           MaterializeEncodingIntoPaddingPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, tensor::TensorDialect,
-                    IREE::Codegen::IREECodegenDialect>();
+    registry.insert<arith::ArithDialect, linalg::LinalgDialect,
+                    tensor::TensorDialect, IREE::Codegen::IREECodegenDialect,
+                    IREE::GPU::IREEGPUDialect>();
   }
 
   void runOnOperation() override {
@@ -259,8 +279,43 @@ struct MaterializeEncodingIntoPaddingPass final
       return failure();
     };
 
+    // Retrieve the config from executable target attribute, if any. Otherwise,
+    // retrieve the config from CLI GPU target and construct a virtual
+    // configuration.
+    auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(operation);
+    DictionaryAttr targetConfig;
+    if (targetAttr) {
+      targetConfig = targetAttr.getConfiguration();
+    } else {
+      IREE::GPU::TargetAttr gpuTargetAttr = getCLGPUTarget(context);
+      SmallVector<NamedAttribute> items;
+      items.emplace_back(
+          IREE::Encoding::kEncodingResolverAttrName,
+          IREE::GPU::getHIPTargetEncodingLayoutAttr(gpuTargetAttr, "pad"));
+      targetConfig = DictionaryAttr::get(context, items);
+    }
+
+    // The layoutAttr should come in without any target info attached to it,
+    // so we need to clone the layout attrs with the configuration so it can
+    // access the target info during materialization.
+    //
+    // Otherwise, fall back to the nop layout.
+    IREE::Codegen::LayoutAttrInterface layoutAttr;
+    if (targetConfig &&
+        targetConfig.contains(IREE::Encoding::kEncodingResolverAttrName)) {
+      layoutAttr = targetConfig.getAs<IREE::Codegen::LayoutAttrInterface>(
+          IREE::Encoding::kEncodingResolverAttrName);
+      auto resolverAttr =
+          cast<IREE::Encoding::EncodingLayoutResolverAttrInterface>(layoutAttr);
+      layoutAttr = cast<IREE::Codegen::LayoutAttrInterface>(
+          resolverAttr.cloneWithSimplifiedConfig(targetConfig));
+    } else {
+      layoutAttr = cast<IREE::Codegen::LayoutAttrInterface>(
+          IREE::Codegen::EncodingNopLayoutAttr::get(context));
+    }
+
     RewritePatternSet materializeEncodingPattern(context);
-    MaterializePadEncodingTypeConverter typeConverter(context);
+    MaterializePadEncodingTypeConverter typeConverter(layoutAttr);
     MaterializeEncodingConversionTarget target(*context);
     populateMaterializeEncodingPatterns(materializeEncodingPattern, target,
                                         typeConverter,
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_padding.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_padding.mlir
@@ -1,6 +1,94 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-materialize-encoding-into-padding))" \
+// RUN:   --iree-gpu-test-target=gfx942 \
 // RUN:   --split-input-file %s | FileCheck %s
 
+#binding_ro = #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">
+#binding = #hal.pipeline.binding<storage_buffer, Indirect>
+#encoding = #iree_encoding.matmul_k<k_dims = [1]>
+func.func @set_encoding_and_store_with_unresolved_encodings() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#binding_ro, #binding], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#binding_ro, #binding], flags = Indirect>) binding(0) alignment(64) offset(%1) flags("ReadOnly|Indirect")
+    : !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#binding_ro, #binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect)
+    : !flow.dispatch.tensor<writeonly:tensor<2048x2048xf16, #encoding>>
+  %5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+    : !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>> -> tensor<2048x2048xf16>
+  %6 = iree_encoding.set_encoding %5 : tensor<2048x2048xf16> -> tensor<2048x2048xf16, #encoding>
+  flow.dispatch.tensor.store %6, %4, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+    : tensor<2048x2048xf16, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xf16, #encoding>>
+  return
+}
+// CHECK-LABEL: @set_encoding_and_store_with_unresolved_encodings
+// CHECK:         %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-SAME:                  !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>>
+// CHECK:         %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-SAME:                  !flow.dispatch.tensor<writeonly:tensor<2048x2112xf16>>
+// CHECK:         %[[LD:.+]] = flow.dispatch.tensor.load %[[A]], offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+// CHECK-SAME:                  !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>> -> tensor<2048x2048xf16>
+// CHECK:         flow.dispatch.tensor.store %[[LD]], %[[B]], offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+// CHECK-SAME:                  tensor<2048x2048xf16> -> !flow.dispatch.tensor<writeonly:tensor<2048x2112xf16>>
+
+// -----
+
+// The test is as the same as the
+// set_encoding_and_store_with_unresolved_encodings test, but it gets the
+// encoding resolver from executable target.
+
+#binding_ro = #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">
+#binding = #hal.pipeline.binding<storage_buffer, Indirect>
+#encoding = #iree_encoding.matmul_k<k_dims = [1]>
+#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+  {
+    abi = "hip",
+    iree.encoding.resolver = #iree_gpu.gpu_pad_layout<>,
+    iree.gpu.target = #iree_gpu.target<arch = "gfx942",
+                                       features = "",
+                                       wgp = <compute = fp32,
+                                              storage =  b32,
+                                              subgroup =  none,
+                                              dot =  none,
+                                              mma = [<MFMA_F32_16x16x4_F32>],
+                                              subgroup_size_choices = [64],
+                                              max_workgroup_sizes = [1024, 1024, 1024],
+                                              max_thread_count_per_workgroup = 1024,
+                                              max_workgroup_memory_bytes = 65536,
+                                              max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+                                              max_load_instruction_bits = 128,
+                                              simds_per_wgp = 4,
+                                              vgpr_space_bits = 16384>>
+  }>
+func.func @set_encoding_and_store_with_unresolved_encodings_from_executable() attributes {
+  hal.executable.target = #executable_target
+} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#binding_ro, #binding], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#binding_ro, #binding], flags = Indirect>) binding(0) alignment(64) offset(%1) flags("ReadOnly|Indirect")
+    : !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#binding_ro, #binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect)
+    : !flow.dispatch.tensor<writeonly:tensor<2048x2048xf16, #encoding>>
+  %5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+    : !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>> -> tensor<2048x2048xf16>
+  %6 = iree_encoding.set_encoding %5 : tensor<2048x2048xf16> -> tensor<2048x2048xf16, #encoding>
+  flow.dispatch.tensor.store %6, %4, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+    : tensor<2048x2048xf16, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xf16, #encoding>>
+  return
+}
+// CHECK-LABEL: @set_encoding_and_store_with_unresolved_encodings_from_executable
+// CHECK:         %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-SAME:                  !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>>
+// CHECK:         %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-SAME:                  !flow.dispatch.tensor<writeonly:tensor<2048x2112xf16>>
+// CHECK:         %[[LD:.+]] = flow.dispatch.tensor.load %[[A]], offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+// CHECK-SAME:                  !flow.dispatch.tensor<readonly:tensor<2048x2048xf16>> -> tensor<2048x2048xf16>
+// CHECK:         flow.dispatch.tensor.store %[[LD]], %[[B]], offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1]
+// CHECK-SAME:                  tensor<2048x2048xf16> -> !flow.dispatch.tensor<writeonly:tensor<2048x2112xf16>>
+
+
+// -----
+
 #binding_ro = #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">
 #binding = #hal.pipeline.binding<storage_buffer, Indirect>
 #encoding_mmt = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f16, f16, f16]>
diff --git a/compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp b/compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp
@@ -406,6 +406,27 @@ struct GPUHostEncodingLayoutResolverAttrInterface final
   }
 };
 
+struct GPUPadDeviceEncodingLayoutAttrInterface final
+    : Codegen::LayoutAttrInterface::ExternalModel<
+          GPUPadDeviceEncodingLayoutAttrInterface, GPUPadLayoutAttr> {
+
+  // TODO(#20160): Do not implement the interface method because it is
+  // data-tiling specific. It is a workaround to reuse encoding materialization
+  // patterns, because we query types from the method in the conversion. We
+  // should really move them to interface methods, then we can delete the
+  // workaround.
+  MaterializeEncodingInfo getEncodingInfo(Attribute attr,
+                                          RankedTensorType type) const {
+    return MaterializeEncodingInfo{};
+  }
+
+  Operation *lowerOp(Attribute attr, OpBuilder &b, Operation *op,
+                     TypeRange convertedResTypes,
+                     ValueRange convertedOperands) const {
+    return clone(b, op, convertedResTypes, convertedOperands);
+  }
+};
+
 struct GPUPadEncodingLayoutResolverAttrInterface final
     : Encoding::EncodingLayoutResolverAttrInterface::ExternalModel<
           GPUPadEncodingLayoutResolverAttrInterface, GPUPadLayoutAttr> {
@@ -416,7 +437,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
     std::optional<IREE::GPU::L1CacheInfo> cache =
         IREE::GPU::getL1CacheInfo(gpuTarget);
     if (!cache) {
-      return IREE::Encoding::IdentityEncodingAttr::get(ctx);
+      return IREE::Codegen::EncodingNopLayoutAttr::get(ctx);
     }
     return GPUPadLayoutAttr::get(ctx, cache->cacheLineBytes, cache->cacheSets);
   }
@@ -535,6 +556,7 @@ void registerGPUEncodingExternalModels(DialectRegistry &registry) {
             GPUHostEncodingLayoutResolverAttrInterface,
             GPUHostSerializableEncodingAttrInterface>(*ctx);
         IREE::GPU::GPUPadLayoutAttr::attachInterface<
+            GPUPadDeviceEncodingLayoutAttrInterface,
             GPUPadEncodingLayoutResolverAttrInterface>(*ctx);
       });
 }