Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -826,3 +826,154 @@ func.func @copy_swizzle_hint_linearized(%source: tensor<128x16xf32>) -> tensor<1

return %result : tensor<128x16xf32>
}

// -----

// Test: im2col → gather DMA conversion (happy path).
// NHWC layout, 3×3 kernel, stride 1, dilation 1, C=512 on gfx950.
// The im2col should be converted to: collapse_shape + linalg.generic (index
// computation) + gather → then the gather gets converted to coalesced DMA.

#gpu_target_im2col = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
compute = fp32, storage = b32, subgroup = shuffle,
max_load_instruction_bits = 128, subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
dma_sizes = [32, 128]
>>
#exec_target_im2col = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_im2col}>
#translation_im2col = #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse> workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 0, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>

// CHECK-LABEL: func.func @im2col_to_gather_dma
// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<1x16x16x512xf16>
// CHECK-SAME: %[[OUTPUT:[a-zA-Z0-9]+]]: tensor<1x196x512xf16>
func.func @im2col_to_gather_dma(%input: tensor<1x16x16x512xf16>, %output: tensor<1x196x512xf16>) -> tensor<1x196x512xf16>
attributes {hal.executable.target = #exec_target_im2col, translation_info = #translation_im2col} {
// The use_global_load_dma lowering config on the op requests the DMA path;
// the CHECK lines below trace each step of the resulting rewrite.
%result = iree_linalg_ext.im2col
{lowering_config = #iree_gpu.use_global_load_dma}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
offsets = [0, 0, 0]
output_sizes = [[1], [14, 14], [3, 3, 512]]
batch_pos = [0] m_pos = [1, 2] k_pos = [3]
input_k_perm = [0, 1, 2] output_perm = [0, 1, 2]
ins(%input : tensor<1x16x16x512xf16>)
outs(%output : tensor<1x196x512xf16>) -> tensor<1x196x512xf16>

// Step 1: Collapse input [1,16,16,512] → [256,512] (flatten spatial dims).
// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[INPUT]] {{\[}}[0, 1, 2], [3]{{\]}}
// CHECK-SAME: tensor<1x16x16x512xf16> into tensor<256x512xf16>

// Step 2: Compute linearized spatial indices via linalg.generic.
// Each of the 196 output positions (14×14) maps to a row in the 256-row
// collapsed source via: linearize(delinearize(i, [14,14]), [16,16]).
// CHECK: %[[INDICES:.+]] = linalg.generic
// CHECK: %[[IDX:.+]] = linalg.index 0
// CHECK: affine.delinearize_index %[[IDX]] into (14, 14)
// CHECK: affine.linearize_index
// CHECK: linalg.yield
// CHECK: } -> tensor<196xindex>

// Step 3: Collapse output [1,196,512] → [196,512].
// CHECK: %[[COLLAPSED_OUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0, 1], [2]{{\]}}

// Step 4: Warp-level forall distributes 196 batch positions across warps.
// 256 threads / 64 subgroup_size = 4 warps. 196 / 4 = 49 per warp.
// CHECK: scf.forall (%[[WIV0:.+]], %[[WIV1:.+]]) = (0, 0) to (196, 512) step (49, 512)
// CHECK-SAME: shared_outs(%[[WINIT:.+]] = %[[COLLAPSED_OUT]])

// Step 5: Slice indices for this warp's batch positions.
// CHECK: %[[WARP_INDICES:.+]] = tensor.extract_slice %[[INDICES]][%[[WIV0]]] [49] [1]

// Step 6: Lane-level forall (64 lanes) + coalesced gather DMA.
// Each lane reads elementsPerLane contiguous f16 from the collapsed source.
// CHECK: scf.forall (%[[LANE:.+]]) in (64)
// CHECK: scf.forall.in_parallel {
// CHECK: iree_gpu.coalesced_gather_dma %[[COLLAPSED]][%[[WARP_INDICES]]]
// CHECK-SAME: into %{{.+}} lane(%[[LANE]])
// CHECK-SAME: tensor<256x512xf16>, tensor<49xindex>, tensor<49x512xf16>, index
// CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

// CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

// Step 7: Expand result back to [1,196,512].
// CHECK: tensor.expand_shape %{{.+}} {{\[}}[0, 1], [2]{{\]}}

// No im2col or gather should remain.
// CHECK-NOT: iree_linalg_ext.im2col
// CHECK-NOT: iree_linalg_ext.gather

return %result : tensor<1x196x512xf16>
}

// -----

// Negative test: im2col NOT converted when K_tile is too small for DMA
// alignment. With f16, dma_sizes=[32,128], subgroup_size=64:
// min_elements_per_transfer = 64 * (32/16) = 128. K_tile=4 is not aligned.
// The im2col should be downgraded to derived_thread_config.

#gpu_target_im2col_small_k = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
compute = fp32, storage = b32, subgroup = shuffle,
max_load_instruction_bits = 128, subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
dma_sizes = [32, 128]
>>
#exec_target_im2col_small_k = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_im2col_small_k}>
#translation_im2col_small_k = #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse> workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 0, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>

// CHECK-LABEL: func.func @im2col_small_k_no_dma
func.func @im2col_small_k_no_dma(%input: tensor<1x6x6x4xf16>, %output: tensor<1x16x4xf16>) -> tensor<1x16x4xf16>
attributes {hal.executable.target = #exec_target_im2col_small_k, translation_info = #translation_im2col_small_k} {
// Same op shape as the happy-path test above but with K=4 (output_sizes'
// last group is [3, 3, 4]); the DMA request below must be rejected.
%result = iree_linalg_ext.im2col
{lowering_config = #iree_gpu.use_global_load_dma}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
offsets = [0, 0, 0]
output_sizes = [[1], [4, 4], [3, 3, 4]]
batch_pos = [0] m_pos = [1, 2] k_pos = [3]
input_k_perm = [0, 1, 2] output_perm = [0, 1, 2]
ins(%input : tensor<1x6x6x4xf16>)
outs(%output : tensor<1x16x4xf16>) -> tensor<1x16x4xf16>

// K_tile=4 is too small. Im2col remains with derived_thread_config.
// CHECK: iree_linalg_ext.im2col
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-NOT: iree_gpu.coalesced_gather_dma

return %result : tensor<1x16x4xf16>
}

// -----

// Negative test: im2col NOT converted on non-gfx950 target (gfx942).
// gfx942 does not support global load DMA (no dma_sizes field).

#gpu_target_im2col_nogfx950 = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
compute = fp32, storage = b32, subgroup = shuffle,
max_load_instruction_bits = 128, subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]
>>
#exec_target_im2col_nogfx950 = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_im2col_nogfx950}>
#translation_im2col_nogfx950 = #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse> workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 0, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>

// CHECK-LABEL: func.func @im2col_nogfx950_no_dma
func.func @im2col_nogfx950_no_dma(%input: tensor<1x16x16x512xf16>, %output: tensor<1x196x512xf16>) -> tensor<1x196x512xf16>
attributes {hal.executable.target = #exec_target_im2col_nogfx950, translation_info = #translation_im2col_nogfx950} {
// Identical op to the happy-path test; only the target differs (gfx942,
// whose wgp attribute above omits dma_sizes), so no DMA conversion.
%result = iree_linalg_ext.im2col
{lowering_config = #iree_gpu.use_global_load_dma}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
offsets = [0, 0, 0]
output_sizes = [[1], [14, 14], [3, 3, 512]]
batch_pos = [0] m_pos = [1, 2] k_pos = [3]
input_k_perm = [0, 1, 2] output_perm = [0, 1, 2]
ins(%input : tensor<1x16x16x512xf16>)
outs(%output : tensor<1x196x512xf16>) -> tensor<1x196x512xf16>

// Non-gfx950 target. Im2col remains with derived_thread_config.
// CHECK: iree_linalg_ext.im2col
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-NOT: iree_gpu.coalesced_gather_dma

return %result : tensor<1x196x512xf16>
}
Original file line number Diff line number Diff line change
Expand Up @@ -419,11 +419,10 @@ func.func @im2col_producer_dma_downgraded_to_derived(
return %mm : tensor<2x32x256xf32>
}

// Im2col gets derived_thread_config (not use_global_load_dma) because Im2col
// has no DMA lowering path. The non-Im2col operand still gets use_global_load_dma.
// Im2col now gets use_global_load_dma because Im2col has a DMA lowering path.
// CHECK-LABEL: func.func @im2col_producer_dma_downgraded_to_derived
// CHECK: %[[PA:.+]] = iree_linalg_ext.im2col
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
// CHECK: %[[PB:.+]] = linalg.copy
// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
// CHECK: linalg.batch_matmul {{.*}} ins(%[[PA]], %[[PB]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,16 @@ static std::optional<Value> promotionImpl(OpBuilder &builder,
setLoweringConfig(producer, attr);
return operand.get();
}
// Im2colOp has no DMA conversion path in GPUConvertToCoalescedDMA, so
// always use derived_thread_config regardless of the requested attr.
// If the promotion attr requests DMA, pass it through to im2col.
// GPUConvertToCoalescedDMA will convert im2col → gather → DMA.
// Otherwise, fall back to derived_thread_config.
if (isa<IREE::LinalgExt::Im2colOp>(producer.getOperation())) {
setLoweringConfig(producer,
DerivedThreadConfigAttr::get(producer->getContext()));
if (isa<UseGlobalLoadDMAAttr>(attr)) {
setLoweringConfig(producer, attr);
} else {
setLoweringConfig(producer,
DerivedThreadConfigAttr::get(producer->getContext()));
}
return operand.get();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ iree_lit_test_suite(
"pipeline_elementwise_f8ocp.mlir",
"pipeline_igemm_tile_and_fuse.mlir",
"pipeline_igemm_tile_and_fuse_gfx950.mlir",
"pipeline_im2col_dma_gfx950.mlir",
"pipeline_lower_to_llvmgpu.mlir",
"pipeline_scaled_truncation_gfx950.mlir",
"pipeline_tile_and_fuse.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ iree_lit_test_suite(
"pipeline_elementwise_f8ocp.mlir"
"pipeline_igemm_tile_and_fuse.mlir"
"pipeline_igemm_tile_and_fuse_gfx950.mlir"
"pipeline_im2col_dma_gfx950.mlir"
"pipeline_lower_to_llvmgpu.mlir"
"pipeline_scaled_truncation_gfx950.mlir"
"pipeline_tile_and_fuse.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s

#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
#translation = #iree_codegen.translation_info<pipeline =
#iree_gpu.pipeline<TileAndFuse>
workgroup_size = [512, 1, 1]
subgroup_size = 64,
{
gpu_pipeline_options = #iree_gpu.pipeline_options<
prefetch_num_stages = 2,
no_reduce_shared_memory_bank_conflicts = true,
use_igemm_convolution = true>
}>
#config = #iree_gpu.lowering_config<{
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>,
promote_operands = [0, 1],
promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma],
reduction = [0, 0, 0, 0, 4],
subgroup = [1, 2, 1, 4, 0],
workgroup = [1, 4, 32, 128, 0]
}>
// End-to-end pipeline test: a 2x34x34x1280 NHWC input convolved with a
// 3x3x1280x1280 HWCF filter (stride 1, dilation 1) into 2x32x32x1280.
// The #config above promotes operands 0 and 1 with use_global_load_dma and
// the translation info enables use_igemm_convolution, driving the
// im2col → gather → coalesced DMA lowering checked after this executable.
hal.executable private @conv_im2col_dma {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @conv_im2col_dma ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
%x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @conv_im2col_dma() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>> -> tensor<2x34x34x1280xf16>
%4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1280, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>> -> tensor<3x3x1280x1280xf16>
%5 = tensor.empty() : tensor<2x32x32x1280xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
%7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>, lowering_config = #config} ins(%3, %4 : tensor<2x34x34x1280xf16>, tensor<3x3x1280x1280xf16>) outs(%6 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 32, 1280], strides = [1, 1, 1, 1] : tensor<2x32x32x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
return
}
}
}
}

// Verify im2col DMA path: conv is lowered through im2col -> gather ->
// coalesced_gather_dma -> amdgpu.gather_to_lds. The gather_to_lds ops
// read from fat_raw_buffer (global) into workgroup memory (LDS).
//
// CHECK-LABEL: func @conv_im2col_dma
// CHECK: scf.forall
// CHECK: scf.for {{.*}} iter_args
// CHECK: amdgpu.gather_to_lds {{.*}}#amdgpu.address_space<fat_raw_buffer>{{.*}}#gpu.address_space<workgroup>
// CHECK: amdgpu.gather_to_lds {{.*}}#amdgpu.address_space<fat_raw_buffer>{{.*}}#gpu.address_space<workgroup>
// CHECK: gpu.barrier
// CHECK: amdgpu.mfma 16x16x32
// CHECK: scf.yield
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,16 @@ chooseDimToVectorize(OpBuilder &b, Location loc, Im2colOp im2colOp,
return std::nullopt;
}

/// Returns the output dimension index that maps to a contiguous slice of the
/// input's innermost dimension, or std::nullopt when no such dimension
/// exists. Thin public wrapper over the file-local chooseDimToVectorize
/// helper so callers can query vectorizability without decomposing the op.
std::optional<unsigned> Im2colOp::getVectorizableDim(OpBuilder &b,
                                                     Location loc) {
  SmallVector<OpFoldResult> offsets = getMixedOffsets();
  SmallVector<OpFoldResult> sourceSizes =
      tensor::getMixedSizes(b, loc, getInput());
  SmallVector<Range> domain(getIterationDomain(b));
  return chooseDimToVectorize(b, loc, *this, domain, sourceSizes, offsets);
}

/// Decomposition implementation for iree_linalg_ext.im2col op.
/// The im2col op is decomposed into serial loops of `insert->extract->copy`.
/// The decomposition supports leaving either the `batch` or `K` dimension
Expand Down Expand Up @@ -747,8 +757,8 @@ FailureOr<SmallVector<Value>> Im2colOp::decomposeOperation(OpBuilder &b) {
SmallVector<Range> iterationDomain(getIterationDomain(b));
SmallVector<OpFoldResult> inputSizes =
tensor::getMixedSizes(b, loc, getInput());
std::optional<unsigned> maybeOutputDimToVectorize = chooseDimToVectorize(
b, loc, *this, iterationDomain, inputSizes, mixedOffsets);
std::optional<unsigned> maybeOutputDimToVectorize =
getVectorizableDim(b, loc);

OpFoldResult innerInputTileSize;
if (maybeOutputDimToVectorize.has_value()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1439,6 +1439,12 @@ def IREELinalgExt_Im2colOp : IREELinalgExt_Op<"im2col",
void setMixedOffsets(SmallVector<OpFoldResult> offsets);
void setMixedOutputSizes(ArrayRef<SmallVector<OpFoldResult>> outputSizes);

// Returns the output dimension index that maps to a contiguous slice of
// the input's innermost dimension. Returns std::nullopt if no such dim
// exists. This wraps the chooseDimToVectorize logic used by
// decomposeOperation.
std::optional<unsigned> getVectorizableDim(OpBuilder &b, Location loc);

// Method to implement for specifying output range for
// DestinationStyleOpInterface
MutableOperandRange getDpsInitsMutable() {
Expand Down
42 changes: 42 additions & 0 deletions tests/e2e/rocm_specific/im2col_dma_conv.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Test conv2d using im2col + DMA path on gfx950+.
//
// Compile:
// iree-compile \
// --iree-hal-target-backends=rocm \
// --iree-rocm-target=gfx950 \
// --iree-codegen-llvmgpu-use-igemm=true \
// --iree-llvmgpu-use-direct-load=true \
// im2col_dma_conv.mlir -o im2col_dma_conv.vmfb
//
// Run:
// iree-check-module --device=hip --module=im2col_dma_conv.vmfb
//
// Dump IR (for debugging):
// iree-compile \
// --iree-hal-target-backends=rocm \
// --iree-rocm-target=gfx950 \
// --iree-codegen-llvmgpu-use-igemm=true \
// --iree-llvmgpu-use-direct-load=true \
// --mlir-print-ir-after-all \
// im2col_dma_conv.mlir -o im2col_dma_conv.vmfb 2> im2col_dma_ir_dump.mlir

!input_type = tensor<1x10x10x512xf16>
!filter_type = tensor<3x3x512x512xf16>
!output_type = tensor<1x8x8x512xf32>

// Correctness check: with all-ones input and filter, every output element is
// the reduction size 3*3*512 = 4608, independent of spatial position.
func.func @im2col_dma_conv() {
// unfoldable_constant keeps the inputs from being constant-folded away so
// the conv actually executes on device.
%input = util.unfoldable_constant dense<1.0> : !input_type
%filter = util.unfoldable_constant dense<1.0> : !filter_type
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : !output_type
%fill = linalg.fill ins(%cst : f32) outs(%empty : !output_type) -> !output_type
%result = linalg.conv_2d_nhwc_hwcf {
dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>
} ins(%input, %filter : !input_type, !filter_type)
outs(%fill : !output_type) -> !output_type
// Each output element = sum over 3*3*512 products of 1*1 = 4608.
check.expect_almost_eq_const(
%result, dense<4608.0> : !output_type) : !output_type
return
}
Loading