Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -826,3 +826,154 @@ func.func @copy_swizzle_hint_linearized(%source: tensor<128x16xf32>) -> tensor<1

return %result : tensor<128x16xf32>
}

// -----

// Test: im2col → gather DMA conversion (happy path).
// NHWC layout, 3×3 kernel, stride 1, dilation 1, C=512 on gfx950.
// The im2col should be converted to: collapse_shape + linalg.generic (index
// computation) + gather → then the gather gets converted to coalesced DMA.

#gpu_target_im2col = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
compute = fp32, storage = b32, subgroup = shuffle,
max_load_instruction_bits = 128, subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
dma_sizes = [32, 128]
>>
#exec_target_im2col = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_im2col}>
#translation_im2col = #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse> workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 0, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>

// CHECK-LABEL: func.func @im2col_to_gather_dma
// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<1x16x16x512xf16>
// CHECK-SAME: %[[OUTPUT:[a-zA-Z0-9]+]]: tensor<1x196x512xf16>
func.func @im2col_to_gather_dma(%input: tensor<1x16x16x512xf16>, %output: tensor<1x196x512xf16>) -> tensor<1x196x512xf16>
attributes {hal.executable.target = #exec_target_im2col, translation_info = #translation_im2col} {
// The use_global_load_dma lowering config on the op requests the DMA path;
// the CHECK lines below trace each step of the resulting rewrite.
%result = iree_linalg_ext.im2col
{lowering_config = #iree_gpu.use_global_load_dma}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
offsets = [0, 0, 0]
output_sizes = [[1], [14, 14], [3, 3, 512]]
batch_pos = [0] m_pos = [1, 2] k_pos = [3]
input_k_perm = [0, 1, 2] output_perm = [0, 1, 2]
ins(%input : tensor<1x16x16x512xf16>)
outs(%output : tensor<1x196x512xf16>) -> tensor<1x196x512xf16>

// Step 1: Collapse input [1,16,16,512] → [256,512] (flatten spatial dims).
// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[INPUT]] {{\[}}[0, 1, 2], [3]{{\]}}
// CHECK-SAME: tensor<1x16x16x512xf16> into tensor<256x512xf16>

// Step 2: Compute linearized spatial indices via linalg.generic.
// Each of the 196 output positions (14×14) maps to a row in the 256-row
// collapsed source via: linearize(delinearize(i, [14,14]), [16,16]).
// CHECK: %[[INDICES:.+]] = linalg.generic
// CHECK: %[[IDX:.+]] = linalg.index 0
// CHECK: affine.delinearize_index %[[IDX]] into (14, 14)
// CHECK: affine.linearize_index
// CHECK: linalg.yield
// CHECK: } -> tensor<196xindex>

// Step 3: Collapse output [1,196,512] → [196,512].
// CHECK: %[[COLLAPSED_OUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0, 1], [2]{{\]}}

// Step 4: Warp-level forall distributes 196 batch positions across warps.
// 256 threads / 64 subgroup_size = 4 warps. 196 / 4 = 49 per warp.
// CHECK: scf.forall (%[[WIV0:.+]], %[[WIV1:.+]]) = (0, 0) to (196, 512) step (49, 512)
// CHECK-SAME: shared_outs(%[[WINIT:.+]] = %[[COLLAPSED_OUT]])

// Step 5: Slice indices for this warp's batch positions.
// CHECK: %[[WARP_INDICES:.+]] = tensor.extract_slice %[[INDICES]][%[[WIV0]]] [49] [1]

// Step 6: Lane-level forall (64 lanes) + coalesced gather DMA.
// Each lane reads elementsPerLane contiguous f16 from the collapsed source.
// CHECK: scf.forall (%[[LANE:.+]]) in (64)
// CHECK: scf.forall.in_parallel {
// CHECK: iree_gpu.coalesced_gather_dma %[[COLLAPSED]][%[[WARP_INDICES]]]
// CHECK-SAME: into %{{.+}} lane(%[[LANE]])
// CHECK-SAME: tensor<256x512xf16>, tensor<49xindex>, tensor<49x512xf16>, index
// CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

// CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

// Step 7: Expand result back to [1,196,512].
// CHECK: tensor.expand_shape %{{.+}} {{\[}}[0, 1], [2]{{\]}}

// No im2col or gather should remain.
// CHECK-NOT: iree_linalg_ext.im2col
// CHECK-NOT: iree_linalg_ext.gather

return %result : tensor<1x196x512xf16>
}

// -----

// Negative test: im2col NOT converted when K_tile is too small for DMA
// alignment. With f16, dma_sizes=[32,128], subgroup_size=64:
// min_elements_per_transfer = 64 * (32/16) = 128. K_tile=4 is not aligned.
// The im2col should be downgraded to derived_thread_config.

#gpu_target_im2col_small_k = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
compute = fp32, storage = b32, subgroup = shuffle,
max_load_instruction_bits = 128, subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
dma_sizes = [32, 128]
>>
#exec_target_im2col_small_k = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_im2col_small_k}>
#translation_im2col_small_k = #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse> workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 0, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>

// CHECK-LABEL: func.func @im2col_small_k_no_dma
func.func @im2col_small_k_no_dma(%input: tensor<1x6x6x4xf16>, %output: tensor<1x16x4xf16>) -> tensor<1x16x4xf16>
attributes {hal.executable.target = #exec_target_im2col_small_k, translation_info = #translation_im2col_small_k} {
// Same op shape as the happy-path test above but with K=4 (output_sizes'
// last group is [3, 3, 4]); the DMA request below must be rejected.
%result = iree_linalg_ext.im2col
{lowering_config = #iree_gpu.use_global_load_dma}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
offsets = [0, 0, 0]
output_sizes = [[1], [4, 4], [3, 3, 4]]
batch_pos = [0] m_pos = [1, 2] k_pos = [3]
input_k_perm = [0, 1, 2] output_perm = [0, 1, 2]
ins(%input : tensor<1x6x6x4xf16>)
outs(%output : tensor<1x16x4xf16>) -> tensor<1x16x4xf16>

// K_tile=4 is too small. Im2col remains with derived_thread_config.
// CHECK: iree_linalg_ext.im2col
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-NOT: iree_gpu.coalesced_gather_dma

return %result : tensor<1x16x4xf16>
}

// -----

// Negative test: im2col NOT converted on non-gfx950 target (gfx942).
// gfx942 does not support global load DMA (no dma_sizes field).

#gpu_target_im2col_nogfx950 = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
compute = fp32, storage = b32, subgroup = shuffle,
max_load_instruction_bits = 128, subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]
>>
#exec_target_im2col_nogfx950 = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_im2col_nogfx950}>
#translation_im2col_nogfx950 = #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse> workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 0, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>

// CHECK-LABEL: func.func @im2col_nogfx950_no_dma
func.func @im2col_nogfx950_no_dma(%input: tensor<1x16x16x512xf16>, %output: tensor<1x196x512xf16>) -> tensor<1x196x512xf16>
attributes {hal.executable.target = #exec_target_im2col_nogfx950, translation_info = #translation_im2col_nogfx950} {
// Identical op to the happy-path test; only the target differs (gfx942,
// whose wgp attribute above omits dma_sizes), so no DMA conversion.
%result = iree_linalg_ext.im2col
{lowering_config = #iree_gpu.use_global_load_dma}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
offsets = [0, 0, 0]
output_sizes = [[1], [14, 14], [3, 3, 512]]
batch_pos = [0] m_pos = [1, 2] k_pos = [3]
input_k_perm = [0, 1, 2] output_perm = [0, 1, 2]
ins(%input : tensor<1x16x16x512xf16>)
outs(%output : tensor<1x196x512xf16>) -> tensor<1x196x512xf16>

// Non-gfx950 target. Im2col remains with derived_thread_config.
// CHECK: iree_linalg_ext.im2col
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-NOT: iree_gpu.coalesced_gather_dma

return %result : tensor<1x196x512xf16>
}
Original file line number Diff line number Diff line change
Expand Up @@ -419,11 +419,10 @@ func.func @im2col_producer_dma_downgraded_to_derived(
return %mm : tensor<2x32x256xf32>
}

// Im2col gets derived_thread_config (not use_global_load_dma) because Im2col
// has no DMA lowering path. The non-Im2col operand still gets use_global_load_dma.
// Im2col now gets use_global_load_dma because Im2col has a DMA lowering path.
// CHECK-LABEL: func.func @im2col_producer_dma_downgraded_to_derived
// CHECK: %[[PA:.+]] = iree_linalg_ext.im2col
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
// CHECK: %[[PB:.+]] = linalg.copy
// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
// CHECK: linalg.batch_matmul {{.*}} ins(%[[PA]], %[[PB]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,16 @@ static std::optional<Value> promotionImpl(OpBuilder &builder,
setLoweringConfig(producer, attr);
return operand.get();
}
// Im2colOp has no DMA conversion path in GPUConvertToCoalescedDMA, so
// always use derived_thread_config regardless of the requested attr.
// If the promotion attr requests DMA, pass it through to im2col.
// GPUConvertToCoalescedDMA will convert im2col → gather → DMA.
// Otherwise, fall back to derived_thread_config.
if (isa<IREE::LinalgExt::Im2colOp>(producer.getOperation())) {
setLoweringConfig(producer,
DerivedThreadConfigAttr::get(producer->getContext()));
if (isa<UseGlobalLoadDMAAttr>(attr)) {
setLoweringConfig(producer, attr);
} else {
setLoweringConfig(producer,
DerivedThreadConfigAttr::get(producer->getContext()));
}
return operand.get();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ iree_lit_test_suite(
"pipeline_elementwise_f8ocp.mlir",
"pipeline_igemm_tile_and_fuse.mlir",
"pipeline_igemm_tile_and_fuse_gfx950.mlir",
"pipeline_im2col_dma_gfx950.mlir",
"pipeline_lower_to_llvmgpu.mlir",
"pipeline_scaled_truncation_gfx950.mlir",
"pipeline_tile_and_fuse.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ iree_lit_test_suite(
"pipeline_elementwise_f8ocp.mlir"
"pipeline_igemm_tile_and_fuse.mlir"
"pipeline_igemm_tile_and_fuse_gfx950.mlir"
"pipeline_im2col_dma_gfx950.mlir"
"pipeline_lower_to_llvmgpu.mlir"
"pipeline_scaled_truncation_gfx950.mlir"
"pipeline_tile_and_fuse.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s

#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
#translation = #iree_codegen.translation_info<pipeline =
#iree_gpu.pipeline<TileAndFuse>
workgroup_size = [512, 1, 1]
subgroup_size = 64,
{
gpu_pipeline_options = #iree_gpu.pipeline_options<
prefetch_num_stages = 2,
no_reduce_shared_memory_bank_conflicts = true,
use_igemm_convolution = true>
}>
#config = #iree_gpu.lowering_config<{
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>,
promote_operands = [0, 1],
promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma],
reduction = [0, 0, 0, 0, 4],
subgroup = [1, 2, 1, 4, 0],
workgroup = [1, 4, 32, 128, 0]
}>
// End-to-end pipeline test: a 2x34x34x1280 NHWC input convolved with a
// 3x3x1280x1280 HWCF filter (stride 1, dilation 1) into 2x32x32x1280.
// The #config above promotes operands 0 and 1 with use_global_load_dma and
// the translation info enables use_igemm_convolution, driving the
// im2col → gather → coalesced DMA lowering checked after this executable.
hal.executable private @conv_im2col_dma {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @conv_im2col_dma ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
%x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @conv_im2col_dma() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>> -> tensor<2x34x34x1280xf16>
%4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1280, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>> -> tensor<3x3x1280x1280xf16>
%5 = tensor.empty() : tensor<2x32x32x1280xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
%7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>, lowering_config = #config} ins(%3, %4 : tensor<2x34x34x1280xf16>, tensor<3x3x1280x1280xf16>) outs(%6 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 32, 1280], strides = [1, 1, 1, 1] : tensor<2x32x32x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
return
}
}
}
}

// Verify im2col DMA path: conv is lowered through im2col -> gather ->
// coalesced_gather_dma -> amdgpu.gather_to_lds. The gather_to_lds ops
// read from fat_raw_buffer (global) into workgroup memory (LDS).
//
// CHECK-LABEL: func @conv_im2col_dma
// CHECK: scf.forall
// CHECK: scf.for {{.*}} iter_args
// CHECK: amdgpu.gather_to_lds {{.*}}#amdgpu.address_space<fat_raw_buffer>{{.*}}#gpu.address_space<workgroup>
// CHECK: amdgpu.gather_to_lds {{.*}}#amdgpu.address_space<fat_raw_buffer>{{.*}}#gpu.address_space<workgroup>
// CHECK: gpu.barrier
// CHECK: amdgpu.mfma 16x16x32
// CHECK: scf.yield
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,16 @@ chooseDimToVectorize(OpBuilder &b, Location loc, Im2colOp im2colOp,
return std::nullopt;
}

/// Returns the output dimension index that maps to a contiguous slice of the
/// input's innermost dimension, or std::nullopt when no such dimension
/// exists. Thin public wrapper over the file-local chooseDimToVectorize
/// helper so callers can query vectorizability without decomposing the op.
std::optional<unsigned> Im2colOp::getVectorizableDim(OpBuilder &b,
                                                     Location loc) {
  SmallVector<OpFoldResult> offsets = getMixedOffsets();
  SmallVector<OpFoldResult> sourceSizes =
      tensor::getMixedSizes(b, loc, getInput());
  SmallVector<Range> domain(getIterationDomain(b));
  return chooseDimToVectorize(b, loc, *this, domain, sourceSizes, offsets);
}

/// Decomposition implementation for iree_linalg_ext.im2col op.
/// The im2col op is decomposed into serial loops of `insert->extract->copy`.
/// The decomposition supports leaving either the `batch` or `K` dimension
Expand Down Expand Up @@ -747,8 +757,8 @@ FailureOr<SmallVector<Value>> Im2colOp::decomposeOperation(OpBuilder &b) {
SmallVector<Range> iterationDomain(getIterationDomain(b));
SmallVector<OpFoldResult> inputSizes =
tensor::getMixedSizes(b, loc, getInput());
std::optional<unsigned> maybeOutputDimToVectorize = chooseDimToVectorize(
b, loc, *this, iterationDomain, inputSizes, mixedOffsets);
std::optional<unsigned> maybeOutputDimToVectorize =
getVectorizableDim(b, loc);

OpFoldResult innerInputTileSize;
if (maybeOutputDimToVectorize.has_value()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1439,6 +1439,12 @@ def IREELinalgExt_Im2colOp : IREELinalgExt_Op<"im2col",
void setMixedOffsets(SmallVector<OpFoldResult> offsets);
void setMixedOutputSizes(ArrayRef<SmallVector<OpFoldResult>> outputSizes);

// Returns the output dimension index that maps to a contiguous slice of
// the input's innermost dimension. Returns std::nullopt if no such dim
// exists. This wraps the chooseDimToVectorize logic used by
// decomposeOperation.
std::optional<unsigned> getVectorizableDim(OpBuilder &b, Location loc);

// Method to implement for specifying output range for
// DestinationStyleOpInterface
MutableOperandRange getDpsInitsMutable() {
Expand Down
42 changes: 42 additions & 0 deletions tests/e2e/rocm_specific/im2col_dma_conv.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Test conv2d using im2col + DMA path on gfx950+.
//
// Compile:
// iree-compile \
// --iree-hal-target-backends=rocm \
// --iree-rocm-target=gfx950 \
// --iree-codegen-llvmgpu-use-igemm=true \
// --iree-llvmgpu-use-direct-load=true \
// im2col_dma_conv.mlir -o im2col_dma_conv.vmfb
//
// Run:
// iree-check-module --device=hip --module=im2col_dma_conv.vmfb
//
// Dump IR (for debugging):
// iree-compile \
// --iree-hal-target-backends=rocm \
// --iree-rocm-target=gfx950 \
// --iree-codegen-llvmgpu-use-igemm=true \
// --iree-llvmgpu-use-direct-load=true \
// --mlir-print-ir-after-all \
// im2col_dma_conv.mlir -o im2col_dma_conv.vmfb 2> im2col_dma_ir_dump.mlir

!input_type = tensor<1x10x10x512xf16>
!filter_type = tensor<3x3x512x512xf16>
!output_type = tensor<1x8x8x512xf32>

// Correctness check: with all-ones input and filter, every output element is
// the reduction size 3*3*512 = 4608, independent of spatial position.
func.func @im2col_dma_conv() {
// unfoldable_constant keeps the inputs from being constant-folded away so
// the conv actually executes on device.
%input = util.unfoldable_constant dense<1.0> : !input_type
%filter = util.unfoldable_constant dense<1.0> : !filter_type
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : !output_type
%fill = linalg.fill ins(%cst : f32) outs(%empty : !output_type) -> !output_type
%result = linalg.conv_2d_nhwc_hwcf {
dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>
} ins(%input, %filter : !input_type, !filter_type)
outs(%fill : !output_type) -> !output_type
// Each output element = sum over 3*3*512 products of 1*1 = 4608.
check.expect_almost_eq_const(
%result, dense<4608.0> : !output_type) : !output_type
return
}
Loading