Skip to content

[experiment] Support unplaced TileOps#2265

Draft
fifield wants to merge 5 commits intoXilinx:mainfrom
fifield:unplaced_tileop
Draft

[experiment] Support unplaced TileOps#2265
fifield wants to merge 5 commits intoXilinx:mainfrom
fifield:unplaced_tileop

Conversation

@fifield
Copy link
Collaborator

@fifield fifield commented May 2, 2025

This branch is an experiment to see what it takes to support unplaced aie dialect.

It is based on a simple extension to the aie.tile op that allows ? as the row or column operand, meaning the row or column is not physically placed:

// unplaced tile
%tile_c_r = aie.tile(?, ?)

// unplaced shim
%shim_noc_tile_c_0 = aie.tile(?, 0)

// unplaced memtile
%mem_tile_c_0 = aie.tile(?, 1)

To test this I add a "null placer" to iron placers.py:

class NullPlacer(Placer):
    """Placer that performs no physical placement.

    Workers and ObjectFifo endpoints that still carry one of the ``Any*Tile``
    placeholders are given a ``Tile`` whose unknown coordinates are ``-1``
    (these appear to be rendered as ``?`` operands in the emitted ``aie.tile``
    op). Anything that already has a concrete tile is left untouched.
    """

    def __init__(self):
        super().__init__()

    def make_placement(
        self,
        device: Device,
        rt: Runtime,
        workers: list[Worker],
        object_fifos: list[ObjectFifoHandle],
    ):
        """Mark every unplaced worker and ObjectFifo endpoint as unplaced.

        Args:
            device: Not used by this placer.
            rt: Not used by this placer.
            workers: Workers to inspect; those on ``AnyComputeTile`` get
                ``Tile(-1, -1)``, and their buffers follow the worker's tile.
            object_fifos: ObjectFifo handles whose endpoints are inspected;
                only the column is left unknown for mem tiles (row 1) and
                shim tiles (row 0).
        """
        for worker in workers:
            if worker.tile == AnyComputeTile:
                worker.place(Tile(-1, -1))
                # NOTE(review): buffers of workers that arrive with an explicit
                # tile are not placed here -- confirm this is intended.
                for buffer in worker.buffers:
                    buffer.place(worker.tile)

        # Endpoints are processed exactly once and independently of the
        # workers, so they are still handled when `workers` is empty.
        for of in object_fifos:
            for ofe in of.all_of_endpoints():
                if ofe.tile == AnyMemTile:
                    ofe.place(Tile(-1, 1))
                elif ofe.tile == AnyComputeTile:
                    ofe.place(Tile(-1, -1))
                elif ofe.tile == AnyShimTile:
                    ofe.place(Tile(-1, 0))

So that unplaced MLIR is emitted from unplaced IRON:

# place_test.py
@construct_and_print_module
def shim_three_in(module):
    """Build a three-input design and resolve it with ``NullPlacer``.

    Three ObjectFifos named ``in_0`` .. ``in_2`` each feed one worker whose
    core function does nothing, and the runtime sequence fills each fifo from
    its own host buffer.

    Args:
        module: Supplied by the ``construct_and_print_module`` decorator
            (presumably the enclosing MLIR module); not read here.

    Returns:
        The result of ``Program.resolve_program`` using ``NullPlacer``.
    """
    n = 1024
    n_inputs = 3

    n_ty = np.ndarray[(n,), np.dtype[np.int32]]

    of_ins = [ObjectFifo(n_ty, name=f"in_{i}") for i in range(n_inputs)]

    def core_fn(of_in):
        pass

    # One worker per input fifo, each consuming only its own fifo.
    workers = [Worker(core_fn, [of_in.cons()]) for of_in in of_ins]

    rt = Runtime()
    with rt.sequence(n_ty, n_ty, n_ty) as (A, B, C):
        rt.start(*workers)
        # Pair each fifo with its host buffer, in declaration order.
        for of_in, host_buf in zip(of_ins, (A, B, C)):
            rt.fill(of_in.prod(), host_buf)

    return Program(NPU2Col2(), rt).resolve_program(NullPlacer())

emits:

module {
  aie.device(npu2_2col) {
    %shim_noc_tile_c_0 = aie.tile(?, 0)
    %tile_c_r = aie.tile(?, ?)
    %shim_noc_tile_c_0_0 = aie.tile(?, 0)
    %tile_c_r_1 = aie.tile(?, ?)
    %shim_noc_tile_c_0_2 = aie.tile(?, 0)
    %tile_c_r_3 = aie.tile(?, ?)
    aie.objectfifo @in_0(%shim_noc_tile_c_0, {%tile_c_r}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @in_2(%shim_noc_tile_c_0_0, {%tile_c_r_1}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @in_1(%shim_noc_tile_c_0_2, {%tile_c_r_3}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    %core_c_r = aie.core(%tile_c_r) {
      aie.end
    }
    %core_c_r_4 = aie.core(%tile_c_r_3) {
      aie.end
    }
    %core_c_r_5 = aie.core(%tile_c_r_1) {
      aie.end
    }
    aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
      %0 = aiex.dma_configure_task_for @in_0 {
        aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @in_1 {
        aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%1)
      %2 = aiex.dma_configure_task_for @in_2 {
        aie.dma_bd(%arg2 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%2)
    }
  }
}

which can be placed with the mlir pass in this branch:

$ python place_test.py  | aie-opt -canonicalize -aie-sequential-placer -canonicalize
module {
  aie.device(npu2_2col) {
    %tile_0_2 = aie.tile(0, 2)
    %tile_0_3 = aie.tile(0, 3)
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    %tile_0_4 = aie.tile(0, 4)
    aie.objectfifo @in_0(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @in_2(%shim_noc_tile_0_0, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    aie.objectfifo @in_1(%shim_noc_tile_0_0, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<1024xi32>> 
    %core_0_2 = aie.core(%tile_0_2) {
      aie.end
    }
    %core_0_4 = aie.core(%tile_0_4) {
      aie.end
    }
    %core_0_3 = aie.core(%tile_0_3) {
      aie.end
    }
    aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
      %0 = aiex.dma_configure_task_for @in_0 {
        aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%0)
      %1 = aiex.dma_configure_task_for @in_1 {
        aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%1)
      %2 = aiex.dma_configure_task_for @in_2 {
        aie.dma_bd(%arg2 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
        aie.end
      }
      aiex.dma_start_task(%2)
    }
  }
}

@fifield fifield force-pushed the unplaced_tileop branch from 56f003c to f2fbac6 Compare May 6, 2025 20:43
@fifield fifield force-pushed the unplaced_tileop branch 3 times, most recently from b01739a to c2ee6ba Compare May 20, 2025 15:46
@fifield fifield force-pushed the unplaced_tileop branch from c2ee6ba to 263fc98 Compare May 29, 2025 03:29
@fifield fifield force-pushed the unplaced_tileop branch from 263fc98 to 0887a3f Compare June 6, 2025 22:35
@fifield fifield force-pushed the unplaced_tileop branch 2 times, most recently from 0a3e8a9 to 9904e93 Compare July 2, 2025 18:07
@fifield fifield force-pushed the unplaced_tileop branch 2 times, most recently from f4e1aad to 0aec197 Compare July 31, 2025 17:37
@kurtis-b-1
Copy link
Contributor

kurtis-b-1 commented Oct 1, 2025

At the level of the unplaced MLIR, is it necessary to know the specific memory access patterns? I wonder whether, if there were a way to provide the size of the application workload, the loop and tiling variables, and the direction of the tilings, the placement pass could also decide the memory access pattern.
The data movement will be partly decided by the kernel implementation, so the buffer sizes passed to the Kernel objects would still need to be explicit, I think.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants