Optimize GPU Permute Kernel for B↔F Axis Swap ([1,0,2,3] order)

andrew-k-park · andrew-k-park · commit 8cd4d82819ee · 2026-03-26T09:06:12.000+09:00
Signed-off-by: Andrew Park &lt;andrew.park@intel.com&gt;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_b_f_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_b_f_axes.cl
@@ -0,0 +1,93 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// Permute kernel for B <-> F axis swap (order [1,0,2,3] and higher-dim equivalents).
+// X is contiguous before and after the swap, so no SLM transpose is needed.
+// Each work item vectorizes along X with vload/vstore and loops over F internally.
+//
+// GWS: (ceil(X / VEC_WIDTH), Y [* Z [* W]], B)
+
+#include "include/batch_headers/fetch_data.cl"
+
+KERNEL(permute_b_f_axes)(
+    OPTIONAL_SHAPE_INFO_ARG
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+    )
+{
+    const uint x_t = get_global_id(0);  // X tile index
+
+#if INPUT0_DIMS == 4
+    const uint y   = get_global_id(1);
+#elif INPUT0_DIMS == 5
+    const uint z   = get_global_id(1) / INPUT0_SIZE_Y;
+    const uint y   = get_global_id(1) % INPUT0_SIZE_Y;
+#elif INPUT0_DIMS == 6
+    const uint w   = get_global_id(1) / (INPUT0_SIZE_Z * INPUT0_SIZE_Y);
+    const uint z   = get_global_id(1) / INPUT0_SIZE_Y % INPUT0_SIZE_Z;
+    const uint y   = get_global_id(1) % INPUT0_SIZE_Y;
+#endif
+
+    const uint b   = get_global_id(2);
+    const uint x_base = x_t * VEC_WIDTH;
+
+    for (uint f = 0; f < INPUT0_FEATURE_NUM; ++f) {
+
+#if X_REMAINDER_SIZE > 0
+        if (x_t == X_TILES) {
+            for (uint i = 0; i < X_REMAINDER_SIZE; ++i) {
+                const uint x = x_base + i;
+#if INPUT0_DIMS == 4
+                const uint in_idx  = INPUT0_GET_INDEX(b, f, y, x);
+                const uint out_idx = OUTPUT_GET_INDEX(f, b, y, x);
+#elif INPUT0_DIMS == 5
+                const uint in_idx  = INPUT0_GET_INDEX(b, f, z, y, x);
+                const uint out_idx = OUTPUT_GET_INDEX(f, b, z, y, x);
+#elif INPUT0_DIMS == 6
+                const uint in_idx  = INPUT0_GET_INDEX(b, f, w, z, y, x);
+                const uint out_idx = OUTPUT_GET_INDEX(f, b, w, z, y, x);
+#endif
+                INPUT0_TYPE val = input[in_idx];
+#if HAS_FUSED_OPS
+                INPUT0_TYPE input_var = val;
+                FUSED_OPS;
+                output[out_idx] = FUSED_OPS_RESULT;
+#else
+                output[out_idx] = ACTIVATION(val, ACTIVATION_PARAMS);
+#endif
+            }
+            continue;
+        }
+#endif  // X_REMAINDER_SIZE > 0
+
+#if INPUT0_DIMS == 4
+        const uint in_idx  = INPUT0_GET_INDEX(b, f, y, x_base);
+        const uint out_idx = OUTPUT_GET_INDEX(f, b, y, x_base);
+#elif INPUT0_DIMS == 5
+        const uint in_idx  = INPUT0_GET_INDEX(b, f, z, y, x_base);
+        const uint out_idx = OUTPUT_GET_INDEX(f, b, z, y, x_base);
+#elif INPUT0_DIMS == 6
+        const uint in_idx  = INPUT0_GET_INDEX(b, f, w, z, y, x_base);
+        const uint out_idx = OUTPUT_GET_INDEX(f, b, w, z, y, x_base);
+#endif
+
+        INPUTVTYPE vals = CAT(vload, VEC_WIDTH)(0, input + in_idx);
+#if HAS_FUSED_OPS
+        OUTPUTVTYPE out_vals;
+        __attribute__((opencl_unroll_hint(VEC_WIDTH)))
+        for (uint i = 0; i < VEC_WIDTH; ++i) {
+            INPUT0_TYPE input_var = vals[i];
+            FUSED_OPS;
+            out_vals[i] = FUSED_OPS_RESULT;
+        }
+        CAT(vstore, VEC_WIDTH)(out_vals, 0, output + out_idx);
+#else
+        OUTPUTVTYPE out_vals = ACTIVATION(CAT(convert_, OUTPUTVTYPE)(vals), ACTIVATION_PARAMS);
+        CAT(vstore, VEC_WIDTH)(out_vals, 0, output + out_idx);
+#endif
+    }  // for f
+}
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_b_f_axes.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_b_f_axes.cpp
@@ -0,0 +1,165 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "permute_kernel_b_f_axes.h"
+
+#include <string>
+
+#include "common_tools.h"
+#include "kernel_selector_utils.h"
+
+namespace kernel_selector {
+
+// Vector width chosen to make each vload/vstore a 16-byte transaction.
+static size_t GetVecWidth(const permute_params& params) {
+    switch (params.inputs[0].GetDType()) {
+        case Datatype::F16:
+        case Datatype::INT16:
+        case Datatype::UINT16:
+            return 8;
+        case Datatype::F32:
+        case Datatype::INT32:
+            return 4;
+        case Datatype::INT8:
+        case Datatype::UINT8:
+            return 16;
+        case Datatype::INT64:
+            return 2;
+        default:
+            return 4;
+    }
+}
+
+ParamsKey PermuteKernel_b_f_axes::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableInputDataType(Datatype::INT32);
+    k.EnableInputDataType(Datatype::INT64);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT32);
+    k.EnableOutputDataType(Datatype::INT64);
+    k.EnableDifferentTypes();
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableInputLayout(DataLayout::bfzyx);
+    k.EnableOutputLayout(DataLayout::bfzyx);
+    k.EnableInputLayout(DataLayout::bfwzyx);
+    k.EnableOutputLayout(DataLayout::bfwzyx);
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    k.EnableDynamicShapesSupport();
+    return k;
+}
+
+JitConstants PermuteKernel_b_f_axes::GetJitConstants(const permute_params& params,
+                                                     const CommonDispatchData& /*dispatchData*/) const {
+    auto jit = Parent::GetJitConstants(params, {});
+
+    const size_t vec_width = GetVecWidth(params);
+    const size_t x_size    = params.inputs[0].X().v;
+    const size_t x_tiles   = x_size / vec_width;
+    const size_t x_rem     = x_size % vec_width;
+
+    jit.AddConstant(MakeJitConstant("VEC_WIDTH",       vec_width));
+    jit.AddConstant(MakeJitConstant("X_TILES",         x_tiles));
+    jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x_rem));
+
+    jit.AddConstant(MakeJitConstant("INPUTVTYPE",  "CAT(INPUT0_TYPE, VEC_WIDTH)"));
+    jit.AddConstant(MakeJitConstant("OUTPUTVTYPE", "CAT(OUTPUT_TYPE, VEC_WIDTH)"));
+
+    if (!params.fused_ops.empty()) {
+        std::vector<std::string> output_order;
+        switch (params.inputs[0].GetDims().size()) {
+            case 4: output_order = {"b", "f", "y", "x"}; break;
+            case 5: output_order = {"b", "f", "z", "y", "x"}; break;
+            case 6: output_order = {"b", "f", "w", "z", "y", "x"}; break;
+            default: break;
+        }
+        FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
+    return jit;
+}
+
+CommonDispatchData PermuteKernel_b_f_axes::SetDefault(const permute_params& params) const {
+    CommonDispatchData dispatchData;
+
+    const auto& in        = params.inputs[0];
+    const auto in_layout  = in.GetLayout();
+    const auto out_layout = params.outputs[0].GetLayout();
+    const size_t vec_width = GetVecWidth(params);
+    const size_t x_tiles = CeilDiv(in.X().v, vec_width);
+
+    size_t spatial_outer = 1;
+    if (in.GetDims().size() >= 5) spatial_outer *= in.Z().v;
+    if (in.GetDims().size() >= 6) spatial_outer *= in.W().v;
+
+    // F is looped inside the kernel; GWS[2] covers B only.
+    dispatchData.gws = {x_tiles, in.Y().v * spatial_outer, in.Batch().v};
+
+    const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
+        {Tensor::DataChannelName::X},
+        {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
+        {Tensor::DataChannelName::BATCH}};
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo,
+                                                     in_layout, out_layout, dims_by_gws);
+
+    return dispatchData;
+}
+
+bool PermuteKernel_b_f_axes::Validate(const Params& p) const {
+    if (!Parent::Validate(p)) DO_NOT_USE_THIS_KERNEL(p.layerID);
+
+    const permute_params& params = static_cast<const permute_params&>(p);
+
+    if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims())
+        DO_NOT_USE_THIS_KERNEL(p.layerID);
+
+    if (!SimpleLayout(params.inputs[0].GetLayout()))
+        DO_NOT_USE_THIS_KERNEL(p.layerID);
+
+    if (params.inputs[0].GetLayout() != params.outputs[0].GetLayout())
+        DO_NOT_USE_THIS_KERNEL(p.layerID);
+
+    // Only accept [1, 0, 2, ...] — B <-> F swap with spatial axes unchanged.
+    const auto& order = params.order;
+    const size_t ndim = order.size();
+    if (ndim < 3 || ndim > 6)
+        DO_NOT_USE_THIS_KERNEL(p.layerID);
+
+    if (order[0] != 1 || order[1] != 0)
+        DO_NOT_USE_THIS_KERNEL(p.layerID);
+
+    for (size_t i = 2; i < ndim; ++i) {
+        if (order[i] != static_cast<uint16_t>(i))
+            DO_NOT_USE_THIS_KERNEL(p.layerID);
+    }
+
+    return true;
+}
+
+KernelsPriority PermuteKernel_b_f_axes::GetKernelsPriority(const Params& params) const {
+    KernelData kd = KernelData::Default<permute_params>(params);
+    permute_params& newParams = *static_cast<permute_params*>(kd.params.get());
+
+    const size_t vec_width = GetVecWidth(newParams);
+    const size_t x_size    = newParams.inputs[0].X().v;
+
+    if (x_size >= vec_width * 2 && (x_size % vec_width == 0))
+        return FORCE_PRIORITY_2;
+    if (x_size >= vec_width)
+        return FORCE_PRIORITY_3;
+
+    return FORCE_PRIORITY_4;
+}
+
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_b_f_axes.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_b_f_axes.h
@@ -0,0 +1,34 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "permute_kernel_base.h"
+
+namespace kernel_selector {
+
+class PermuteKernel_b_f_axes : public PermuteKernelBase {
+public:
+    using Parent = PermuteKernelBase;
+    using Parent::Parent;
+    PermuteKernel_b_f_axes() : PermuteKernelBase("permute_b_f_axes") {}
+    virtual ~PermuteKernel_b_f_axes() {}
+
+    bool Validate(const Params& p) const override;
+    KernelsPriority GetKernelsPriority(const Params& params) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
+    CommonDispatchData SetDefault(const permute_params& params) const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return {
+            FusedOpType::REORDER,
+            FusedOpType::ACTIVATION,
+            FusedOpType::QUANTIZE,
+            FusedOpType::ELTWISE
+        };
+    }
+};
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_selector.cpp
@@ -8,6 +8,7 @@
 #include "permute_kernel_tile_8x8_4x4_fsv.h"
 #include "permute_kernel_bfzyx_to_bfyxz.h"
 #include "permute_kernel_f_y_axes.h"
+#include "permute_kernel_b_f_axes.h"
 
 namespace kernel_selector {
 
@@ -17,6 +18,7 @@ permute_kernel_selector::permute_kernel_selector() {
     Attach<PermuteKernel_tile_8x8_4x4_fsv>();
     Attach<PermuteKernel_bfzyx_to_bfyxz>();
     Attach<PermuteKernel_f_y_axes>();
+    Attach<PermuteKernel_b_f_axes>();
 }
 
 KernelsData permute_kernel_selector::GetBestKernels(const Params& params) const {
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp