Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

// Permute kernel for B <-> F axis swap (order [1,0,2,3] and higher-dim equivalents).
// X is contiguous before and after the swap, so no SLM transpose is needed.
// Each work item vectorizes along X with vload/vstore, processes Y_BLOCK Y-rows
// to improve memory-level parallelism, and loops over F internally.
//
// GWS: (ceil(X / VEC_WIDTH), ceil(Y / Y_BLOCK) [* Z [* W]], B)

#include "include/batch_headers/fetch_data.cl"

KERNEL(permute_b_f_axes)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
// X-tile index: each work item covers VEC_WIDTH consecutive X elements.
const uint x_t = get_global_id(0);

// gws[1] packs the Y tiles (and, for 5D/6D inputs, the Z and W axes);
// unpack it back into per-axis coordinates. Y_TILES_Y = ceil(Y / Y_BLOCK).
#if INPUT0_DIMS == 4
const uint y_base = get_global_id(1) * Y_BLOCK;
#elif INPUT0_DIMS == 5
const uint z = get_global_id(1) / Y_TILES_Y;
const uint y_base = (get_global_id(1) % Y_TILES_Y) * Y_BLOCK;
#elif INPUT0_DIMS == 6
const uint w = get_global_id(1) / (INPUT0_SIZE_Z * Y_TILES_Y);
const uint z = (get_global_id(1) / Y_TILES_Y) % INPUT0_SIZE_Z;
const uint y_base = (get_global_id(1) % Y_TILES_Y) * Y_BLOCK;
#endif

const uint b = get_global_id(2);
const uint x_base = x_t * VEC_WIDTH;

// F is not part of the NDRange (see host-side SetDefault); loop it here so
// each work item reuses its decoded coordinates across all features.
for (uint f = 0; f < INPUT0_FEATURE_NUM; ++f) {

#if X_REMAINDER_SIZE > 0
// Tail tile: X is not a multiple of VEC_WIDTH, so the last work item along
// gws[0] (x_t == X_TILES) copies the remaining X_REMAINDER_SIZE elements one
// scalar at a time instead of using vload/vstore.
if (x_t == X_TILES) {
__attribute__((opencl_unroll_hint(Y_BLOCK)))
for (uint yb = 0; yb < Y_BLOCK; ++yb) {
const uint y = y_base + yb;
for (uint i = 0; i < X_REMAINDER_SIZE; ++i) {
const uint x = x_base + i;
// Output index swaps b and f relative to the input index (order [1,0,2,...]).
#if INPUT0_DIMS == 4
const uint in_idx = INPUT0_GET_INDEX(b, f, y, x);
const uint out_idx = OUTPUT_GET_INDEX(f, b, y, x);
#elif INPUT0_DIMS == 5
const uint in_idx = INPUT0_GET_INDEX(b, f, z, y, x);
const uint out_idx = OUTPUT_GET_INDEX(f, b, z, y, x);
#elif INPUT0_DIMS == 6
const uint in_idx = INPUT0_GET_INDEX(b, f, w, z, y, x);
const uint out_idx = OUTPUT_GET_INDEX(f, b, w, z, y, x);
#endif
INPUT0_TYPE val = input[in_idx];
#if HAS_FUSED_OPS
// input_var is the variable name the fused-ops JIT code reads from.
INPUT0_TYPE input_var = val;
FUSED_OPS;
output[out_idx] = FUSED_OPS_RESULT;
#else
output[out_idx] = ACTIVATION(val, ACTIVATION_PARAMS);
#endif
}
}
continue;
}
#endif // X_REMAINDER_SIZE > 0

// Load Y_BLOCK rows then store, to improve memory-level parallelism.
INPUTVTYPE vals[Y_BLOCK];
uint out_idxs[Y_BLOCK];

// Issue all Y_BLOCK vector loads (and compute the matching b<->f swapped
// output offsets) before any store, so the loads can overlap in flight.
__attribute__((opencl_unroll_hint(Y_BLOCK)))
for (uint yb = 0; yb < Y_BLOCK; ++yb) {
const uint y = y_base + yb;
#if INPUT0_DIMS == 4
vals[yb] = CAT(vload, VEC_WIDTH)(0, input + INPUT0_GET_INDEX(b, f, y, x_base));
out_idxs[yb] = OUTPUT_GET_INDEX(f, b, y, x_base);
#elif INPUT0_DIMS == 5
vals[yb] = CAT(vload, VEC_WIDTH)(0, input + INPUT0_GET_INDEX(b, f, z, y, x_base));
out_idxs[yb] = OUTPUT_GET_INDEX(f, b, z, y, x_base);
#elif INPUT0_DIMS == 6
vals[yb] = CAT(vload, VEC_WIDTH)(0, input + INPUT0_GET_INDEX(b, f, w, z, y, x_base));
out_idxs[yb] = OUTPUT_GET_INDEX(f, b, w, z, y, x_base);
#endif
}

__attribute__((opencl_unroll_hint(Y_BLOCK)))
for (uint yb = 0; yb < Y_BLOCK; ++yb) {
#if HAS_FUSED_OPS
// Fused ops are scalar expressions, so apply them lane by lane, then store
// the assembled vector in one transaction.
const uint y = y_base + yb;
OUTPUTVTYPE out_vals;
__attribute__((opencl_unroll_hint(VEC_WIDTH)))
for (uint i = 0; i < VEC_WIDTH; ++i) {
INPUT0_TYPE input_var = vals[yb][i];
FUSED_OPS;
out_vals[i] = FUSED_OPS_RESULT;
}
CAT(vstore, VEC_WIDTH)(out_vals, 0, output + out_idxs[yb]);
#else
OUTPUTVTYPE out_vals = ACTIVATION(CAT(convert_, OUTPUTVTYPE)(vals[yb]), ACTIVATION_PARAMS);
CAT(vstore, VEC_WIDTH)(out_vals, 0, output + out_idxs[yb]);
#endif
}
} // for f
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "permute_kernel_b_f_axes.h"

#include <string>

#include "common_tools.h"
#include "kernel_selector_utils.h"

namespace kernel_selector {

// Vector width chosen to make each vload/vstore a 16-byte transaction.
static size_t GetVecWidth(const permute_params& params) {
switch (params.inputs[0].GetDType()) {
case Datatype::F16:
case Datatype::INT16:
case Datatype::UINT16:
return 8;
case Datatype::F32:
case Datatype::INT32:
return 4;
case Datatype::INT8:
case Datatype::UINT8:
return 16;
case Datatype::INT64:
return 2;
default:
return 4;
}
}

// Y-blocking factor: largest value in {1, 2, 4} that divides Y.
static size_t GetYBlock(const permute_params& params) {
const size_t y = params.inputs[0].Y().v;
if (y % 4 == 0) return 4;
if (y % 2 == 0) return 2;
return 1;
}

ParamsKey PermuteKernel_b_f_axes::GetSupportedKey() const {
    ParamsKey key;
    // The copy itself is type-agnostic: enable every plain element type for
    // both input and output, and allow them to differ.
    for (const auto dtype : {Datatype::F16, Datatype::F32, Datatype::INT8,
                             Datatype::UINT8, Datatype::INT32, Datatype::INT64}) {
        key.EnableInputDataType(dtype);
        key.EnableOutputDataType(dtype);
    }
    key.EnableDifferentTypes();
    // Planar layouts only; input and output use the same layout family.
    for (const auto layout : {DataLayout::bfyx, DataLayout::bfzyx, DataLayout::bfwzyx}) {
        key.EnableInputLayout(layout);
        key.EnableOutputLayout(layout);
    }
    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    key.EnableDynamicShapesSupport();
    return key;
}

JitConstants PermuteKernel_b_f_axes::GetJitConstants(const permute_params& params,
                                                     const CommonDispatchData& /*dispatchData*/) const {
    auto jit = Parent::GetJitConstants(params, {});

    // X is split into full vector tiles plus an optional scalar tail.
    const size_t vec_width = GetVecWidth(params);
    const size_t x_size = params.inputs[0].X().v;
    jit.AddConstant(MakeJitConstant("VEC_WIDTH", vec_width));
    jit.AddConstant(MakeJitConstant("X_TILES", x_size / vec_width));
    jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x_size % vec_width));

    // Y is processed Y_BLOCK rows per work item; GetYBlock guarantees an
    // exact division for the chosen block.
    const size_t y_block = GetYBlock(params);
    jit.AddConstant(MakeJitConstant("Y_BLOCK", y_block));
    jit.AddConstant(MakeJitConstant("Y_TILES_Y", CeilDiv(params.inputs[0].Y().v, y_block)));

    // Vectorized load/store types used by the kernel.
    jit.AddConstant(MakeJitConstant("INPUTVTYPE", "CAT(INPUT0_TYPE, VEC_WIDTH)"));
    jit.AddConstant(MakeJitConstant("OUTPUTVTYPE", "CAT(OUTPUT_TYPE, VEC_WIDTH)"));

    if (!params.fused_ops.empty()) {
        // Index order handed to the fused-ops JIT; left empty for
        // unsupported ranks (matches the kernel's 4/5/6-D coordinates).
        const size_t rank = params.inputs[0].GetDims().size();
        std::vector<std::string> idx_order;
        if (rank == 4)
            idx_order = {"b", "f", "y", "x"};
        else if (rank == 5)
            idx_order = {"b", "f", "z", "y", "x"};
        else if (rank == 6)
            idx_order = {"b", "f", "w", "z", "y", "x"};
        FusedOpsConfiguration conf = {"", idx_order, "input_var", params.inputs[0].GetDType(), 1};
        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
    }

    return jit;
}

CommonDispatchData PermuteKernel_b_f_axes::SetDefault(const permute_params& params) const {
    CommonDispatchData dispatchData;

    const auto& input = params.inputs[0];
    const size_t rank = input.GetDims().size();

    // One work item per (X vector tile, Y block); Z and W (when present)
    // are folded into gws[1] and unpacked inside the kernel.
    const size_t x_tiles = CeilDiv(input.X().v, GetVecWidth(params));
    const size_t y_tiles = CeilDiv(input.Y().v, GetYBlock(params));

    size_t outer_spatial = 1;
    if (rank >= 5)
        outer_spatial *= input.Z().v;
    if (rank >= 6)
        outer_spatial *= input.W().v;

    // F is looped inside the kernel; GWS[2] covers B only.
    dispatchData.gws = {x_tiles, y_tiles * outer_spatial, input.Batch().v};

    const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
        {Tensor::DataChannelName::X},
        {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
        {Tensor::DataChannelName::BATCH}};
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo,
                                                     input.GetLayout(), params.outputs[0].GetLayout(),
                                                     dims_by_gws);

    return dispatchData;
}

bool PermuteKernel_b_f_axes::Validate(const Params& p) const {
if (!Parent::Validate(p)) DO_NOT_USE_THIS_KERNEL(p.layerID);

const permute_params& params = static_cast<const permute_params&>(p);

if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims())
DO_NOT_USE_THIS_KERNEL(p.layerID);

if (!SimpleLayout(params.inputs[0].GetLayout()))
DO_NOT_USE_THIS_KERNEL(p.layerID);

if (params.inputs[0].GetLayout() != params.outputs[0].GetLayout())
DO_NOT_USE_THIS_KERNEL(p.layerID);

// Only accept [1, 0, 2, ...] — B <-> F swap with spatial axes unchanged.
const auto& order = params.order;
const size_t ndim = order.size();
if (ndim < 3 || ndim > 6)
DO_NOT_USE_THIS_KERNEL(p.layerID);

if (order[0] != 1 || order[1] != 0)
DO_NOT_USE_THIS_KERNEL(p.layerID);

for (size_t i = 2; i < ndim; ++i) {
if (order[i] != static_cast<uint16_t>(i))
DO_NOT_USE_THIS_KERNEL(p.layerID);
}

return true;
}

KernelsPriority PermuteKernel_b_f_axes::GetKernelsPriority(const Params& params) const {
    // Read the params directly instead of building a KernelData clone
    // (KernelData::Default deep-copies the whole params just to let us
    // inspect the X extent — pure wasted allocation here).
    const auto& p = static_cast<const permute_params&>(params);

    const size_t vec_width = GetVecWidth(p);
    const size_t x_size = p.inputs[0].X().v;

    // Fully vectorized rows with at least two tiles: strongest case.
    if (x_size >= vec_width * 2 && (x_size % vec_width == 0))
        return FORCE_PRIORITY_2;
    // At least one full vector per row (scalar tail possible): still good.
    if (x_size >= vec_width)
        return FORCE_PRIORITY_3;
    // Rows shorter than one vector fall back to the scalar tail only.
    return FORCE_PRIORITY_4;
}

} // namespace kernel_selector
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "permute_kernel_base.h"

namespace kernel_selector {

// Permute kernel specialized for a B <-> F axis swap (order [1, 0, 2, ...])
// on planar layouts; X stays contiguous, so rows are copied with vector
// load/store instead of an SLM transpose.
class PermuteKernel_b_f_axes : public PermuteKernelBase {
public:
    using Parent = PermuteKernelBase;
    using Parent::Parent;
    PermuteKernel_b_f_axes() : PermuteKernelBase("permute_b_f_axes") {}
    // Defaulted virtual destructor: the class owns no resources of its own.
    virtual ~PermuteKernel_b_f_axes() = default;

    bool Validate(const Params& p) const override;
    KernelsPriority GetKernelsPriority(const Params& params) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
    CommonDispatchData SetDefault(const permute_params& params) const override;
    // Fused ops are applied per scalar lane inside the kernel.
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return {
            FusedOpType::REORDER,
            FusedOpType::ACTIVATION,
            FusedOpType::QUANTIZE,
            FusedOpType::ELTWISE
        };
    }
};
} // namespace kernel_selector
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "permute_kernel_tile_8x8_4x4_fsv.h"
#include "permute_kernel_bfzyx_to_bfyxz.h"
#include "permute_kernel_f_y_axes.h"
#include "permute_kernel_b_f_axes.h"

namespace kernel_selector {

Expand All @@ -17,6 +18,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
Attach<PermuteKernel_f_y_axes>();
Attach<PermuteKernel_b_f_axes>();
}

KernelsData permute_kernel_selector::GetBestKernels(const Params& params) const {
Expand Down
Loading
Loading