Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

// Permute kernel for B <-> F axis swap (order [1,0,2,3] and higher-dim equivalents).
// X is contiguous before and after the swap, so no SLM transpose is needed.
// Each work item vectorizes along X with vload/vstore, processes Y_BLOCK Y-rows
// to improve memory-level parallelism, and loops over F internally.
//
// GWS: (ceil(X / VEC_WIDTH), ceil(Y / Y_BLOCK) [* Z [* W]], B)

#include "include/batch_headers/fetch_data.cl"

KERNEL(permute_b_f_axes)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
// X-tile index: each work item covers VEC_WIDTH consecutive X elements.
const uint x_t = get_global_id(0);

// gws[1] packs the Y tiles (and, for 5D/6D inputs, the Z and W axes);
// unpack it back into per-axis coordinates. Y_TILES_Y = ceil(Y / Y_BLOCK).
#if INPUT0_DIMS == 4
const uint y_base = get_global_id(1) * Y_BLOCK;
#elif INPUT0_DIMS == 5
const uint z = get_global_id(1) / Y_TILES_Y;
const uint y_base = (get_global_id(1) % Y_TILES_Y) * Y_BLOCK;
#elif INPUT0_DIMS == 6
const uint w = get_global_id(1) / (INPUT0_SIZE_Z * Y_TILES_Y);
const uint z = (get_global_id(1) / Y_TILES_Y) % INPUT0_SIZE_Z;
const uint y_base = (get_global_id(1) % Y_TILES_Y) * Y_BLOCK;
#endif

const uint b = get_global_id(2);
const uint x_base = x_t * VEC_WIDTH;

// F is not part of the NDRange (see host-side SetDefault); loop it here so
// each work item reuses its decoded coordinates across all features.
for (uint f = 0; f < INPUT0_FEATURE_NUM; ++f) {

#if X_REMAINDER_SIZE > 0
// Tail tile: X is not a multiple of VEC_WIDTH, so the last work item along
// gws[0] (x_t == X_TILES) copies the remaining X_REMAINDER_SIZE elements one
// scalar at a time instead of using vload/vstore.
if (x_t == X_TILES) {
__attribute__((opencl_unroll_hint(Y_BLOCK)))
for (uint yb = 0; yb < Y_BLOCK; ++yb) {
const uint y = y_base + yb;
for (uint i = 0; i < X_REMAINDER_SIZE; ++i) {
const uint x = x_base + i;
// Output index swaps b and f relative to the input index (order [1,0,2,...]).
#if INPUT0_DIMS == 4
const uint in_idx = INPUT0_GET_INDEX(b, f, y, x);
const uint out_idx = OUTPUT_GET_INDEX(f, b, y, x);
#elif INPUT0_DIMS == 5
const uint in_idx = INPUT0_GET_INDEX(b, f, z, y, x);
const uint out_idx = OUTPUT_GET_INDEX(f, b, z, y, x);
#elif INPUT0_DIMS == 6
const uint in_idx = INPUT0_GET_INDEX(b, f, w, z, y, x);
const uint out_idx = OUTPUT_GET_INDEX(f, b, w, z, y, x);
#endif
INPUT0_TYPE val = input[in_idx];
#if HAS_FUSED_OPS
// input_var is the variable name the fused-ops JIT code reads from.
INPUT0_TYPE input_var = val;
FUSED_OPS;
output[out_idx] = FUSED_OPS_RESULT;
#else
output[out_idx] = ACTIVATION(val, ACTIVATION_PARAMS);
#endif
}
}
continue;
}
#endif // X_REMAINDER_SIZE > 0

// Load Y_BLOCK rows then store, to improve memory-level parallelism.
INPUTVTYPE vals[Y_BLOCK];
uint out_idxs[Y_BLOCK];

// Issue all Y_BLOCK vector loads (and compute the matching b<->f swapped
// output offsets) before any store, so the loads can overlap in flight.
__attribute__((opencl_unroll_hint(Y_BLOCK)))
for (uint yb = 0; yb < Y_BLOCK; ++yb) {
const uint y = y_base + yb;
#if INPUT0_DIMS == 4
vals[yb] = CAT(vload, VEC_WIDTH)(0, input + INPUT0_GET_INDEX(b, f, y, x_base));
out_idxs[yb] = OUTPUT_GET_INDEX(f, b, y, x_base);
#elif INPUT0_DIMS == 5
vals[yb] = CAT(vload, VEC_WIDTH)(0, input + INPUT0_GET_INDEX(b, f, z, y, x_base));
out_idxs[yb] = OUTPUT_GET_INDEX(f, b, z, y, x_base);
#elif INPUT0_DIMS == 6
vals[yb] = CAT(vload, VEC_WIDTH)(0, input + INPUT0_GET_INDEX(b, f, w, z, y, x_base));
out_idxs[yb] = OUTPUT_GET_INDEX(f, b, w, z, y, x_base);
#endif
}

__attribute__((opencl_unroll_hint(Y_BLOCK)))
for (uint yb = 0; yb < Y_BLOCK; ++yb) {
#if HAS_FUSED_OPS
// Fused ops are scalar expressions, so apply them lane by lane, then store
// the assembled vector in one transaction.
const uint y = y_base + yb;
OUTPUTVTYPE out_vals;
__attribute__((opencl_unroll_hint(VEC_WIDTH)))
for (uint i = 0; i < VEC_WIDTH; ++i) {
INPUT0_TYPE input_var = vals[yb][i];
FUSED_OPS;
out_vals[i] = FUSED_OPS_RESULT;
}
CAT(vstore, VEC_WIDTH)(out_vals, 0, output + out_idxs[yb]);
#else
OUTPUTVTYPE out_vals = ACTIVATION(CAT(convert_, OUTPUTVTYPE)(vals[yb]), ACTIVATION_PARAMS);
CAT(vstore, VEC_WIDTH)(out_vals, 0, output + out_idxs[yb]);
#endif
}
} // for f
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "permute_kernel_b_f_axes.h"

#include <string>

#include "common_tools.h"
#include "kernel_selector_utils.h"

namespace kernel_selector {

// Vector width chosen to make each vload/vstore a 16-byte transaction.
static size_t GetVecWidth(const permute_params& params) {
switch (params.inputs[0].GetDType()) {
case Datatype::F16:
case Datatype::INT16:
case Datatype::UINT16:
return 8;
case Datatype::F32:
case Datatype::INT32:
return 4;
case Datatype::INT8:
case Datatype::UINT8:
return 16;
case Datatype::INT64:
return 2;
default:
return 4;
}
}

// Y-blocking factor: largest value in {1, 2, 4} that divides Y.
static size_t GetYBlock(const permute_params& params) {
const size_t y = params.inputs[0].Y().v;
if (y % 4 == 0) return 4;
if (y % 2 == 0) return 2;
return 1;
}

ParamsKey PermuteKernel_b_f_axes::GetSupportedKey() const {
    ParamsKey key;
    // The copy itself is type-agnostic: enable every plain element type for
    // both input and output, and allow them to differ.
    for (const auto dtype : {Datatype::F16, Datatype::F32, Datatype::INT8,
                             Datatype::UINT8, Datatype::INT32, Datatype::INT64}) {
        key.EnableInputDataType(dtype);
        key.EnableOutputDataType(dtype);
    }
    key.EnableDifferentTypes();
    // Planar layouts only; input and output use the same layout family.
    for (const auto layout : {DataLayout::bfyx, DataLayout::bfzyx, DataLayout::bfwzyx}) {
        key.EnableInputLayout(layout);
        key.EnableOutputLayout(layout);
    }
    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    key.EnableDynamicShapesSupport();
    return key;
}

JitConstants PermuteKernel_b_f_axes::GetJitConstants(const permute_params& params,
                                                     const CommonDispatchData& /*dispatchData*/) const {
    auto jit = Parent::GetJitConstants(params, {});

    // X is split into full vector tiles plus an optional scalar tail.
    const size_t vec_width = GetVecWidth(params);
    const size_t x_size = params.inputs[0].X().v;
    jit.AddConstant(MakeJitConstant("VEC_WIDTH", vec_width));
    jit.AddConstant(MakeJitConstant("X_TILES", x_size / vec_width));
    jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x_size % vec_width));

    // Y is processed Y_BLOCK rows per work item; GetYBlock guarantees an
    // exact division for the chosen block.
    const size_t y_block = GetYBlock(params);
    jit.AddConstant(MakeJitConstant("Y_BLOCK", y_block));
    jit.AddConstant(MakeJitConstant("Y_TILES_Y", CeilDiv(params.inputs[0].Y().v, y_block)));

    // Vectorized load/store types used by the kernel.
    jit.AddConstant(MakeJitConstant("INPUTVTYPE", "CAT(INPUT0_TYPE, VEC_WIDTH)"));
    jit.AddConstant(MakeJitConstant("OUTPUTVTYPE", "CAT(OUTPUT_TYPE, VEC_WIDTH)"));

    if (!params.fused_ops.empty()) {
        // Index order handed to the fused-ops JIT; left empty for
        // unsupported ranks (matches the kernel's 4/5/6-D coordinates).
        const size_t rank = params.inputs[0].GetDims().size();
        std::vector<std::string> idx_order;
        if (rank == 4)
            idx_order = {"b", "f", "y", "x"};
        else if (rank == 5)
            idx_order = {"b", "f", "z", "y", "x"};
        else if (rank == 6)
            idx_order = {"b", "f", "w", "z", "y", "x"};
        FusedOpsConfiguration conf = {"", idx_order, "input_var", params.inputs[0].GetDType(), 1};
        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
    }

    return jit;
}

CommonDispatchData PermuteKernel_b_f_axes::SetDefault(const permute_params& params) const {
    CommonDispatchData dispatchData;

    const auto& input = params.inputs[0];
    const size_t rank = input.GetDims().size();

    // One work item per (X vector tile, Y block); Z and W (when present)
    // are folded into gws[1] and unpacked inside the kernel.
    const size_t x_tiles = CeilDiv(input.X().v, GetVecWidth(params));
    const size_t y_tiles = CeilDiv(input.Y().v, GetYBlock(params));

    size_t outer_spatial = 1;
    if (rank >= 5)
        outer_spatial *= input.Z().v;
    if (rank >= 6)
        outer_spatial *= input.W().v;

    // F is looped inside the kernel; GWS[2] covers B only.
    dispatchData.gws = {x_tiles, y_tiles * outer_spatial, input.Batch().v};

    const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
        {Tensor::DataChannelName::X},
        {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
        {Tensor::DataChannelName::BATCH}};
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo,
                                                     input.GetLayout(), params.outputs[0].GetLayout(),
                                                     dims_by_gws);

    return dispatchData;
}

bool PermuteKernel_b_f_axes::Validate(const Params& p) const {
if (!Parent::Validate(p)) DO_NOT_USE_THIS_KERNEL(p.layerID);

const permute_params& params = static_cast<const permute_params&>(p);

if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims())
DO_NOT_USE_THIS_KERNEL(p.layerID);

if (!SimpleLayout(params.inputs[0].GetLayout()))
DO_NOT_USE_THIS_KERNEL(p.layerID);

if (params.inputs[0].GetLayout() != params.outputs[0].GetLayout())
DO_NOT_USE_THIS_KERNEL(p.layerID);

// Only accept [1, 0, 2, ...] — B <-> F swap with spatial axes unchanged.
const auto& order = params.order;
const size_t ndim = order.size();
if (ndim < 3 || ndim > 6)
DO_NOT_USE_THIS_KERNEL(p.layerID);

if (order[0] != 1 || order[1] != 0)
DO_NOT_USE_THIS_KERNEL(p.layerID);

for (size_t i = 2; i < ndim; ++i) {
if (order[i] != static_cast<uint16_t>(i))
DO_NOT_USE_THIS_KERNEL(p.layerID);
}

return true;
}

KernelsPriority PermuteKernel_b_f_axes::GetKernelsPriority(const Params& params) const {
    // Read the params directly instead of building a KernelData clone
    // (KernelData::Default deep-copies the whole params just to let us
    // inspect the X extent — pure wasted allocation here).
    const auto& p = static_cast<const permute_params&>(params);

    const size_t vec_width = GetVecWidth(p);
    const size_t x_size = p.inputs[0].X().v;

    // Fully vectorized rows with at least two tiles: strongest case.
    if (x_size >= vec_width * 2 && (x_size % vec_width == 0))
        return FORCE_PRIORITY_2;
    // At least one full vector per row (scalar tail possible): still good.
    if (x_size >= vec_width)
        return FORCE_PRIORITY_3;
    // Rows shorter than one vector fall back to the scalar tail only.
    return FORCE_PRIORITY_4;
}

} // namespace kernel_selector
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "permute_kernel_base.h"

namespace kernel_selector {

// Permute kernel specialized for a B <-> F axis swap (order [1, 0, 2, ...])
// on planar layouts; X stays contiguous, so rows are copied with vector
// load/store instead of an SLM transpose.
class PermuteKernel_b_f_axes : public PermuteKernelBase {
public:
    using Parent = PermuteKernelBase;
    using Parent::Parent;
    PermuteKernel_b_f_axes() : PermuteKernelBase("permute_b_f_axes") {}
    // Defaulted virtual destructor: the class owns no resources of its own.
    virtual ~PermuteKernel_b_f_axes() = default;

    bool Validate(const Params& p) const override;
    KernelsPriority GetKernelsPriority(const Params& params) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
    CommonDispatchData SetDefault(const permute_params& params) const override;
    // Fused ops are applied per scalar lane inside the kernel.
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return {
            FusedOpType::REORDER,
            FusedOpType::ACTIVATION,
            FusedOpType::QUANTIZE,
            FusedOpType::ELTWISE
        };
    }
};
} // namespace kernel_selector
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "permute_kernel_tile_8x8_4x4_fsv.h"
#include "permute_kernel_bfzyx_to_bfyxz.h"
#include "permute_kernel_f_y_axes.h"
#include "permute_kernel_b_f_axes.h"

namespace kernel_selector {

Expand All @@ -17,6 +18,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
Attach<PermuteKernel_f_y_axes>();
Attach<PermuteKernel_b_f_axes>();
}

KernelsData permute_kernel_selector::GetBestKernels(const Params& params) const {
Expand Down
Loading
Loading