Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCac
const std::vector<cldnn::layout>& output_layouts,
size_t beam_idx,
size_t concat_idx,
bool has_zp_state);
bool has_zp_state,
bool is_4bit_kv_cache = false);
using Ptr = std::shared_ptr<VariableStateIndirectKVCacheCompressed>;

void set_state(const ov::SoPtr<ov::ITensor>& state) override;
Expand All @@ -70,5 +71,6 @@ class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCac

private:
bool m_has_zp_state = false;
// Set from the constructor's is_4bit_kv_cache argument. When true, the constructor
// sets the allocation inner-dim divisor of hidden state 0 to 2.
bool m_is_4bit_kv_cache = false;
};
} // namespace ov::intel_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ class VariableState : public VariableStateBase {
return m_initial_layout;
}

/// Stores the divisor that update_device_buffer() applies to the innermost
/// dimension of the predicted allocation shape when the value is greater than 1.
/// @param divisor  New divisor value; it is stored as-is without validation.
void set_alloc_inner_dim_divisor(size_t divisor) {
    m_alloc_inner_dim_divisor = divisor;
}

ov::element::Type get_user_specified_type() const;

protected:
Expand All @@ -82,6 +84,7 @@ class VariableState : public VariableStateBase {
cldnn::memory::ptr m_memory = nullptr;
bool m_transpose_required = false;
size_t actual_size = 0;
// Divisor applied to the innermost dimension of the allocation shape in
// update_device_buffer() when it is greater than 1. The default of 1 leaves the
// allocation shape unchanged.
size_t m_alloc_inner_dim_divisor = 1;

const cldnn::layout m_initial_layout;

Expand Down
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,9 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
params.combine_scales_and_zp =
primitive->quantization_attributes.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar;

// Flag the 4-bit path when the KV-cache precision configured on the program has a bit width of 4.
// NOTE(review): this reads the program-wide config rather than primitive->quantization_attributes;
// confirm the two always agree for this primitive.
const auto kv_cache_dt = impl_param.get_program().get_config().get_kv_cache_precision();
params.is_int4_compressed = ov::element::Type(kv_cache_dt).bitwidth() == 4;

const auto& past_kv_cache_shape = impl_param.input_layouts[0].get_partial_shape();
params.axis_offset = past_kv_cache_shape[primitive->concat_axis].is_static() ? past_kv_cache_shape[primitive->concat_axis].get_length() : 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,13 @@ JitConstants SDPABase::get_jit_constants(const kernel_impl_params& params) const
jit.make("HAS_SINK_INPUT", 1);
}
jit.make("IS_KV_COMPRESSED", desc->is_kv_compressed);
{
// IS_INT4_COMPRESSED is derived from the KV-cache precision configured on the program:
// it is set when that element type has a bit width of 4.
// NOTE(review): an alternative source would be desc->quantization_attributes.quantization_dt;
// confirm that the program-wide config is the intended one here.
const auto kv_cache_dt = params.get_program().get_config().get_kv_cache_precision();
const bool is_int4 = ov::element::Type(kv_cache_dt).bitwidth() == 4;
jit.make("IS_INT4_COMPRESSED", is_int4);
}
GPU_DEBUG_TRACE_DETAIL << "desc->is_kv_compressed = " << desc->is_kv_compressed << std::endl;

const auto& in_offsets_map = params.in_port_to_shape_info_offset;
Expand Down
305 changes: 279 additions & 26 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/sdpa_opt.cl

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/common.cl"
#include "include/batch_headers/int4_utils.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"


// Upper bound of the unsigned 4-bit quantized range [0, 15]; used both as the
// scale numerator and as the clamp limit in the IS_INT4_COMPRESSED path.
#define UINT4_RANGE 15

#if OUTPUT_DIMS != 4
#error "dynamic_quantize_gpu_kv_cache.cl: Unsupported output dimension"
#endif
Expand Down Expand Up @@ -73,17 +75,56 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
const uint input_offset = INPUT0_GET_INDEX(b, f, y, x);
unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE);
#if ASYMMETRIC_QUANTIZATION
#if ASYMMETRIC_QUANTIZATION || IS_INT4_COMPRESSED
max_value = fmax(max_value, val[i]);
min_value = fmin(min_value, val[i]);
#else
max_value = fmax(max_value, fabs(val[i]));
#endif
}
#if !ASYMMETRIC_QUANTIZATION
#if !ASYMMETRIC_QUANTIZATION && !IS_INT4_COMPRESSED
max_value = fmax(max_value, grp_max);
#endif

#ifdef APPEND_MODE
APPEND_AXIS_NAME += axis_offset;
#endif

#if IS_INT4_COMPRESSED
// 4-bit unsigned asymmetric quantization: map [min, max] to [0, 15].
// Two INT4 values are packed per byte using SUBGROUP_SIZE-stride grouping:
// output byte at physical offset (k * SUBGROUP_SIZE + sglid) holds:
// lo nibble = quantized val[(2k) * SUBGROUP_SIZE + sglid]
// hi nibble = quantized val[(2k+1) * SUBGROUP_SIZE + sglid]
min_value = work_group_reduce_min(min_value);
max_value = work_group_reduce_max(max_value);

ACCUMULATOR_TYPE diff_value = max_value == min_value ? (ACCUMULATOR_TYPE)(grp_max)
: (ACCUMULATOR_TYPE)(max_value - min_value);
ACCUMULATOR_TYPE scale_tmp = (ACCUMULATOR_TYPE)((UINT4_RANGE) / diff_value);
ACCUMULATOR_TYPE zp_tmp = (ACCUMULATOR_TYPE)(-min_value * scale_tmp); // maps min -> 0, max -> UINT4_RANGE

// INT4 packed buffer: the output layout uses i8 with full head_size shape.
// Use element-level offset directly (same stride as layout) so that SDPA
// can address rows with the standard GET_INDEX pitch.
const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
// Pairs of consecutive SUBGROUP_SIZE blocks are packed together.
// Each iteration quantizes two consecutive entries of val[] and packs them into a single char.
// NOTE(review): val[i + 1] is read unconditionally, so this assumes
// INNERMOST_DIM_VALUE / SUBGROUP_SIZE is even — confirm, or guard the odd tail.
unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i += 2) {
// Apply scale and zero-point, round with convert_int_rte, then clamp to [0, UINT4_RANGE].
uchar q0 = (uchar)clamp(convert_int_rte((float)val[i] * scale_tmp + zp_tmp), 0, UINT4_RANGE);
uchar q1 = (uchar)clamp(convert_int_rte((float)val[i + 1] * scale_tmp + zp_tmp), 0, UINT4_RANGE);
// Pack: lo nibble = q0, hi nibble = q1
char packed = cvt_uint8x2_to_uint4x2((uchar2)(q0, q1));
// Packed pair number (i / 2) is written at offset (i / 2) * SUBGROUP_SIZE from output_offset.
OUTPUT_BLOCK_WRITE(output, output_offset + (i / 2) * SUBGROUP_SIZE, packed);
}

// Only the work-item with grouped_indexes == 0 and sglid == 0 stores the quantization parameters.
// NOTE(review): the zero-point is always written to output_scale[scale_idx + 1]; this path does
// not check GROUP_SCALES_WITH_ZP, so it presumably requires the combined scale+zp layout — confirm.
const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
if (grouped_indexes == 0 && sglid == 0) {
output_scale[scale_idx] = (OUTPUT1_TYPE)(1.0f / scale_tmp); // dequant scale
output_scale[scale_idx + 1] = (OUTPUT1_TYPE)(zp_tmp); // zero-point
}

#else // !IS_INT4_COMPRESSED — original INT8 path

#if ASYMMETRIC_QUANTIZATION
min_value = work_group_reduce_min(min_value);
max_value = work_group_reduce_max(max_value);
Expand All @@ -100,10 +141,6 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
OUTPUT1_TYPE scale = 127.0h / max_value;
#endif

#ifdef APPEND_MODE
APPEND_AXIS_NAME += axis_offset;
#endif

const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
#if ASYMMETRIC_QUANTIZATION
Expand Down Expand Up @@ -134,4 +171,6 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
output_scale[scale_idx] = 1.0h / scale;
#endif
}

#endif // IS_INT4_COMPRESSED
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ ParamsKey DynamicQuantizeKernelKVCache::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT4);
k.EnableOutputDataType(Datatype::UINT4);
k.EnableDifferentTypes();
k.EnableAllInputLayout();
k.EnableAllOutputLayout();
Expand Down Expand Up @@ -141,6 +144,7 @@ JitConstants DynamicQuantizeKernelKVCache::GetJitConstants(const dynamic_quantiz
jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number));
jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization));
jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp));
jit.AddConstant(MakeJitConstant("IS_INT4_COMPRESSED", params.is_int4_compressed));

// Use FP32 accumulator type for scale/zp calculation
jit.Merge(MakeTypeJitConstants(Datatype::F32, "ACCUMULATOR"));
Expand Down Expand Up @@ -184,6 +188,19 @@ CommonDispatchData DynamicQuantizeKernelKVCache::SetDefault(const dynamic_quanti
const auto total_grouped_elements = get_elements_number_per_group(params);
const auto total_subgroups_number = total_grouped_elements / input_dims.back().v;

// [DEBUG]
// {
// size_t total_elements_number = 1;
// const auto& group_sizes = params.group_sizes;
// for (size_t i = 0; i < group_sizes.size(); i++) {
// if (group_sizes[i] != UINT64_MAX) {
// total_elements_number *= input_dims[i].v;
// }
// }
// std::cout << " >> group_sizes : " << group_sizes[0] << ", " << group_sizes[1] << ", " << group_sizes[2] << ", " << group_sizes[3]
// << " => total_elements_number : " << total_elements_number << ", total_batched_elements : " << total_batched_elements << std::endl;
// }

dispatchData.gws = {subgroup_size, total_subgroups_number, total_batched_elements};
dispatchData.lws = {subgroup_size, total_subgroups_number, 1};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ struct dynamic_quantize_params : public base_params {
bool use_asymmetric_quantization = false;
bool combine_scales_and_zp = false;
bool generate_precomputed_reduction = false;
// Forwarded to the kernel as the IS_INT4_COMPRESSED JIT constant by
// DynamicQuantizeKernelKVCache::GetJitConstants.
bool is_int4_compressed = false;
};

class DynamicQuantizeKernelRef : public KernelBaseOpenCL {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,11 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed(
const std::vector<cldnn::layout>& output_layouts,
size_t beam_idx,
size_t concat_idx,
bool has_zp_state = false)
bool has_zp_state,
bool is_4bit_kv_cache)
: VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx),
m_has_zp_state(has_zp_state) {
m_has_zp_state(has_zp_state),
m_is_4bit_kv_cache(is_4bit_kv_cache) {
OPENVINO_ASSERT((has_zp_state && output_layouts.size() == 3) ||
(!has_zp_state && output_layouts.size() == 2),
"[GPU] Unexpected number of output layouts for VariableStateIndirectKVCacheCompressed");
Expand All @@ -185,6 +187,12 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed(
OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4),
"[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, "
"actual number is ", m_hidden_states.size());

// For 4-bit KV-cache, two INT4 values are packed per byte.
// Halve the innermost dim of the allocation to reduce physical memory usage.
if (m_is_4bit_kv_cache) {
m_hidden_states[0]->set_alloc_inner_dim_divisor(2);
}
}

VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const {
Expand Down
5 changes: 4 additions & 1 deletion src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,13 +667,16 @@ void SyncInferRequest::allocate_states() {
}

if (compressed) {
const auto kv_precision = m_graph->get_config().get_kv_cache_precision();
const bool is_4bit_kv_cache = ov::element::Type(kv_precision).bitwidth() == 4;
m_variables.emplace(vi.first, std::make_shared<VariableStateIndirectKVCacheCompressed>(vi.second,
m_context,
m_shape_predictor,
states_layouts,
beam_axis,
concat_axis,
has_zp_state));
has_zp_state,
is_4bit_kv_cache));
} else if (indirect_kv_cache) {
m_variables.emplace(vi.first, std::make_shared<VariableStateIndirectKVCache>(vi.second,
m_context,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "intel_gpu/op/kv_cache.hpp"
#include "intel_gpu/op/kv_cache_compressed.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "gather_shape_inference.hpp"
#include "concat_shape_inference.hpp"
#include "openvino/core/partial_shape.hpp"
Expand Down Expand Up @@ -207,8 +208,8 @@ KVCacheCompressed::KVCacheCompressed(const OutputVector& inputs,
: KVCache(inputs, past_variable, true, trim, concat_axis, gather_axis, output_type)
, m_compressed(true)
, m_quantization_attrs(quantization_attrs) {
OPENVINO_ASSERT(quantization_attrs.quantization_dt == ov::element::i8,
"[GPU] Only I8 data type is currently supported for KV-cache compression");
OPENVINO_ASSERT(cldnn::one_of(quantization_attrs.quantization_dt , {element::i8, element::i4, element::u4}),
"[GPU] data type is currently not supported for KV-cache compression");

m_variable = past_variable;
size_t output_size = 3;
Expand Down
19 changes: 15 additions & 4 deletions src/plugins/intel_gpu/src/plugin/variable_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,21 @@ void VariableState::update_device_buffer() {
const auto alloc_type = m_context->get_engine().use_unified_shared_memory() ? cldnn::allocation_type::usm_device : cldnn::allocation_type::cl_mem;
const auto current_buf_size = m_layout.get_padded_dims();
ov::Shape current_shape(current_buf_size.begin(), current_buf_size.end());
const auto alloc_shape = predict_shape(m_name, cldnn::layout(current_shape, m_layout.data_type, m_layout.format), *m_shape_predictor);
const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
actual_size = std::max(actual_size, alloc_layout.bytes_count());
auto alloc_shape = predict_shape(m_name, cldnn::layout(current_shape, m_layout.data_type, m_layout.format), *m_shape_predictor);

// For INT4 packed KV-cache, halve the innermost dim to reduce physical allocation.
// actual_size tracks LOGICAL capacity (un-halved) for correct max_pad calculations.
if (m_alloc_inner_dim_divisor > 1 && !alloc_shape.empty()) {
auto logical_alloc_shape = alloc_shape;
alloc_shape.back() /= m_alloc_inner_dim_divisor;
const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
actual_size = std::max(actual_size, cldnn::layout(logical_alloc_shape, m_layout.data_type, m_layout.format).bytes_count());
} else {
const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
actual_size = std::max(actual_size, alloc_layout.bytes_count());
}
}

OPENVINO_ASSERT(m_memory != nullptr, "m_memory is nullptr!!!");
Expand Down
Loading