openvinotoolkit · byungilm · Mar 17, 2026 · Mar 23, 2026 · Mar 24, 2026 · Mar 25, 2026
@@ -55,7 +55,8 @@ class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCac
                                            const std::vector<cldnn::layout>& output_layouts,
                                            size_t beam_idx,
                                            size_t concat_idx,
-                                           bool has_zp_state);
+                                           bool has_zp_state,
+                                           bool is_4bit_kv_cache = false);
     using Ptr = std::shared_ptr<VariableStateIndirectKVCacheCompressed>;
 
     void set_state(const ov::SoPtr<ov::ITensor>& state) override;
@@ -70,5 +71,6 @@ class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCac
 
 private:
     bool m_has_zp_state = false;
+    bool m_is_4bit_kv_cache = false;  // When true, the constructor sets an inner-dim allocation divisor of 2 on m_hidden_states[0].
 };
 }  // namespace ov::intel_gpu
@@ -72,6 +72,8 @@ class VariableState : public VariableStateBase {
         return m_initial_layout;
     }
 
+    void set_alloc_inner_dim_divisor(size_t divisor) { m_alloc_inner_dim_divisor = divisor; }  // Divisor update_device_buffer() applies to the innermost dim of the physical allocation; values <= 1 leave it unchanged.
+
     ov::element::Type get_user_specified_type() const;
 
 protected:
@@ -82,6 +84,7 @@ class VariableState : public VariableStateBase {
     cldnn::memory::ptr m_memory = nullptr;
     bool m_transpose_required = false;
     size_t actual_size = 0;
+    size_t m_alloc_inner_dim_divisor = 1;  // Innermost-dim divisor for the physical buffer allocation (1 = no packing); set via set_alloc_inner_dim_divisor().
 
     const cldnn::layout m_initial_layout;
 

@@ -506,6 +506,9 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
         params.combine_scales_and_zp =
             primitive->quantization_attributes.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar;
 
+        const auto kv_cache_dt = impl_param.get_program().get_config().get_kv_cache_precision();
+        params.is_int4_compressed = ov::element::Type(kv_cache_dt).bitwidth() == 4;
+
         const auto& past_kv_cache_shape = impl_param.input_layouts[0].get_partial_shape();
         params.axis_offset = past_kv_cache_shape[primitive->concat_axis].is_static() ? past_kv_cache_shape[primitive->concat_axis].get_length() : 0;
 

@@ -212,6 +212,13 @@ JitConstants SDPABase::get_jit_constants(const kernel_impl_params& params) const
             jit.make("HAS_SINK_INPUT", 1);
         }
         jit.make("IS_KV_COMPRESSED", desc->is_kv_compressed);
+        {
+            // A 4-bit KV-cache precision in the program config selects the INT4 path (desc->quantization_attributes.quantization_dt is not consulted).
+            const auto kv_cache_dt = params.get_program().get_config().get_kv_cache_precision();
+            const bool is_int4 = ov::element::Type(kv_cache_dt).bitwidth() == 4;
+            // Emitted alongside IS_KV_COMPRESSED regardless of desc->is_kv_compressed. NOTE(review): confirm this is intended.
+            jit.make("IS_INT4_COMPRESSED", is_int4);
+        }
         GPU_DEBUG_TRACE_DETAIL << "desc->is_kv_compressed = " << desc->is_kv_compressed << std::endl;
 
         const auto& in_offsets_map = params.in_port_to_shape_info_offset;

@@ -2,14 +2,16 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "include/batch_headers/fetch_data.cl"
 #include "include/batch_headers/fetch_data.cl"
 #include "include/batch_headers/common.cl"
+#include "include/batch_headers/int4_utils.cl"
 #include "include/batch_headers/sub_group_block_read.cl"
 #include "include/batch_headers/sub_group_block_write.cl"
 #include "include/batch_headers/sub_group_shuffle.cl"
 
 
+#define UINT4_RANGE 15
+
 #if OUTPUT_DIMS != 4
 #error "dynamic_quantize_gpu_kv_cache.cl: Unsupported output dimension"
 #endif
@@ -73,17 +75,56 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
     const uint input_offset = INPUT0_GET_INDEX(b, f, y, x);
     unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
         val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE);
-#if ASYMMETRIC_QUANTIZATION
+#if ASYMMETRIC_QUANTIZATION || IS_INT4_COMPRESSED
         max_value = fmax(max_value, val[i]);
         min_value = fmin(min_value, val[i]);
 #else
         max_value = fmax(max_value, fabs(val[i]));
 #endif
     }
-#if !ASYMMETRIC_QUANTIZATION
+#if !ASYMMETRIC_QUANTIZATION && !IS_INT4_COMPRESSED
     max_value = fmax(max_value, grp_max);
 #endif
 
+#ifdef APPEND_MODE
+    APPEND_AXIS_NAME += axis_offset;
+#endif
+
+#if IS_INT4_COMPRESSED
+    // 4-bit unsigned asymmetric quantization: map [min, max] to [0, 15].
+    // Two INT4 values are packed per byte using SUBGROUP_SIZE-stride grouping:
+    //   output byte at physical offset (k * SUBGROUP_SIZE + sglid) holds:
+    //     lo nibble = quantized input element (2k)   * SUBGROUP_SIZE + sglid, i.e. this lane's val[2k]
+    //     hi nibble = quantized input element (2k+1) * SUBGROUP_SIZE + sglid, i.e. this lane's val[2k+1]
+    min_value = work_group_reduce_min(min_value);
+    max_value = work_group_reduce_max(max_value);
+
+    ACCUMULATOR_TYPE diff_value = max_value == min_value ? (ACCUMULATOR_TYPE)(grp_max)
+                                                         : (ACCUMULATOR_TYPE)(max_value - min_value); // flat input (max == min) falls back to grp_max as the range
+    ACCUMULATOR_TYPE scale_tmp = (ACCUMULATOR_TYPE)((UINT4_RANGE) / diff_value); // NOTE(review): grp_max is defined outside this view - confirm it cannot be 0 here
+    ACCUMULATOR_TYPE zp_tmp    = (ACCUMULATOR_TYPE)(-min_value * scale_tmp); // q = v * scale_tmp + zp_tmp maps min -> 0 and (when max != min) max -> UINT4_RANGE
+
+    // INT4 packed buffer: the output layout uses i8 with full head_size shape.
+    // Use element-level offset directly (same stride as layout) so that SDPA
+    // can address rows with the standard GET_INDEX pitch.
+    const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+    // Pairs of consecutive SUBGROUP_SIZE blocks are packed together. NOTE(review): val[i + 1] is read, so INNERMOST_DIM_VALUE / SUBGROUP_SIZE is assumed even - confirm.
+    unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i += 2) {
+        uchar q0 = (uchar)clamp(convert_int_rte((float)val[i]     * scale_tmp + zp_tmp), 0, UINT4_RANGE);
+        uchar q1 = (uchar)clamp(convert_int_rte((float)val[i + 1] * scale_tmp + zp_tmp), 0, UINT4_RANGE);
+        // Pack: lo nibble = q0, hi nibble = q1 (nibble order is whatever cvt_uint8x2_to_uint4x2 implements - presumably .s0 low; verify in int4_utils.cl)
+        char packed = cvt_uint8x2_to_uint4x2((uchar2)(q0, q1));
+        OUTPUT_BLOCK_WRITE(output, output_offset + (i / 2) * SUBGROUP_SIZE, packed);
+    }
+
+    const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
+    if (grouped_indexes == 0 && sglid == 0) {
+        output_scale[scale_idx]     = (OUTPUT1_TYPE)(1.0f / scale_tmp); // dequant scale (reciprocal of the quantization scale)
+        output_scale[scale_idx + 1] = (OUTPUT1_TYPE)(zp_tmp);           // zero-point, stored right after the scale; NOTE(review): not gated on GROUP_SCALES_WITH_ZP - confirm buffer layout
+    }
+
+#else  // !IS_INT4_COMPRESSED — original INT8 path
+
 #if ASYMMETRIC_QUANTIZATION
     min_value = work_group_reduce_min(min_value);
     max_value = work_group_reduce_max(max_value);
@@ -100,10 +141,6 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
     OUTPUT1_TYPE scale = 127.0h / max_value;
 #endif
 
-#ifdef APPEND_MODE
-    APPEND_AXIS_NAME += axis_offset;
-#endif
-
     const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
     unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
 #if ASYMMETRIC_QUANTIZATION
@@ -134,4 +171,6 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
         output_scale[scale_idx] = 1.0h / scale;
 #endif
     }
+
+#endif  // IS_INT4_COMPRESSED
 }
@@ -88,6 +88,9 @@ ParamsKey DynamicQuantizeKernelKVCache::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT4);
+    k.EnableOutputDataType(Datatype::UINT4);
     k.EnableDifferentTypes();
     k.EnableAllInputLayout();
     k.EnableAllOutputLayout();
@@ -141,6 +144,7 @@ JitConstants DynamicQuantizeKernelKVCache::GetJitConstants(const dynamic_quantiz
     jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number));
     jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization));
     jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp));
+    jit.AddConstant(MakeJitConstant("IS_INT4_COMPRESSED", params.is_int4_compressed));
 
     // Use FP32 accumulator type for scale/zp calculation
     jit.Merge(MakeTypeJitConstants(Datatype::F32, "ACCUMULATOR"));
@@ -184,6 +188,19 @@ CommonDispatchData DynamicQuantizeKernelKVCache::SetDefault(const dynamic_quanti
     const auto total_grouped_elements = get_elements_number_per_group(params);
     const auto total_subgroups_number = total_grouped_elements / input_dims.back().v;
 
+    // [DEBUG] Disabled diagnostic dump of group sizes and element counts. TODO: remove before merging.
+    // {
+    //     size_t total_elements_number = 1;
+    //     const auto& group_sizes = params.group_sizes;
+    //     for (size_t i = 0; i < group_sizes.size(); i++) {
+    //         if (group_sizes[i] != UINT64_MAX) {
+    //             total_elements_number *= input_dims[i].v;
+    //         }
+    //     }
+    //     std::cout << "  >> group_sizes : " << group_sizes[0] << ", " << group_sizes[1] << ", " << group_sizes[2] << ", " << group_sizes[3]
+    //                 << " => total_elements_number : " << total_elements_number << ", total_batched_elements : " << total_batched_elements << std::endl;
+    // }
+
     dispatchData.gws = {subgroup_size, total_subgroups_number, total_batched_elements};
     dispatchData.lws = {subgroup_size, total_subgroups_number, 1};
 

@@ -21,6 +21,7 @@ struct dynamic_quantize_params : public base_params {
     bool use_asymmetric_quantization = false;
     bool combine_scales_and_zp = false;
     bool generate_precomputed_reduction = false;
+    bool is_int4_compressed = false;
 };
 
 class DynamicQuantizeKernelRef : public KernelBaseOpenCL {

@@ -165,9 +165,11 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed(
     const std::vector<cldnn::layout>& output_layouts,
     size_t beam_idx,
     size_t concat_idx,
-    bool has_zp_state = false)
+    bool has_zp_state,
+    bool is_4bit_kv_cache)
     : VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx),
-      m_has_zp_state(has_zp_state) {
+      m_has_zp_state(has_zp_state),
+      m_is_4bit_kv_cache(is_4bit_kv_cache) {
     OPENVINO_ASSERT((has_zp_state && output_layouts.size() == 3) ||
                     (!has_zp_state && output_layouts.size() == 2),
                     "[GPU] Unexpected number of output layouts for VariableStateIndirectKVCacheCompressed");
@@ -185,6 +187,12 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed(
     OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4),
                     "[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, "
                     "actual number is ", m_hidden_states.size());
+
+    // For 4-bit KV-cache, two INT4 values are packed per byte.
+    // Halve the innermost dim of the allocation to reduce physical memory usage.
+    if (m_is_4bit_kv_cache) {
+        m_hidden_states[0]->set_alloc_inner_dim_divisor(2);
+    }
 }
 
 VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const {

@@ -667,13 +667,16 @@ void SyncInferRequest::allocate_states() {
         }
 
         if (compressed) {
+            const auto kv_precision = m_graph->get_config().get_kv_cache_precision();
+            const bool is_4bit_kv_cache = ov::element::Type(kv_precision).bitwidth() == 4;
             m_variables.emplace(vi.first, std::make_shared<VariableStateIndirectKVCacheCompressed>(vi.second,
                                                                                                    m_context,
                                                                                                    m_shape_predictor,
                                                                                                    states_layouts,
                                                                                                    beam_axis,
                                                                                                    concat_axis,
-                                                                                                   has_zp_state));
+                                                                                                   has_zp_state,
+                                                                                                   is_4bit_kv_cache));
         } else if (indirect_kv_cache) {
             m_variables.emplace(vi.first, std::make_shared<VariableStateIndirectKVCache>(vi.second,
                                                                                          m_context,

@@ -4,6 +4,7 @@
 
 #include "intel_gpu/op/kv_cache.hpp"
 #include "intel_gpu/op/kv_cache_compressed.hpp"
+#include "intel_gpu/runtime/utils.hpp"
 #include "gather_shape_inference.hpp"
 #include "concat_shape_inference.hpp"
 #include "openvino/core/partial_shape.hpp"
@@ -207,8 +208,8 @@ KVCacheCompressed::KVCacheCompressed(const OutputVector& inputs,
     : KVCache(inputs, past_variable, true, trim, concat_axis, gather_axis, output_type)
     , m_compressed(true)
     , m_quantization_attrs(quantization_attrs) {
-    OPENVINO_ASSERT(quantization_attrs.quantization_dt == ov::element::i8,
-                    "[GPU] Only I8 data type is currently supported for KV-cache compression");
+    OPENVINO_ASSERT(cldnn::one_of(quantization_attrs.quantization_dt, {element::i8, element::i4, element::u4}),  // accepted compressed storage types
+                    "[GPU] Unsupported KV-cache compression data type (expected i8, i4 or u4): ", quantization_attrs.quantization_dt);
 
     m_variable = past_variable;
     size_t output_size = 3;

@@ -119,10 +119,21 @@ void VariableState::update_device_buffer() {
         const auto alloc_type = m_context->get_engine().use_unified_shared_memory() ? cldnn::allocation_type::usm_device : cldnn::allocation_type::cl_mem;
         const auto current_buf_size = m_layout.get_padded_dims();
         ov::Shape current_shape(current_buf_size.begin(), current_buf_size.end());
-        const auto alloc_shape = predict_shape(m_name, cldnn::layout(current_shape, m_layout.data_type, m_layout.format), *m_shape_predictor);
-        const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
-        m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
-        actual_size = std::max(actual_size, alloc_layout.bytes_count());
+        auto alloc_shape = predict_shape(m_name, cldnn::layout(current_shape, m_layout.data_type, m_layout.format), *m_shape_predictor);
+
+        // actual_size tracks LOGICAL capacity (shape before packing) for correct max_pad calculations,
+        // so take the byte count before the innermost dim is shrunk below.
+        const auto logical_bytes = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format).bytes_count();
+
+        // For INT4 packed KV-cache, divide the innermost dim to reduce the physical allocation.
+        // Round up: a truncating division would under-allocate whenever the innermost dim
+        // is not a multiple of the divisor.
+        if (m_alloc_inner_dim_divisor > 1 && !alloc_shape.empty()) {
+            alloc_shape.back() = (alloc_shape.back() + m_alloc_inner_dim_divisor - 1) / m_alloc_inner_dim_divisor;
+        }
+        const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
+        m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
+        actual_size = std::max(actual_size, logical_bytes);
     }
 
     OPENVINO_ASSERT(m_memory != nullptr, "m_memory is nullptr!!!");