Skip to content

Commit 28549e3

Browse files
committed
Reduce KV-cache memory size by half
Signed-off-by: Min, Byungil <byungil.min@intel.com>
1 parent 7b8fe75 commit 28549e3

File tree

7 files changed

+49
-21
lines changed

7 files changed

+49
-21
lines changed

src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCac
5555
const std::vector<cldnn::layout>& output_layouts,
5656
size_t beam_idx,
5757
size_t concat_idx,
58-
bool has_zp_state);
58+
bool has_zp_state,
59+
bool is_4bit_kv_cache = false);
5960
using Ptr = std::shared_ptr<VariableStateIndirectKVCacheCompressed>;
6061

6162
void set_state(const ov::SoPtr<ov::ITensor>& state) override;
@@ -70,5 +71,6 @@ class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCac
7071

7172
private:
7273
bool m_has_zp_state = false;
74+
bool m_is_4bit_kv_cache = false;
7375
};
7476
} // namespace ov::intel_gpu

src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ class VariableState : public VariableStateBase {
7272
return m_initial_layout;
7373
}
7474

75+
void set_alloc_inner_dim_divisor(size_t divisor) { m_alloc_inner_dim_divisor = divisor; }
76+
7577
ov::element::Type get_user_specified_type() const;
7678

7779
protected:
@@ -82,6 +84,7 @@ class VariableState : public VariableStateBase {
8284
cldnn::memory::ptr m_memory = nullptr;
8385
bool m_transpose_required = false;
8486
size_t actual_size = 0;
87+
size_t m_alloc_inner_dim_divisor = 1;
8588

8689
const cldnn::layout m_initial_layout;
8790

src/plugins/intel_gpu/src/graph/impls/ocl_v2/sdpa_opt.cl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,10 @@ KERNEL(sdpa_opt)(
264264
#if IS_INT4_COMPRESSED && !defined(BEAM_TABLE_TYPE)
265265
#ifdef INPUT1_DIMS_ORDER
266266
const uint key_base_p0 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0);
267-
const uint key_packed_pitch_p0 = (FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0) - key_base_p0) / 2;
267+
const uint key_packed_pitch_p0 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0) - key_base_p0;
268268
#else
269269
const uint key_base_p0 = INPUT1_GET_INDEX(b0_idx, b1_idx, 0, 0);
270-
const uint key_packed_pitch_p0 = K_HEAD_SIZE / 2;
270+
const uint key_packed_pitch_p0 = K_HEAD_SIZE;
271271
#endif
272272
#endif
273273
for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += SUBGROUPS_PER_WG) {
@@ -713,13 +713,13 @@ KERNEL(sdpa_opt)(
713713
uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0);
714714
uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0);
715715
#if IS_INT4_COMPRESSED
716-
const uint value_pitch = (value_offset_next_seq - value_offset) / 2;
716+
const uint value_pitch = value_offset_next_seq - value_offset;
717717
#else
718718
const uint value_pitch = value_offset_next_seq - value_offset;
719719
#endif
720720
#else
721721
#if IS_INT4_COMPRESSED
722-
const uint value_pitch = V_HEAD_SIZE / 2;
722+
const uint value_pitch = V_HEAD_SIZE;
723723
#else
724724
const uint value_pitch = V_HEAD_SIZE;
725725
#endif
@@ -1296,10 +1296,10 @@ KERNEL(sdpa_opt)(
12961296
#if IS_INT4_COMPRESSED && !defined(IS_PAGED_ATTENTION) && !defined(BEAM_TABLE_TYPE)
12971297
#ifdef INPUT1_DIMS_ORDER
12981298
const uint key_base_s1 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0);
1299-
const uint key_packed_pitch_s1 = (FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0) - key_base_s1) / 2;
1299+
const uint key_packed_pitch_s1 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0) - key_base_s1;
13001300
#else
13011301
const uint key_base_s1 = INPUT1_GET_INDEX(b0_idx, b1_idx, 0, 0);
1302-
const uint key_packed_pitch_s1 = K_HEAD_SIZE / 2;
1302+
const uint key_packed_pitch_s1 = K_HEAD_SIZE;
13031303
#endif
13041304
#endif
13051305

@@ -1373,7 +1373,7 @@ KERNEL(sdpa_opt)(
13731373
// INT4: process 2*SUBGROUP_SIZE logical head dims per iteration (one packed byte per lane per token row)
13741374
#define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset);
13751375
#define QUERY_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE)
1376-
const uint key_pitch_int4 = K_HEAD_SIZE / 2;
1376+
const uint key_pitch_int4 = K_HEAD_SIZE;
13771377
for (uint hi = 0; hi < K_HEAD_SIZE; hi += 2 * SUBGROUP_SIZE) {
13781378
QUERY_VEC qvec_lo, qvec_hi;
13791379
uint qlo = hi * TARGET_SEQ_LEN_BLOCK_SIZE + sglid;
@@ -1470,7 +1470,7 @@ KERNEL(sdpa_opt)(
14701470
// INT4 partial block: process 2*SUBGROUP_SIZE logical head dims per iteration
14711471
#define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset)
14721472
#define QUERY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE)
1473-
const uint key_pitch_int4 = K_HEAD_SIZE / 2;
1473+
const uint key_pitch_int4 = K_HEAD_SIZE;
14741474
for (uint hi = 0; hi < K_HEAD_SIZE; hi += 2 * SUBGROUP_SIZE) {
14751475
QUERY_VEC_TYPE qvec_lo, qvec_hi;
14761476
uint qlo = hi * TARGET_SEQ_LEN_BLOCK_SIZE + sglid;
@@ -1856,13 +1856,13 @@ KERNEL(sdpa_opt)(
18561856
uint value_offset_base = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0);
18571857
uint value_offset_next_seq = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 1, 0);
18581858
#if IS_INT4_COMPRESSED
1859-
const uint value_pitch = (value_offset_next_seq - value_offset_base) / 2;
1859+
const uint value_pitch = value_offset_next_seq - value_offset_base;
18601860
#else
18611861
const uint value_pitch = value_offset_next_seq - value_offset_base;
18621862
#endif
18631863
#else
18641864
#if IS_INT4_COMPRESSED
1865-
const uint value_pitch = V_HEAD_SIZE / 2;
1865+
const uint value_pitch = V_HEAD_SIZE;
18661866
#else
18671867
const uint value_pitch = V_HEAD_SIZE;
18681868
#endif

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,10 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
104104
ACCUMULATOR_TYPE scale_tmp = (ACCUMULATOR_TYPE)((UINT4_RANGE) / diff_value);
105105
ACCUMULATOR_TYPE zp_tmp = (ACCUMULATOR_TYPE)(-min_value * scale_tmp); // maps min -> 0, max -> UINT4_RANGE
106106

107-
// INT4 packed buffer: the output layout uses i8 with full head_size shape,
108-
// so divide by 2 to get the correct packed byte offset (2 INT4 values per byte).
109-
const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x) / 2;
107+
// INT4 packed buffer: the output layout uses i8 with full head_size shape.
108+
// Use element-level offset directly (same stride as layout) so that SDPA
109+
// can address rows with the standard GET_INDEX pitch.
110+
const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
110111
// Pairs of consecutive SUBGROUP_SIZE blocks are packed together.
111112
unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i += 2) {
112113
uchar q0 = (uchar)clamp(convert_int_rte((float)val[i] * scale_tmp + zp_tmp), 0, UINT4_RANGE);

src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,11 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed(
165165
const std::vector<cldnn::layout>& output_layouts,
166166
size_t beam_idx,
167167
size_t concat_idx,
168-
bool has_zp_state = false)
168+
bool has_zp_state,
169+
bool is_4bit_kv_cache)
169170
: VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx),
170-
m_has_zp_state(has_zp_state) {
171+
m_has_zp_state(has_zp_state),
172+
m_is_4bit_kv_cache(is_4bit_kv_cache) {
171173
OPENVINO_ASSERT((has_zp_state && output_layouts.size() == 3) ||
172174
(!has_zp_state && output_layouts.size() == 2),
173175
"[GPU] Unexpected number of output layouts for VariableStateIndirectKVCacheCompressed");
@@ -185,6 +187,12 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed(
185187
OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4),
186188
"[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, "
187189
"actual number is ", m_hidden_states.size());
190+
191+
// For 4-bit KV-cache, two INT4 values are packed per byte.
192+
// Halve the innermost dim of the allocation to reduce physical memory usage.
193+
if (m_is_4bit_kv_cache) {
194+
m_hidden_states[0]->set_alloc_inner_dim_divisor(2);
195+
}
188196
}
189197

190198
VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const {

src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -667,13 +667,16 @@ void SyncInferRequest::allocate_states() {
667667
}
668668

669669
if (compressed) {
670+
const auto kv_precision = m_graph->get_config().get_kv_cache_precision();
671+
const bool is_4bit_kv_cache = ov::element::Type(kv_precision).bitwidth() == 4;
670672
m_variables.emplace(vi.first, std::make_shared<VariableStateIndirectKVCacheCompressed>(vi.second,
671673
m_context,
672674
m_shape_predictor,
673675
states_layouts,
674676
beam_axis,
675677
concat_axis,
676-
has_zp_state));
678+
has_zp_state,
679+
is_4bit_kv_cache));
677680
} else if (indirect_kv_cache) {
678681
m_variables.emplace(vi.first, std::make_shared<VariableStateIndirectKVCache>(vi.second,
679682
m_context,

src/plugins/intel_gpu/src/plugin/variable_state.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,21 @@ void VariableState::update_device_buffer() {
119119
const auto alloc_type = m_context->get_engine().use_unified_shared_memory() ? cldnn::allocation_type::usm_device : cldnn::allocation_type::cl_mem;
120120
const auto current_buf_size = m_layout.get_padded_dims();
121121
ov::Shape current_shape(current_buf_size.begin(), current_buf_size.end());
122-
const auto alloc_shape = predict_shape(m_name, cldnn::layout(current_shape, m_layout.data_type, m_layout.format), *m_shape_predictor);
123-
const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
124-
m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
125-
actual_size = std::max(actual_size, alloc_layout.bytes_count());
122+
auto alloc_shape = predict_shape(m_name, cldnn::layout(current_shape, m_layout.data_type, m_layout.format), *m_shape_predictor);
123+
124+
// For INT4 packed KV-cache, halve the innermost dim to reduce physical allocation.
125+
// actual_size tracks LOGICAL capacity (un-halved) for correct max_pad calculations.
126+
if (m_alloc_inner_dim_divisor > 1 && !alloc_shape.empty()) {
127+
auto logical_alloc_shape = alloc_shape;
128+
alloc_shape.back() /= m_alloc_inner_dim_divisor;
129+
const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
130+
m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
131+
actual_size = std::max(actual_size, cldnn::layout(logical_alloc_shape, m_layout.data_type, m_layout.format).bytes_count());
132+
} else {
133+
const auto alloc_layout = cldnn::layout(alloc_shape, m_layout.data_type, m_layout.format);
134+
m_memory = m_context->get_engine().allocate_memory(alloc_layout, alloc_type, false);
135+
actual_size = std::max(actual_size, alloc_layout.bytes_count());
136+
}
126137
}
127138

128139
OPENVINO_ASSERT(m_memory != nullptr, "m_memory is nullptr!!!");

0 commit comments

Comments (0)