Changes from all commits (36 commits)
1abe8ec - WIP (p-wysocki, Feb 9, 2026)
34763dd - WIP (p-wysocki, Feb 9, 2026)
e7f8238 - WIP (p-wysocki, Feb 9, 2026)
daef794 - WIP (p-wysocki, Feb 12, 2026)
fdb3a73 - Add tests (p-wysocki, Feb 12, 2026)
829c430 - initial clenaup (p-wysocki, Feb 12, 2026)
a04d165 - Set input as optional (p-wysocki, Feb 12, 2026)
96188fb - Correct tests (p-wysocki, Feb 12, 2026)
4d9f607 - Remove reshape from graph (p-wysocki, Feb 12, 2026)
1f6a6d1 - Remove debug prints (p-wysocki, Feb 12, 2026)
2bf68b5 - Clenaup (p-wysocki, Feb 13, 2026)
bcbb855 - Merge branch 'master' into attn_idea_2 (p-wysocki, Feb 13, 2026)
ed3374d - Sliding window working (p-wysocki, Feb 18, 2026)
0fd5001 - Move sw to gptoss logic (p-wysocki, Feb 18, 2026)
5fbbf5e - Working, with debug prints (p-wysocki, Feb 18, 2026)
81ab320 - Cleanup (p-wysocki, Feb 18, 2026)
4e0d5ac - Merge branch 'attn_idea_2' of https://github.com/p-wysocki/openvino i… (p-wysocki, Feb 18, 2026)
5f5af24 - Cleanup (p-wysocki, Feb 18, 2026)
362cb80 - update copyright (p-wysocki, Feb 18, 2026)
7841c70 - Fix transformation tests, add new one (p-wysocki, Feb 18, 2026)
9810dc3 - Fix convert input tests (p-wysocki, Feb 18, 2026)
890804b - Fix clang (p-wysocki, Feb 18, 2026)
6a62dda - Fix smoke tests (p-wysocki, Feb 18, 2026)
dfc6e1f - Fix smoke test (p-wysocki, Feb 18, 2026)
a412f6c - Update GPU input count (p-wysocki, Feb 25, 2026)
810130d - CR (p-wysocki, Mar 2, 2026)
acbd73e - Add token_type_ids to gemma only (p-wysocki, Mar 2, 2026)
2da2303 - Fix gpu test (p-wysocki, Mar 2, 2026)
03e9935 - Merge branch 'master' into attn_idea_2 (p-wysocki, Mar 10, 2026)
e2347e1 - Merge branch 'master' of https://github.com/openvinotoolkit/openvino … (p-wysocki, Mar 11, 2026)
7176099 - Fix conflict issues (p-wysocki, Mar 11, 2026)
af07897 - Apply CR (p-wysocki, Mar 12, 2026)
c99b9c5 - Merge branch 'master' of https://github.com/openvinotoolkit/openvino … (p-wysocki, Mar 12, 2026)
7e6ba0e - Merge branch 'master' of https://github.com/openvinotoolkit/openvino … (p-wysocki, Mar 19, 2026)
84042e8 - working (p-wysocki, Mar 19, 2026)
6bc0315 - Apply CR (p-wysocki, Mar 19, 2026)
@@ -200,14 +200,11 @@ static std::shared_ptr<ov::Node> handle_baichuan2_13b_alibi(

static std::shared_ptr<ov::Node> handle_gemma3_token_type_ids(
const std::map<std::string, std::shared_ptr<v0::Parameter>>& optional_model_wide_params) {
- if (optional_model_wide_params.find("token_type_ids") != optional_model_wide_params.end()) {
-     auto param = optional_model_wide_params.at("token_type_ids");
-     if (param->get_element_type() != ov::element::i32) {
-         return std::make_shared<v0::Convert>(param, ov::element::i32);
-     }
-     return param;
- }
- return v0::Constant::create(ov::element::i32, ov::Shape{0}, {});
+ auto param = optional_model_wide_params.at("token_type_ids");
+ if (param->get_element_type() != ov::element::i32) {
+     return std::make_shared<v0::Convert>(param, ov::element::i32);
+ }
+ return param;
}

static std::tuple<std::shared_ptr<ov::Node>, std::shared_ptr<ov::Node>> phi3_sliding_window_pattern() {
@@ -438,10 +435,11 @@ ov::pass::StateManagementPattern::StateManagementPattern(

auto sdpa_variants = std::make_shared<Or>(OutputVector{sdpa_with_4_inputs, sdpa_with_5_inputs, sdpa_with_6_inputs});

- // Shared flag to track whether the model is Gemma3, set when any layer matches
- // the gptoss_gemma3 sliding window pattern. Combined with the token_type_ids check,
- // this uniquely identifies Gemma3 (gpt-oss shares the pattern but lacks token_type_ids).
- auto is_gptoss_gemma3 = std::make_shared<bool>(false);
Contributor: Can we define this variable inside the callback?
Contributor: Agreed, it does look strange that it is required to define it outside.
Contributor (author): Gemma3 has a repeating sequence of attention layers: 5x sliding window attention, 1x full attention. The pattern we currently have detects sliding window attention, but token_type_ids has to be passed to the full attention layers as well.

has_token_type_ids is defined outside of the callback as a shared_ptr because it has to stay consistent between all lambda callbacks: since the lambda's capture is =, each callback gets its own copy of the shared pointer to the same object. Without it, token_type_ids would be routed only to the sliding window PAs, and not to the full attention PAs.

Technically we could detect the full attention pattern to avoid the gpt-oss/gemma3 mixup and do the same trick, but then the first 5x sliding window attention layers would not receive the token_type_ids input, because only the first full attention layer (6th in line) would set the variable to true.

Summing up, it may not be as clean as I'd like it to be, but it works. If you insist that this piece of code will cause issues, I can keep looking for a universal pattern which would:

  1. separate gpt-oss and gemma3
  2. work for both sliding window and full attention layers

Contributor: Yeah, it's a slightly dirty solution, but if there's no other option, I believe we can live with it.

"I can keep looking for a universal pattern"

Any ideas of what this could be?

+ // Set to true once a sliding_attention layer matching the gptoss_gemma3 pattern is found
+ // alongside a token_type_ids model input - the combination that uniquely identifies Gemma3
+ // since pattern for full attention mask in Gemma3 is different than sliding window
+ // it has to be persistent in the callback, so shared_ptr is used
+ auto has_token_type_ids = std::make_shared<bool>(false);
Comment on lines +438 to +442

Copilot AI (Mar 31, 2026): [LOW] has_token_type_ids is used as a persisted "Gemma3 detected / enable token_type_ids wiring" flag (it is only updated when the gptoss_gemma3 sliding-window pattern matches), so the name is misleading: there are cases where the model may have a token_type_ids input but this flag stays false until that pattern is seen. Consider renaming it to something like is_gemma3 / enable_gemma3_token_type_ids to reflect the actual semantics and reduce the chance of future misuse.

ov::matcher_pass_callback callback = [=,
&kv_parameters,
Expand Down Expand Up @@ -621,7 +619,9 @@ ov::pass::StateManagementPattern::StateManagementPattern(
}
sliding_window = std::make_shared<v1::Subtract>(v0::Constant::create(element::i32, Shape{}, {2}), offset);
} else if (pattern_map.count(gptoss_gemma3_offset)) {
- *is_gptoss_gemma3 = true;
+ // gptoss_gemma3 pattern + token_type_ids input uniquely identifies Gemma3;
+ // gpt-oss shares this sliding window pattern but has no token_type_ids.
+ *has_token_type_ids = optional_model_wide_params.count("token_type_ids");
auto offset = pattern_map.at(gptoss_gemma3_offset).get_node_shared_ptr();
if (pattern_map.at(gptoss_gemma3_offset).get_partial_shape().rank() != 0) {
offset = std::make_shared<v15::Squeeze>(offset);
@@ -756,7 +756,7 @@ ov::pass::StateManagementPattern::StateManagementPattern(
}
OPENVINO_ASSERT(pa_arguments.size() == 25);

- if (*is_gptoss_gemma3) {
+ if (*has_token_type_ids) {
pa_arguments.insert(pa_arguments.begin() + 25, handle_gemma3_token_type_ids(optional_model_wide_params));
} else {
pa_arguments.insert(pa_arguments.begin() + 25, v0::Constant::create(element::i32, Shape{0}, {}));
93 changes: 93 additions & 0 deletions src/core/tests/type_prop/paged_attention.cpp
@@ -270,5 +270,98 @@ TEST(type_prop, paged_attention_invalid_rank_key_cache) {
EXPECT_THROW(std::ignore = std::make_shared<op::PagedAttentionExtension>(args), ov::NodeValidationFailure);
}

static ov::OutputVector make_args_with_token_type(const std::shared_ptr<ov::op::v0::Parameter>& token_type_ids) {
using namespace ov::op;
const auto query = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{3, 4});
const auto key = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{3, 4});
const auto value = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{3, 4});
const auto key_cache = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{6, 2, 5, 4});
const auto value_cache = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{6, 2, 5, 4});
const auto past_lens = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{5});
const auto subsequence_begins = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{5});
const auto block_indices = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{15});
const auto block_indices_begins = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{8});
const auto scale = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{});
const auto sliding_window = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{});
const auto alibi_slopes = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{9});
const auto max_context_len = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{});
const auto score_aggregation_window = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{5});
const auto rotated_block_indices = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{3});
const auto rotation_deltas = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{12, 1});
const auto rotation_trig_lut = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{256, 4});
const auto xattention_threshold = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{5});
const auto xattention_block_size = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{});
const auto xattention_stride = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{});
const auto sinks = std::make_shared<v0::Parameter>(ov::element::f32, ov::PartialShape{1, 2, 1, 1});
const auto adaptive_rkv_start_size = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{});
const auto adaptive_rkv_evictable_sizes = std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{5});
const auto adaptive_rkv_diversity_block_set_indices =
std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{10});
const auto adaptive_rkv_diversity_block_set_indices_begins =
std::make_shared<v0::Parameter>(ov::element::i32, ov::PartialShape{5});

return {query,
key,
value,
key_cache,
value_cache,
past_lens,
subsequence_begins,
block_indices,
block_indices_begins,
scale,
sliding_window,
alibi_slopes,
max_context_len,
score_aggregation_window,
rotated_block_indices,
rotation_deltas,
rotation_trig_lut,
xattention_threshold,
xattention_block_size,
xattention_stride,
sinks,
adaptive_rkv_start_size,
adaptive_rkv_evictable_sizes,
adaptive_rkv_diversity_block_set_indices,
adaptive_rkv_diversity_block_set_indices_begins,
token_type_ids};
}

TEST(type_prop, paged_attention_token_type_ids_1d) {
const auto token_type_ids = std::make_shared<op::v0::Parameter>(ov::element::i32, ov::PartialShape{3});
const auto args = make_args_with_token_type(token_type_ids);
const auto op = std::make_shared<op::PagedAttentionExtension>(args);
EXPECT_EQ(op->get_output_element_type(0), ov::element::f32);
EXPECT_EQ(op->get_output_partial_shape(0), (ov::PartialShape{3, 4}));
}

TEST(type_prop, paged_attention_token_type_ids_2d) {
const auto token_type_ids = std::make_shared<op::v0::Parameter>(ov::element::i32, ov::PartialShape{1, 3});
const auto args = make_args_with_token_type(token_type_ids);
const auto op = std::make_shared<op::PagedAttentionExtension>(args);
EXPECT_EQ(op->get_output_element_type(0), ov::element::f32);
EXPECT_EQ(op->get_output_partial_shape(0), (ov::PartialShape{3, 4}));
}

TEST(type_prop, paged_attention_token_type_ids_dynamic_shape) {
const auto token_type_ids =
std::make_shared<op::v0::Parameter>(ov::element::i32, ov::PartialShape{ov::Dimension::dynamic()});
const auto args = make_args_with_token_type(token_type_ids);
EXPECT_NO_THROW(std::ignore = std::make_shared<op::PagedAttentionExtension>(args));
}

TEST(type_prop, paged_attention_invalid_type_token_type_ids) {
const auto token_type_ids = std::make_shared<op::v0::Parameter>(ov::element::f32, ov::PartialShape{3});
const auto args = make_args_with_token_type(token_type_ids);
EXPECT_THROW(std::ignore = std::make_shared<op::PagedAttentionExtension>(args), ov::NodeValidationFailure);
}

TEST(type_prop, paged_attention_invalid_rank_token_type_ids) {
const auto token_type_ids = std::make_shared<op::v0::Parameter>(ov::element::i32, ov::PartialShape{1, 1, 3});
const auto args = make_args_with_token_type(token_type_ids);
EXPECT_THROW(std::ignore = std::make_shared<op::PagedAttentionExtension>(args), ov::NodeValidationFailure);
}

} // namespace testing
} // namespace ov