add support for QNN stateful models

qti-ashimaj · qti-ashimaj · commit 9d44197bb544 · 2026-03-06T13:54:57.000+05:30
diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
@@ -6,6 +6,7 @@
 #include "kv_cache.h"
 #include "windowed_kv_cache.h"
 #include "../openvino/interface.h"
+#include "../qnn/interface.h"
 #include <algorithm>
 
 namespace Generators {
@@ -507,10 +508,10 @@ bool IsCacheNeeded(const Model& model) {
 }  // namespace
 
 std::unique_ptr<KeyValueCache> CreateKeyValueCache(State& state) {
-  // For OpenVINO Stateful models, they do not contain exposed past/present KV tensors.
+  // OpenVINO and QNN stateful models do not contain exposed past/present KV tensors.
   // In this case, 'IsCacheNeeded' below will return false. But in this case we need to create a
   // special 'ModelManagedKeyValueCache' object, and so we check this condition first.
-  if (IsOpenVINOStatefulModel(state.model_)) {
+  if (IsOpenVINOStatefulModel(state.model_) || IsQNNStatefulModel(state.model_)) {
     if (g_log.enabled)
       Log("info", "CreateKeyValueCache: Creating ModelManagedKeyValueCache");
     return std::make_unique<ModelManagedKeyValueCache>(state);
diff --git a/src/models/logits.cpp b/src/models/logits.cpp
@@ -6,6 +6,7 @@
 #include "model.h"
 #include "logits.h"
 #include "../openvino/interface.h"
+#include "../qnn/interface.h"
 
 namespace Generators {
 
@@ -17,8 +18,8 @@ Logits::Logits(State& state)
 
   input_sequence_lengths.resize(state_.params_->search.batch_size);
 
-  if (IsOpenVINOStatefulModel(state.model_) || state.model_.IsPruned()) {
-    // In the case of OpenVINO stateful models, or any model whose ONNX graph
+  if (IsOpenVINOStatefulModel(state.model_) || IsQNNStatefulModel(state.model_) || state.model_.IsPruned()) {
+    // In the case of OpenVINO and QNN stateful models, or any model whose ONNX graph
     // has been patched to only output last-token logits (logits dim[1]==1), they only return the
     // sliced logits needed for sampling. For example, given 43 prompt tokens, instead of returning
     // logits of the shape:  [1,43,<vocab_size>]
diff --git a/src/qnn/interface.cpp b/src/qnn/interface.cpp
@@ -3,6 +3,7 @@
 
 #include "../generators.h"
 #include "../search.h"
+#include "../models/model.h"
 #include "interface.h"
 
 namespace Generators {
@@ -78,4 +79,24 @@ DeviceInterface* GetQNNInterface() {
   return g_device.get();
 }
 
+bool IsQNNStatefulModel(const Model& model) {  // true when the QNN provider options flag a stateful model
+  if (model.p_device_->GetType() == DeviceType::QNN || model.p_device_->GetType() == DeviceType::CPU) {
+    const auto& provider_options = model.config_->model.decoder.session_options.provider_options;
+    for (auto& po : provider_options) {
+      if (po.name == "QNN") {  // only provider entries named "QNN" are inspected
+        const auto& qnn_options = po.options;
+        for (auto& option : qnn_options) {
+          // A QNN provider option 'genai_model' equal to "True" (exact, case-sensitive match) means the
+          // session will encapsulate a stateful model, so the KV cache will be managed internally.
+          if (option.first == "genai_model" && option.second == "True") {
+            return true;
+          }
+        }
+      }
+    }
+  }
+
+  return false;  // device type is neither QNN nor CPU, or no "QNN" entry sets genai_model to "True"
+}
+
 }  // namespace Generators
diff --git a/src/qnn/interface.h b/src/qnn/interface.h
@@ -5,4 +5,7 @@ namespace Generators {
 
 DeviceInterface* GetQNNInterface();
 
+struct Model;
+bool IsQNNStatefulModel(const Model& model);
+
 }  // namespace Generators