openvinotoolkit · dmatveev · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
@@ -19,7 +19,6 @@ ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::C
       m_npuw_model(compiled_model),
       m_num_submodels(m_npuw_model->m_compiled_submodels.size()) {
     m_subrequests.resize(m_num_submodels, {});
-    m_subrequest_devices.resize(m_num_submodels, {});
     m_completion_cbs.resize(m_num_submodels, {});
     if (m_npuw_model->m_acc_check) {
         m_ref_subrequests.resize(m_num_submodels);
@@ -34,8 +33,7 @@ ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::C
 }
 
 ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_requests(std::size_t id,
-                                                                                       std::size_t nireq,
-                                                                                       bool* recompiled) {
+                                                                                       std::size_t nireq) {
     NPUW_ASSERT(nireq > 0);
     RqPtrs rqs;
     rqs.reserve(nireq);
@@ -44,39 +42,9 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[id];
     NPUW_ASSERT(comp_model_desc.replaced_by.value_or(id) == id);
 
-    bool successful = false;
-    bool can_try_again = true;
-
-    // Altering iterators here!! Contracts should be changed!
-    while (!successful && can_try_again) {
-        bool should_recompile = false;
-        try {
-            // FIXME: As the model may recompile, reference
-            // shouldn't be lifted from the loop
-            auto& comp_model = comp_model_desc.compiled_model;
-            rqs.clear();
-            for (std::size_t i = 0u; i < nireq; i++) {
-                rqs.emplace_back(comp_model->create_infer_request(), comp_model._so);
-            }
-            successful = true;
-        } catch (const std::exception& ex) {
-            LOG_WARN("Subgraph [" << id << "] - Failed to create infer request:" << std::endl << ex.what());
-            should_recompile = true;
-        } catch (...) {
-            LOG_WARN("Subgraph [" << id << "] - Failed to create infer request: REASON UNKNOWN");
-            should_recompile = true;
-        }
-        if (should_recompile) {
-            LOG_INFO("- Trying next device...");
-            comp_model_desc.device_it++;
-            can_try_again = m_npuw_model->compile_for_success(id);
-            if (can_try_again && recompiled) {
-                *recompiled = true;
-            }
-        }
-    }  // while(!new_ireq && can_try_again)
-    if (!successful) {
-        OPENVINO_THROW("NPUW: Fatal - couldn't create infer request for Subgraph[", id, "]");
+    auto& comp_model = comp_model_desc.compiled_model;
+    for (std::size_t i = 0u; i < nireq; i++) {
+        rqs.emplace_back(comp_model->create_infer_request(), comp_model._so);
     }
     NPUW_ASSERT(rqs.size() == nireq);
 
@@ -286,8 +254,7 @@ std::vector<ov::ProfilingInfo> ov::npuw::IBaseInferRequest::get_profiling_info()
 
 std::string ov::npuw::IBaseInferRequest::profile_tag(std::size_t idx) const {
     // So far accumulate over devices involved
-    const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real(idx)];
-    return *proto_comp_model_desc.device_it;
+    return m_npuw_model->submodel_device(real(idx));
 }
 
 void ov::npuw::IBaseInferRequest::infer() {
@@ -350,8 +317,7 @@ std::string ov::npuw::IBaseInferRequest::global_input_mem_device(std::size_t idx
 
     const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(idx);
     if (to_submodel != CompiledModel::NO_LINK) {
-        const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real(to_submodel.first)];
-        return *proto_comp_model_desc.device_it;
+        return m_npuw_model->submodel_device(real(to_submodel.first));
     }
 
     // Resort to global again
@@ -361,8 +327,7 @@ std::string ov::npuw::IBaseInferRequest::global_input_mem_device(std::size_t idx
 std::string ov::npuw::IBaseInferRequest::global_output_mem_device(std::size_t idx) const {
     // Pick the affinity based on the producer subgraph
     const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(idx);
-    const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real(from_submodel.first)];
-    return *proto_comp_model_desc.device_it;
+    return m_npuw_model->submodel_device(real(from_submodel.first));
 }
 
 void ov::npuw::IBaseInferRequest::alloc_quant_gather() {
@@ -781,7 +746,7 @@ void ov::npuw::IBaseInferRequest::bind_attention_inputs(std::size_t idx, RqPtr r
                 dst->set_shape(shape);
                 const auto new_ptr = dst->data();
                 if (old_ptr != new_ptr) {
-                    m_footprint[*comp_model_desc.device_it] += dst->get_byte_size();
+                    m_footprint[m_npuw_model->submodel_device(real(idx))] += dst->get_byte_size();
                 }
                 LOG_DEBUG("Do copy: " << shape << "...");
                 view->copy_to(dst._ptr);
@@ -1091,7 +1056,7 @@ bool ov::npuw::IBaseInferRequest::needs_copy(std::size_t idx) const {
     // the set/get_ tensor API
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
     const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
-    if (ov::npuw::util::starts_with(m_subrequest_devices[real_idx], "CPU")) {
+    if (ov::npuw::util::starts_with(m_npuw_model->submodel_device(real_idx), "CPU")) {
         return false;
     }
 

@@ -89,20 +89,14 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
     // function bodies. Function calls are not allowed to have
     // their inference requests anymore - they must be stored
     // only once in the subrequests list
-    RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1, bool* recompiled = nullptr);
+    RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1);
     void ensure_subrequest_is_accurate(std::size_t idx, bool& failover);
     virtual void update_subrequest_links(std::size_t idx) = 0;
 
     std::shared_ptr<ov::npuw::CompiledModel> m_npuw_model;
     std::vector<IBaseInferRequest::Completed> m_completion_cbs;
     RqPtrs m_subrequests;
 
-    // This vector is used to track devices for individual subrequests
-    // here locally. Note that the models can be recompiled in
-    // contexts of other requests (if multiple of those are created)
-    // so this cached information is used to detect these situations.
-    std::vector<std::string> m_subrequest_devices;
-
     struct TensorStorage {
         ov::SoPtr<ov::ITensor> tensor;
         bool persistent = false;       // true for the parent I/O tensors