Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::C
m_npuw_model(compiled_model),
m_num_submodels(m_npuw_model->m_compiled_submodels.size()) {
m_subrequests.resize(m_num_submodels, {});
m_subrequest_devices.resize(m_num_submodels, {});
m_completion_cbs.resize(m_num_submodels, {});
if (m_npuw_model->m_acc_check) {
m_ref_subrequests.resize(m_num_submodels);
Expand All @@ -34,8 +33,7 @@ ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::C
}

ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_requests(std::size_t id,
std::size_t nireq,
bool* recompiled) {
std::size_t nireq) {
NPUW_ASSERT(nireq > 0);
RqPtrs rqs;
rqs.reserve(nireq);
Expand All @@ -44,39 +42,9 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[id];
NPUW_ASSERT(comp_model_desc.replaced_by.value_or(id) == id);

bool successful = false;
bool can_try_again = true;

// Altering iterators here!! Contracts should be changed!
while (!successful && can_try_again) {
bool should_recompile = false;
try {
// FIXME: As the model may recompile, reference
// shouldn't be lifted from the loop
auto& comp_model = comp_model_desc.compiled_model;
rqs.clear();
for (std::size_t i = 0u; i < nireq; i++) {
rqs.emplace_back(comp_model->create_infer_request(), comp_model._so);
}
successful = true;
} catch (const std::exception& ex) {
LOG_WARN("Subgraph [" << id << "] - Failed to create infer request:" << std::endl << ex.what());
should_recompile = true;
} catch (...) {
LOG_WARN("Subgraph [" << id << "] - Failed to create infer request: REASON UNKNOWN");
should_recompile = true;
}
if (should_recompile) {
LOG_INFO("- Trying next device...");
comp_model_desc.device_it++;
can_try_again = m_npuw_model->compile_for_success(id);
if (can_try_again && recompiled) {
*recompiled = true;
}
}
} // while(!new_ireq && can_try_again)
if (!successful) {
OPENVINO_THROW("NPUW: Fatal - couldn't create infer request for Subgraph[", id, "]");
auto& comp_model = comp_model_desc.compiled_model;
for (std::size_t i = 0u; i < nireq; i++) {
rqs.emplace_back(comp_model->create_infer_request(), comp_model._so);
}
NPUW_ASSERT(rqs.size() == nireq);

Expand Down Expand Up @@ -286,8 +254,7 @@ std::vector<ov::ProfilingInfo> ov::npuw::IBaseInferRequest::get_profiling_info()

// Returns a device string for the subrequest at `idx`; the index is first
// mapped through real() before the submodel is looked up.
std::string ov::npuw::IBaseInferRequest::profile_tag(std::size_t idx) const {
// So far accumulate over devices involved
// Fetch the compiled-submodel descriptor at real(idx) and return the device
// its device_it iterator currently points at.
const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real(idx)];
return *proto_comp_model_desc.device_it;
// NOTE(review): unreachable as written - the return above always exits first.
// This looks like the replacement side of a diff (direct device_it access
// above, submodel_device() accessor here); confirm which one should remain.
return m_npuw_model->submodel_device(real(idx));
}

void ov::npuw::IBaseInferRequest::infer() {
Expand Down Expand Up @@ -350,8 +317,7 @@ std::string ov::npuw::IBaseInferRequest::global_input_mem_device(std::size_t idx

const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(idx);
if (to_submodel != CompiledModel::NO_LINK) {
const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real(to_submodel.first)];
return *proto_comp_model_desc.device_it;
return m_npuw_model->submodel_device(real(to_submodel.first));
}

// Resort to global again
Expand All @@ -361,8 +327,7 @@ std::string ov::npuw::IBaseInferRequest::global_input_mem_device(std::size_t idx
// Returns a device string for the global output at `idx`, taken from the
// submodel that the output is linked to.
std::string ov::npuw::IBaseInferRequest::global_output_mem_device(std::size_t idx) const {
// Pick the affinity based on the producer subgraph
// .at(idx) is a checked lookup; `.first` of the link entry is used below as
// the producing submodel's index (mapped through real()).
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(idx);
const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real(from_submodel.first)];
return *proto_comp_model_desc.device_it;
// NOTE(review): unreachable as written - the return above always exits first.
// This looks like the replacement side of a diff (direct device_it access
// above, submodel_device() accessor here); confirm which one should remain.
return m_npuw_model->submodel_device(real(from_submodel.first));
}

void ov::npuw::IBaseInferRequest::alloc_quant_gather() {
Expand Down Expand Up @@ -781,7 +746,7 @@ void ov::npuw::IBaseInferRequest::bind_attention_inputs(std::size_t idx, RqPtr r
dst->set_shape(shape);
const auto new_ptr = dst->data();
if (old_ptr != new_ptr) {
m_footprint[*comp_model_desc.device_it] += dst->get_byte_size();
m_footprint[m_npuw_model->submodel_device(real(idx))] += dst->get_byte_size();
}
LOG_DEBUG("Do copy: " << shape << "...");
view->copy_to(dst._ptr);
Expand Down Expand Up @@ -1091,7 +1056,7 @@ bool ov::npuw::IBaseInferRequest::needs_copy(std::size_t idx) const {
// the set/get_ tensor API
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
if (ov::npuw::util::starts_with(m_subrequest_devices[real_idx], "CPU")) {
if (ov::npuw::util::starts_with(m_npuw_model->submodel_device(real_idx), "CPU")) {
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,20 +89,14 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
// function bodies. Function calls are not allowed to have
// their inference requests anymore - they must be stored
// only once in the subrequests list
RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1, bool* recompiled = nullptr);
RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1);
void ensure_subrequest_is_accurate(std::size_t idx, bool& failover);
virtual void update_subrequest_links(std::size_t idx) = 0;

std::shared_ptr<ov::npuw::CompiledModel> m_npuw_model;
std::vector<IBaseInferRequest::Completed> m_completion_cbs;
RqPtrs m_subrequests;

// This vector is used to track devices for individual subrequests
// here locally. Note that the models can be recompiled in
// contexts of other requests (if multiple of those are created)
// so this cached information is used to detect these situations.
std::vector<std::string> m_subrequest_devices;

struct TensorStorage {
ov::SoPtr<ov::ITensor> tensor;
bool persistent = false; // true for the parent I/O tensors
Expand Down
Loading
Loading