Skip to content

Commit 6b65316

Browse files
authored
[OV][ITT][GPU Plugin] Enable default ITT markers for inference and op submission (#33313)
- Enables default ITT markers for higher level operations such as inference pass, op preparation and submission - Follows the same guidelines to standardize the conventions for namespaces: ov::phases::gpu::inference ov::op::gpu - Supports both synchronous and asynchronous operations Enabling default GPU ITT markers using standard convention - Part 3 This PR is the **third** of a series of PRs to standardize the ITT markers in OpenVINO that will be enabled by default through host-side instrumentation. 1. The first PR addresses the enhancements required in ITT and the framework to support the creation and propagation of IDs when asynchronous execution is in play [PR#33639](#33639). 2. The second PR will standardize ITT markers in the CPU and enhance support to include asynchronous execution [PR#33312](#33312). 3. This **third** PR will enable default markers for the GPU plugin to allow visibility into inference pass begin/end and operator preparation and submission within each inference, following the standardized conventions described in 1 and 2. 4. The final PR will extend the same host-side markers to NPU execution, capturing the inference span and pipeline activity. Summary of the current PR (PR#3): Uses the same convention standardized in [PR#33639](#33639) and ensures the namespaces for GPU plugin activity fall under: ov::phases::gpu::inference ov::op::gpu Details: GPU support is enabled with default ITT markers that support synchronous and asynchronous execution. This PR ensures a standardized convention is followed in the namespaces used. Tickets: [CVS-179230](https://jira.devtools.intel.com/browse/CVS-179230) @isanghao Please review this as you are generally aware of what was discussed --------- Signed-off-by: Vasanth Tovinkere <vasanth.tovinkere@intel.com>
1 parent 91f33ea commit 6b65316

File tree

6 files changed

+21
-5
lines changed

6 files changed

+21
-5
lines changed

src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ class CompiledModel : public ov::ICompiledModel {
6161
RemoteContextImpl::Ptr get_context_impl() const {
6262
return m_context;
6363
}
64+
65+
// Helper function to return the model name for ITT tracing
66+
std::string_view get_model_name() const {
67+
return m_model_name;
68+
}
69+
6470
const std::vector<std::shared_ptr<Graph>>& get_graphs() const;
6571
std::shared_ptr<Graph> get_graph(size_t n) const;
6672

src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ class SyncInferRequest : public ov::ISyncInferRequest {
118118
void init_mappings();
119119
bool is_batched_input(const ov::Output<const ov::Node>& port) const;
120120
uint64_t total_output_bytes = 0;
121-
};
121+
// Variable to hold the inference request string with compiled model name
122+
// to prevent this string being constructed for each inference call
123+
std::string m_itt_infer_request_str;};
122124

123125
} // namespace ov::intel_gpu

src/plugins/intel_gpu/include/intel_gpu/runtime/itt.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
namespace ov::intel_gpu {
1515
namespace itt {
1616
namespace domains {
17+
// Domain namespace to define GPU Inference phase tasks
18+
OV_ITT_DOMAIN(intel_gpu_inference, "ov::phases::gpu::inference");
19+
// Domain namespace for all of the operators
20+
OV_ITT_DOMAIN(intel_gpu_op, "ov::op::gpu");
1721
OV_ITT_DOMAIN(intel_gpu_plugin);
1822
} // namespace domains
1923
} // namespace itt

src/plugins/intel_gpu/src/graph/network.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
761761

762762
for (auto& inst : _exec_order) {
763763
NODE_DEBUG(*inst);
764+
OV_ITT_SCOPED_TASK_BASE(ov::intel_gpu::itt::domains::intel_gpu_op, openvino::itt::handle(inst->id()));
764765

765766
inst->reset_events();
766767

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1993,7 +1993,7 @@ void primitive_inst::reset_flags() {
19931993
}
19941994

19951995
void primitive_inst::prepare_primitive() {
1996-
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("primitive_inst::execute: " + id()));
1996+
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle(id() + "::prepare"));
19971997
const auto& primitive_id = id();
19981998
if (!_has_valid_input) {
19991999
// For unfused network with dynamic_quantization, we may have empty/unused input
@@ -2184,6 +2184,7 @@ void primitive_inst::prepare_primitive() {
21842184
}
21852185

21862186
void primitive_inst::execute() {
2187+
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle(id() + "::execute"));
21872188
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
21882189
if (get_flag(ExecutionFlags::SKIP)) {
21892190
set_out_event(get_network().get_stream().aggregate_events(_impl_params->dep_events));

src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,15 +93,17 @@ SyncInferRequest::SyncInferRequest(const std::shared_ptr<const CompiledModel>& c
9393
, m_context(std::static_pointer_cast<RemoteContextImpl>(compiled_model->get_context_impl()))
9494
, m_shape_predictor(new cldnn::ShapePredictor(&m_graph->get_engine(), m_graph->get_config().get_shape_predictor_settings()))
9595
, m_enable_profiling(m_graph->get_config().get_enable_profiling())
96-
, m_use_external_queue(m_graph->use_external_queue()) {
96+
, m_use_external_queue(m_graph->use_external_queue())
97+
, m_itt_infer_request_str("SyncInferenceGPU::infer::" + std::string(compiled_model->get_model_name())) {
9798
init_mappings();
9899
allocate_inputs();
99100
allocate_outputs();
100101
allocate_states();
101102
}
102103

103104
void SyncInferRequest::infer() {
104-
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::infer");
105+
// String can be constructed once in the constructor
106+
OV_ITT_SCOPED_TASK_BASE(itt::domains::intel_gpu_inference, m_itt_infer_request_str.c_str());
105107
setup_stream_graph();
106108
std::lock_guard<std::mutex> lk(m_graph->get_mutex());
107109
enqueue();
@@ -308,7 +310,7 @@ void SyncInferRequest::enqueue() {
308310
}
309311

310312
void SyncInferRequest::wait() {
311-
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::wait");
313+
OV_ITT_SCOPED_TASK_BASE(itt::domains::intel_gpu_inference, "SyncInferenceGPU::wait");
312314
OPENVINO_ASSERT(!m_internal_outputs.empty(), "[GPU] Inference was not started!\n");
313315

314316
int64_t sync_total_time = 0;

0 commit comments

Comments
 (0)