diff --git a/docs/data-sources/gpu.md b/docs/data-sources/gpu.md index 2f899985bcf..d700b56cdd3 100644 --- a/docs/data-sources/gpu.md +++ b/docs/data-sources/gpu.md @@ -87,6 +87,12 @@ data_sources: { `counter_period_ns` sets the desired sampling interval. +Alternatively, counters can be selected by name using `counter_names`. Use +either `counter_ids` or `counter_names`, not both. Not all producers support +this — check `supports_counter_names` in the `GpuCounterDescriptor` data +source descriptor. Some producers may also support glob patterns in +`counter_names` for matching multiple counters, but this is not guaranteed. + ### GPU memory Total GPU memory usage per process is collected via ftrace: @@ -167,6 +173,56 @@ data_sources: { } ``` +For more control over which GPU activities are instrumented, use +`instrumented_sampling_config` instead of the `instrumented_sampling` bool. +This enables a pipeline of filters applied in the following order: + +1. **Activity name filtering**: If `activity_name_filters` is non-empty, the + activity must match at least one filter. Each filter has a required + `name_glob` pattern and an optional `name_base` (defaults to + `MANGLED_KERNEL_NAME`). If empty, all activities pass this step. + +2. **TX range filtering**: If `activity_tx_include_globs` is non-empty, the + activity must fall within a TX range (e.g. NVTX range for CUDA) matching + one of the include globs. Activities in TX ranges matching + `activity_tx_exclude_globs` are excluded (excludes take precedence over + includes). TX ranges can be nested, and an activity matches if any range + in its nesting hierarchy matches. If both are empty, all activities pass + this step. + +3. **Range-based sampling**: If `activity_ranges` is non-empty, only + activities within the specified skip/count ranges are instrumented. + `skip` defaults to 0 and `count` defaults to UINT32\_MAX (all remaining + activities) when not specified.
If empty, all activities that passed the + previous steps are instrumented. + +Example configuration that instruments only activities with demangled kernel +names matching `"myKernel*"` within TX ranges matching `"training*"`, +skipping the first 10 matching activities and then instrumenting 5: + +``` +data_sources: { + config { + name: "gpu.counters" + gpu_counter_config { + counter_names: "sm__cycles_elapsed.avg" + counter_names: "sm__cycles_active.avg" + instrumented_sampling_config { + activity_name_filters { + name_glob: "myKernel*" + name_base: DEMANGLED_KERNEL_NAME + } + activity_tx_include_globs: "training*" + activity_ranges { + skip: 10 + count: 5 + } + } + } + } +} +``` + Counter descriptor mode 2 is recommended for GPGPU use-cases: the producer emits an `InternedGpuCounterDescriptor` referenced by IID, giving each trusted sequence its own scoped counter IDs. This avoids the global diff --git a/protos/perfetto/common/gpu_counter_descriptor.proto b/protos/perfetto/common/gpu_counter_descriptor.proto index bb85cb79742..ab68cab426f 100644 --- a/protos/perfetto/common/gpu_counter_descriptor.proto +++ b/protos/perfetto/common/gpu_counter_descriptor.proto @@ -82,6 +82,11 @@ message GpuCounterDescriptor { // command buffer. optional bool supports_instrumented_sampling = 5; + // optional. The producer supports selecting counters by name via + // GpuCounterConfig.counter_names. Not all producers support this; Android + // GPU producers typically do not. + optional bool supports_counter_names = 6; + // next id: 41 enum MeasureUnit { NONE = 0; diff --git a/protos/perfetto/config/gpu/gpu_counter_config.proto b/protos/perfetto/config/gpu/gpu_counter_config.proto index d51a997bf16..f50db55c6a8 100644 --- a/protos/perfetto/config/gpu/gpu_counter_config.proto +++ b/protos/perfetto/config/gpu/gpu_counter_config.proto @@ -22,12 +22,94 @@ message GpuCounterConfig { // Desired sampling interval for counters. 
optional uint64 counter_period_ns = 1; - // List of counters to be sampled. Counter IDs correspond to the ones - // described in GpuCounterSpec in the data source descriptor. + // Selects which counters to sample. Use either counter_ids or counter_names, + // not both. Counter IDs and names correspond to the ones described in + // GpuCounterSpec in the data source descriptor. + + // List of counter IDs to be sampled. repeated uint32 counter_ids = 2; + // List of counter names to be sampled. Requires producer support; check + // GpuCounterDescriptor.supports_counter_names in the data source descriptor. + // Some producers may also support glob patterns for matching multiple + // counters by name but this is not guaranteed. + repeated string counter_names = 6; + + // Configuration for sampling counters by instrumenting command buffers. + // + // When instrumented_sampling_config is used (instead of the + // instrumented_sampling bool), the following steps determine whether + // instrumented counters are enabled for a given GPU activity: + // + // 1. Activity name filtering: If activity_name_filters is non-empty, the + // activity must match at least one filter. If empty, all activities + // pass this step. + // 2. TX range filtering: If activity_tx_include_globs is non-empty, the + // activity must fall within a matching TX range. Activities in TX + // ranges matching activity_tx_exclude_globs are excluded (excludes + // take precedence over includes). If both are empty, all activities + // pass this step. + // 3. Range-based sampling: If activity_ranges is non-empty, only + // activities within the specified skip/count ranges are instrumented. + // If empty, all activities that passed the previous steps are + // instrumented. + message InstrumentedSamplingConfig { + // Filters GPU activities by name. Each filter specifies a glob pattern + // and the basis for matching (mangled or demangled kernel name).
+ message ActivityNameFilter { + enum NameBase { + MANGLED_KERNEL_NAME = 0; + DEMANGLED_KERNEL_NAME = 1; + } + + // required. Glob pattern to use for GPU activity name filtering. + optional string name_glob = 1; + + // Basis for name filtering: MANGLED_KERNEL_NAME matches name_glob + // against the mangled kernel name, DEMANGLED_KERNEL_NAME against the + // demangled one. Defaults to MANGLED_KERNEL_NAME if not specified. + optional NameBase name_base = 2; + } + + // NOTE(review): this message only uses field numbers 3, 5, 6 and 7. If + // 1, 2 or 4 were assigned in an earlier revision, mark them `reserved`; + // TODO confirm. + + // GPU activity name filters. An activity matches if it matches any filter. + repeated ActivityNameFilter activity_name_filters = 3; + + // Glob patterns to use for including GPU activities in TX ranges. TX + // ranges are in-process annotations that mark different sections of GPU + // work (e.g. NVTX ranges for CUDA). TX ranges can be nested, and an + // activity is included if any range in its nesting hierarchy matches. + // Only activities that fall within a matching TX range will be + // instrumented. + repeated string activity_tx_include_globs = 6; + + // Glob patterns for excluding GPU activities that fall within matching + // TX ranges. TX ranges can be nested, and an activity is excluded if any + // range in its nesting hierarchy matches. Excludes take precedence over + // includes. + // NOTE(review): presumably, when only exclude globs are set, every + // activity outside the excluded ranges passes this filter; TODO confirm. + repeated string activity_tx_exclude_globs = 7; + + // Defines a range of GPU activities to instrument. + message ActivityRange { + // Number of GPU activities to skip before starting to instrument + // command buffers. Defaults to 0 if not specified. + optional uint32 skip = 1; + + // Limit for the number of GPU activities to sample counters for by + // instrumenting command buffers. Defaults to UINT32_MAX (all + // remaining activities) if not specified. + optional uint32 count = 2; + } + + // Ranges of GPU activities to instrument. Applied after activity name + // and TX range filters. If empty, all activities that passed the + // previous filters are instrumented. + // NOTE(review): how multiple ranges combine (e.g. whether each `skip` is + // counted from the end of the previous range) is not stated here; TODO + // confirm. + repeated ActivityRange activity_ranges = 5; + } + // Sample counters by instrumenting command buffers.
- optional bool instrumented_sampling = 3; + oneof instrumented_sampling_mode { + bool instrumented_sampling = 3; + InstrumentedSamplingConfig instrumented_sampling_config = 5; + } // Fix gpu clock rate during trace session. optional bool fix_gpu_clock = 4; diff --git a/protos/perfetto/config/perfetto_config.proto b/protos/perfetto/config/perfetto_config.proto index 2193bb2f689..f2dc838dfba 100644 --- a/protos/perfetto/config/perfetto_config.proto +++ b/protos/perfetto/config/perfetto_config.proto @@ -95,6 +95,11 @@ message GpuCounterDescriptor { // command buffer. optional bool supports_instrumented_sampling = 5; + // optional. The producer supports selecting counters by name via + // GpuCounterConfig.counter_names. Not all producers support this; Android + // GPU producers typically do not. + optional bool supports_counter_names = 6; + // next id: 41 enum MeasureUnit { NONE = 0; @@ -1651,12 +1656,94 @@ message GpuCounterConfig { // Desired sampling interval for counters. optional uint64 counter_period_ns = 1; - // List of counters to be sampled. Counter IDs correspond to the ones - // described in GpuCounterSpec in the data source descriptor. + // Selects which counters to sample. Use either counter_ids or counter_names, + // not both. Counter IDs and names correspond to the ones described in + // GpuCounterSpec in the data source descriptor. + + // List of counter IDs to be sampled. repeated uint32 counter_ids = 2; + // List of counter names to be sampled. Requires producer support; check + // GpuCounterDescriptor.supports_counter_names in the data source descriptor. + // Some producers may also support glob patterns for matching multiple + // counters by name but this is not guaranteed. + repeated string counter_names = 6; + + // Configuration for sampling counters by instrumenting command buffers. 
+ // + // When instrumented_sampling_config is used (instead of the + // instrumented_sampling bool), the following steps determine whether + // instrumented counters are enabled for a given GPU activity: + // + // 1. Activity name filtering: If activity_name_filters is non-empty, the + // activity must match at least one filter. If empty, all activities + // pass this step. + // 2. TX range filtering: If activity_tx_include_globs is non-empty, the + // activity must fall within a matching TX range. Activities in TX + // ranges matching activity_tx_exclude_globs are excluded (excludes + // take precedence over includes). If both are empty, all activities + // pass this step. + // 3. Range-based sampling: If activity_ranges is non-empty, only + // activities within the specified skip/count ranges are instrumented. + // If empty, all activities that passed the previous steps are + // instrumented. + message InstrumentedSamplingConfig { + // Filters GPU activities by name. Each filter specifies a glob pattern + // and the basis for matching (mangled or demangled kernel name). + message ActivityNameFilter { + enum NameBase { + MANGLED_KERNEL_NAME = 0; + DEMANGLED_KERNEL_NAME = 1; + } + + // required. Glob pattern to use for GPU activity name filtering. + optional string name_glob = 1; + + // Basis for name filtering: MANGLED_KERNEL_NAME matches name_glob + // against the mangled kernel name, DEMANGLED_KERNEL_NAME against the + // demangled one. Defaults to MANGLED_KERNEL_NAME if not specified. + optional NameBase name_base = 2; + } + + // NOTE(review): this message only uses field numbers 3, 5, 6 and 7. If + // 1, 2 or 4 were assigned in an earlier revision, mark them `reserved`; + // TODO confirm. + + // GPU activity name filters. An activity matches if it matches any filter. + repeated ActivityNameFilter activity_name_filters = 3; + + // Glob patterns to use for including GPU activities in TX ranges. TX + // ranges are in-process annotations that mark different sections of GPU + // work (e.g. NVTX ranges for CUDA). TX ranges can be nested, and an + // activity is included if any range in its nesting hierarchy matches. + // Only activities that fall within a matching TX range will be + // instrumented.
+ repeated string activity_tx_include_globs = 6; + + // Glob patterns for excluding GPU activities that fall within matching + // TX ranges. TX ranges can be nested, and an activity is excluded if any + // range in its nesting hierarchy matches. Excludes take precedence over + // includes. + // NOTE(review): presumably, when only exclude globs are set, every + // activity outside the excluded ranges passes this filter; TODO confirm. + repeated string activity_tx_exclude_globs = 7; + + // Defines a range of GPU activities to instrument. + message ActivityRange { + // Number of GPU activities to skip before starting to instrument + // command buffers. Defaults to 0 if not specified. + optional uint32 skip = 1; + + // Limit for the number of GPU activities to sample counters for by + // instrumenting command buffers. Defaults to UINT32_MAX (all + // remaining activities) if not specified. + optional uint32 count = 2; + } + + // Ranges of GPU activities to instrument. Applied after activity name + // and TX range filters. If empty, all activities that passed the + // previous filters are instrumented. + // NOTE(review): how multiple ranges combine (e.g. whether each `skip` is + // counted from the end of the previous range) is not stated here; TODO + // confirm. + repeated ActivityRange activity_ranges = 5; + } + // Sample counters by instrumenting command buffers. - optional bool instrumented_sampling = 3; + oneof instrumented_sampling_mode { + bool instrumented_sampling = 3; + InstrumentedSamplingConfig instrumented_sampling_config = 5; + } // Fix gpu clock rate during trace session. optional bool fix_gpu_clock = 4; diff --git a/protos/perfetto/trace/perfetto_trace.proto b/protos/perfetto/trace/perfetto_trace.proto index 46a212c3904..1a2504ae7c4 100644 --- a/protos/perfetto/trace/perfetto_trace.proto +++ b/protos/perfetto/trace/perfetto_trace.proto @@ -95,6 +95,11 @@ message GpuCounterDescriptor { // command buffer. optional bool supports_instrumented_sampling = 5; + // optional. The producer supports selecting counters by name via + // GpuCounterConfig.counter_names. Not all producers support this; Android + // GPU producers typically do not.
+ optional bool supports_counter_names = 6; + // next id: 41 enum MeasureUnit { NONE = 0; @@ -1651,12 +1656,94 @@ message GpuCounterConfig { // Desired sampling interval for counters. optional uint64 counter_period_ns = 1; - // List of counters to be sampled. Counter IDs correspond to the ones - // described in GpuCounterSpec in the data source descriptor. + // Selects which counters to sample. Use either counter_ids or counter_names, + // not both. Counter IDs and names correspond to the ones described in + // GpuCounterSpec in the data source descriptor. + + // List of counter IDs to be sampled. repeated uint32 counter_ids = 2; + // List of counter names to be sampled. Requires producer support; check + // GpuCounterDescriptor.supports_counter_names in the data source descriptor. + // Some producers may also support glob patterns for matching multiple + // counters by name but this is not guaranteed. + repeated string counter_names = 6; + + // Configuration for sampling counters by instrumenting command buffers. + // + // When instrumented_sampling_config is used (instead of the + // instrumented_sampling bool), the following steps determine whether + // instrumented counters are enabled for a given GPU activity: + // + // 1. Activity name filtering: If activity_name_filters is non-empty, the + // activity must match at least one filter. If empty, all activities + // pass this step. + // 2. TX range filtering: If activity_tx_include_globs is non-empty, the + // activity must fall within a matching TX range. Activities in TX + // ranges matching activity_tx_exclude_globs are excluded (excludes + // take precedence over includes). If both are empty, all activities + // pass this step. + // 3. Range-based sampling: If activity_ranges is non-empty, only + // activities within the specified skip/count ranges are instrumented. + // If empty, all activities that passed the previous steps are + // instrumented. 
+ message InstrumentedSamplingConfig { + // Filters GPU activities by name. Each filter specifies a glob pattern + // and the basis for matching (mangled or demangled kernel name). + message ActivityNameFilter { + enum NameBase { + MANGLED_KERNEL_NAME = 0; + DEMANGLED_KERNEL_NAME = 1; + } + + // required. Glob pattern to use for GPU activity name filtering. + optional string name_glob = 1; + + // Basis for name filtering: MANGLED_KERNEL_NAME matches name_glob + // against the mangled kernel name, DEMANGLED_KERNEL_NAME against the + // demangled one. Defaults to MANGLED_KERNEL_NAME if not specified. + optional NameBase name_base = 2; + } + + // NOTE(review): this message only uses field numbers 3, 5, 6 and 7. If + // 1, 2 or 4 were assigned in an earlier revision, mark them `reserved`; + // TODO confirm. + + // GPU activity name filters. An activity matches if it matches any filter. + repeated ActivityNameFilter activity_name_filters = 3; + + // Glob patterns to use for including GPU activities in TX ranges. TX + // ranges are in-process annotations that mark different sections of GPU + // work (e.g. NVTX ranges for CUDA). TX ranges can be nested, and an + // activity is included if any range in its nesting hierarchy matches. + // Only activities that fall within a matching TX range will be + // instrumented. + repeated string activity_tx_include_globs = 6; + + // Glob patterns for excluding GPU activities that fall within matching + // TX ranges. TX ranges can be nested, and an activity is excluded if any + // range in its nesting hierarchy matches. Excludes take precedence over + // includes. + // NOTE(review): presumably, when only exclude globs are set, every + // activity outside the excluded ranges passes this filter; TODO confirm. + repeated string activity_tx_exclude_globs = 7; + + // Defines a range of GPU activities to instrument. + message ActivityRange { + // Number of GPU activities to skip before starting to instrument + // command buffers. Defaults to 0 if not specified. + optional uint32 skip = 1; + + // Limit for the number of GPU activities to sample counters for by + // instrumenting command buffers. Defaults to UINT32_MAX (all + // remaining activities) if not specified. + optional uint32 count = 2; + } + + // Ranges of GPU activities to instrument. Applied after activity name + // and TX range filters. If empty, all activities that passed the + // previous filters are instrumented. + // NOTE(review): how multiple ranges combine (e.g. whether each `skip` is + // counted from the end of the previous range) is not stated here; TODO + // confirm.
+ repeated ActivityRange activity_ranges = 5; + } + // Sample counters by instrumenting command buffers. - optional bool instrumented_sampling = 3; + oneof instrumented_sampling_mode { + bool instrumented_sampling = 3; + InstrumentedSamplingConfig instrumented_sampling_config = 5; + } // Fix gpu clock rate during trace session. optional bool fix_gpu_clock = 4;