Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions scripts/performance/perf_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,16 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
if self.nsys_trace is not None:
launcher.nsys_trace = self.nsys_trace

# Combine default extra args with user-provided extra args
# Reduce CPU-side collection overhead: disable context-switch tracing,
# backtrace collection, and CUDA graph node tracing.
existing = launcher.nsys_extra_args or []
existing = [a for a in existing if not a.startswith("--cuda-graph-trace")]
existing.extend(["--cpuctxsw=none", "--backtrace=none"])
launcher.nsys_extra_args = existing

# Combine with user-provided extra args (user args first for precedence)
if self.nsys_extra_args is not None:
# Get existing launcher extra args (nemo_run defaults)
existing_extra_args = launcher.nsys_extra_args or []
# Combine user args with existing args (user args first for precedence)
launcher.nsys_extra_args = self.nsys_extra_args + existing_extra_args
launcher.nsys_extra_args = self.nsys_extra_args + launcher.nsys_extra_args
logger.info(f"Combined nsys_extra_args: {launcher.nsys_extra_args}")

if isinstance(executor, SlurmExecutor):
Expand Down
2 changes: 2 additions & 0 deletions scripts/performance/rubin_launch_interactive.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ if [ "${ENABLE_NSYS}" = "1" ]; then
NSYS_PREFIX=(
nsys profile
-s none
--cpuctxsw=none
--backtrace=none
-t "${NSYS_TRACE}"
-o "${NSYS_OUTPUT}"
--force-overwrite true
Expand Down
7 changes: 7 additions & 0 deletions src/megatron/bridge/recipes/run_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,13 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
launcher.nsys_profile = True
launcher.nsys_trace = self.nsys_trace or ["nvtx", "cuda"]

# Reduce CPU-side collection overhead: disable context-switch tracing,
# backtrace collection, and CUDA graph node tracing.
existing = launcher.nsys_extra_args or []
existing = [a for a in existing if not a.startswith("--cuda-graph-trace")]
existing.extend(["--cpuctxsw=none", "--backtrace=none"])
launcher.nsys_extra_args = existing

if isinstance(executor, SlurmExecutor):
# NOTE: DO NOT change to f-string, `%q{}` is Slurm placeholder
launcher.nsys_filename = "profile_%p_%q{SLURM_JOB_ID}_node%q{SLURM_NODEID}_rank%q{SLURM_PROCID}"
Expand Down
Loading