diff --git a/nemo_rl/distributed/worker_groups.py b/nemo_rl/distributed/worker_groups.py
index e4045183c2..84aaaad567 100644
--- a/nemo_rl/distributed/worker_groups.py
+++ b/nemo_rl/distributed/worker_groups.py
@@ -497,8 +497,13 @@ def _create_workers_from_bundle_indices(
                 "AVAILABLE_PORT_LIST": str(available_ports),
             }
         )
-        # Remove Ray-specific environment variables, let the worker itself set them.
-        worker_env_vars.pop("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", None)
+        # Preserve RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1 to prevent Ray
+        # from masking CUDA_VISIBLE_DEVICES per actor. GPU masking triggers NCCL
+        # bugs on NVSwitch topologies (H200/P5en, H100/P5) including cuMem import
+        # penalty (nccl#1749) and NVLS rank ordering corruption (nccl#1906).
+        # Workers use explicit torch.cuda.set_device(local_rank) instead.
+        # See: https://github.com/NVIDIA-NeMo/RL/issues/1963
+        worker_env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1"
         worker_env_vars.pop("RAY_CLIENT_MODE", None)
         worker_env_vars.pop("RAY_JOB_ID", None)
         worker_env_vars.pop("RAY_LD_PRELOAD", None)