Skip to content

Commit d53eb00

Browse files
Authored by ytl0623, with co-authors pre-commit-ci[bot] and ericspod
Support MIG UUID and respect CUDA_VISIBLE_DEVICES in nnUNetV2Runner (#8716)
Fixes #7497 ### Description This PR fixes two critical issues when running `nnUNetV2Runner` on NVIDIA MIG (Multi-Instance GPU) environments or when using a specific `CUDA_VISIBLE_DEVICES` configuration. ### Types of changes - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. --------- Signed-off-by: ytl0623 <david89062388@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Kerfoot <17726042+ericspod@users.noreply.github.com>
1 parent 894068a commit d53eb00

File tree

1 file changed

+77
-34
lines changed

1 file changed

+77
-34
lines changed

monai/apps/nnunet/nnunetv2_runner.py

Lines changed: 77 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import glob
1616
import os
17+
import shlex
1718
import subprocess
1819
from typing import Any
1920

@@ -486,16 +487,16 @@ def plan_and_process(
486487
if not no_pp:
487488
self.preprocess(c, n_proc, overwrite_plans_name, verbose)
488489

489-
def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int = 0, **kwargs: Any) -> None:
490+
def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int | str = 0, **kwargs: Any) -> None:
490491
"""
491492
Run the training on a single GPU with one specified configuration provided.
492-
Note: this will override the environment variable `CUDA_VISIBLE_DEVICES`.
493+
Note: if CUDA_VISIBLE_DEVICES is already set and gpu_id resolves to 0, the existing value is preserved;
494+
otherwise it is set to gpu_id.
493495
494496
Args:
495497
config: configuration that should be trained. Examples: "2d", "3d_fullres", "3d_lowres".
496498
fold: fold of the 5-fold cross-validation. Should be an int between 0 and 4.
497-
gpu_id: an integer to select the device to use, or a tuple/list of GPU device indices used for multi-GPU
498-
training (e.g., (0,1)). Default: 0.
499+
gpu_id: an int, MIG UUID (str), or tuple/list of GPU indices for multi-GPU training (e.g., (0,1)). Default: 0.
499500
kwargs: this optional parameter allows you to specify additional arguments in
500501
``nnunetv2.run.run_training.run_training_entry``.
501502
@@ -525,35 +526,70 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int
525526
kwargs.pop("npz")
526527
logger.warning("please specify the `export_validation_probabilities` in the __init__ of `nnUNetV2Runner`.")
527528

528-
cmd = self.train_single_model_command(config, fold, gpu_id, kwargs)
529-
run_cmd(cmd, shell=True)
529+
cmd, env = self.train_single_model_command(config, fold, gpu_id, kwargs)
530+
run_cmd(cmd, env=env)
530531

531-
def train_single_model_command(self, config, fold, gpu_id, kwargs):
532-
if isinstance(gpu_id, (tuple, list)):
532+
def train_single_model_command(
533+
self, config: str, fold: int, gpu_id: int | str | tuple | list, kwargs: dict[str, Any]
534+
) -> tuple[list[str], dict[str, str]]:
535+
"""
536+
Build the shell command string for training a single nnU-Net model.
537+
538+
Args:
539+
config: Configuration name (e.g., "3d_fullres").
540+
fold: Cross-validation fold index (0-4).
541+
gpu_id: Device selector—int, str (MIG UUID), or tuple/list for multi-GPU.
542+
kwargs: Additional CLI arguments forwarded to nnUNetv2_train.
543+
544+
Returns:
545+
Tuple of (cmd, env) where cmd is a list[str] of argv entries and env is a dict[str, str]
546+
passed to the subprocess.
547+
548+
Raises:
549+
ValueError: If gpu_id is an empty tuple or list.
550+
"""
551+
env = os.environ.copy()
552+
device_setting: str = "0"
553+
num_gpus = 1
554+
if isinstance(gpu_id, str):
555+
device_setting = gpu_id
556+
num_gpus = 1
557+
elif isinstance(gpu_id, (tuple, list)):
558+
if len(gpu_id) == 0:
559+
raise ValueError("gpu_id tuple/list cannot be empty")
533560
if len(gpu_id) > 1:
534-
gpu_ids_str = ""
535-
for _i in range(len(gpu_id)):
536-
gpu_ids_str += f"{gpu_id[_i]},"
537-
device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str[:-1]}"
538-
else:
539-
device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id[0]}"
561+
device_setting = ",".join(str(x) for x in gpu_id)
562+
num_gpus = len(gpu_id)
563+
elif len(gpu_id) == 1:
564+
device_setting = str(gpu_id[0])
565+
num_gpus = 1
540566
else:
541-
device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}"
542-
num_gpus = 1 if isinstance(gpu_id, int) or len(gpu_id) == 1 else len(gpu_id)
543-
544-
cmd = (
545-
f"{device_setting} nnUNetv2_train "
546-
+ f"{self.dataset_name_or_id} {config} {fold} "
547-
+ f"-tr {self.trainer_class_name} -num_gpus {num_gpus}"
548-
)
567+
device_setting = str(gpu_id)
568+
num_gpus = 1
569+
env_cuda = env.get("CUDA_VISIBLE_DEVICES")
570+
if env_cuda is not None and device_setting == "0":
571+
logger.info(f"Using existing environment variable CUDA_VISIBLE_DEVICES='{env_cuda}'")
572+
else:
573+
env["CUDA_VISIBLE_DEVICES"] = device_setting
574+
575+
cmd = [
576+
"nnUNetv2_train",
577+
f"{self.dataset_name_or_id}",
578+
f"{config}",
579+
f"{fold}",
580+
"-tr",
581+
f"{self.trainer_class_name}",
582+
"-num_gpus",
583+
f"{num_gpus}",
584+
]
549585
if self.export_validation_probabilities:
550-
cmd += " --npz"
586+
cmd.append("--npz")
551587
for _key, _value in kwargs.items():
552588
if _key == "p" or _key == "pretrained_weights":
553-
cmd += f" -{_key} {_value}"
589+
cmd.extend([f"-{_key}", f"{_value}"])
554590
else:
555-
cmd += f" --{_key} {_value}"
556-
return cmd
591+
cmd.extend([f"--{_key}", f"{_value}"])
592+
return cmd, env
557593

558594
def train(
559595
self,
@@ -637,8 +673,8 @@ def train_parallel_cmd(
637673
if _config in ensure_tuple(configs):
638674
for _i in range(self.num_folds):
639675
the_device = gpu_id_for_all[_index % n_devices] # type: ignore
640-
cmd = self.train_single_model_command(_config, _i, the_device, kwargs)
641-
all_cmds[-1][the_device].append(cmd)
676+
cmd, env = self.train_single_model_command(_config, _i, the_device, kwargs)
677+
all_cmds[-1][the_device].append((cmd, env))
642678
_index += 1
643679
return all_cmds
644680

@@ -666,19 +702,21 @@ def train_parallel(
666702
for gpu_id, gpu_cmd in cmds.items():
667703
if not gpu_cmd:
668704
continue
705+
cmds_for_log = [shlex.join(cmd) for cmd, _ in gpu_cmd]
669706
logger.info(
670707
f"training - stage {s + 1}:\n"
671-
f"for gpu {gpu_id}, commands: {gpu_cmd}\n"
708+
f"for gpu {gpu_id}, commands: {cmds_for_log}\n"
672709
f"log '.txt' inside '{os.path.join(self.nnunet_results, self.dataset_name)}'"
673710
)
674711
for stage in all_cmds:
675712
processes = []
676713
for device_id in stage:
677714
if not stage[device_id]:
678715
continue
679-
cmd_str = "; ".join(stage[device_id])
716+
cmd_str = "; ".join(shlex.join(cmd) for cmd, _ in stage[device_id])
717+
env = stage[device_id][0][1]
680718
logger.info(f"Current running command on GPU device {device_id}:\n{cmd_str}\n")
681-
processes.append(subprocess.Popen(cmd_str, shell=True, stdout=subprocess.DEVNULL))
719+
processes.append(subprocess.Popen(cmd_str, shell=True, env=env, stdout=subprocess.DEVNULL))
682720
# finish this stage first
683721
for p in processes:
684722
p.wait()
@@ -779,7 +817,7 @@ def predict(
779817
part_id: int = 0,
780818
num_processes_preprocessing: int = -1,
781819
num_processes_segmentation_export: int = -1,
782-
gpu_id: int = 0,
820+
gpu_id: int | str = 0,
783821
) -> None:
784822
"""
785823
Use this to run inference with nnU-Net. This function is used when you want to manually specify a folder containing
@@ -813,9 +851,14 @@ def predict(
813851
num_processes_preprocessing: out-of-RAM issues.
814852
num_processes_segmentation_export: Number of processes used for segmentation export.
815853
More is not always better. Beware of out-of-RAM issues.
816-
gpu_id: which GPU to use for prediction.
854+
gpu_id: GPU device index (int) or MIG UUID (str) for prediction.
855+
If CUDA_VISIBLE_DEVICES is already set and gpu_id is 0, the existing
856+
environment variable is preserved.
817857
"""
818-
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
858+
if "CUDA_VISIBLE_DEVICES" in os.environ and gpu_id in {0, "0"}:
859+
logger.info(f"Predict: Using existing CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")
860+
else:
861+
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
819862

820863
from nnunetv2.inference.predict_from_raw_data import nnUNetPredictor
821864

0 commit comments

Comments (0)