|
14 | 14 |
|
15 | 15 | import glob |
16 | 16 | import os |
| 17 | +import shlex |
17 | 18 | import subprocess |
18 | 19 | from typing import Any |
19 | 20 |
|
@@ -486,16 +487,16 @@ def plan_and_process( |
486 | 487 | if not no_pp: |
487 | 488 | self.preprocess(c, n_proc, overwrite_plans_name, verbose) |
488 | 489 |
|
489 | | - def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int = 0, **kwargs: Any) -> None: |
| 490 | + def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int | str = 0, **kwargs: Any) -> None: |
490 | 491 | """ |
491 | 492 | Run the training on a single GPU with one specified configuration provided. |
492 | | - Note: this will override the environment variable `CUDA_VISIBLE_DEVICES`. |
| 493 | + Note: if CUDA_VISIBLE_DEVICES is already set and gpu_id resolves to 0, the existing value is preserved; |
| 494 | + otherwise it is set to gpu_id. |
493 | 495 |
|
494 | 496 | Args: |
495 | 497 | config: configuration that should be trained. Examples: "2d", "3d_fullres", "3d_lowres". |
496 | 498 | fold: fold of the 5-fold cross-validation. Should be an int between 0 and 4. |
497 | | - gpu_id: an integer to select the device to use, or a tuple/list of GPU device indices used for multi-GPU |
498 | | - training (e.g., (0,1)). Default: 0. |
| 499 | + gpu_id: an int, MIG UUID (str), or tuple/list of GPU indices for multi-GPU training (e.g., (0,1)). Default: 0. |
499 | 500 | kwargs: this optional parameter allows you to specify additional arguments in |
500 | 501 | ``nnunetv2.run.run_training.run_training_entry``. |
501 | 502 |
|
@@ -525,35 +526,70 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int |
525 | 526 | kwargs.pop("npz") |
526 | 527 | logger.warning("please specify the `export_validation_probabilities` in the __init__ of `nnUNetV2Runner`.") |
527 | 528 |
|
528 | | - cmd = self.train_single_model_command(config, fold, gpu_id, kwargs) |
529 | | - run_cmd(cmd, shell=True) |
| 529 | + cmd, env = self.train_single_model_command(config, fold, gpu_id, kwargs) |
| 530 | + run_cmd(cmd, env=env) |
530 | 531 |
|
531 | | - def train_single_model_command(self, config, fold, gpu_id, kwargs): |
532 | | - if isinstance(gpu_id, (tuple, list)): |
| 532 | + def train_single_model_command( |
| 533 | + self, config: str, fold: int, gpu_id: int | str | tuple | list, kwargs: dict[str, Any] |
| 534 | + ) -> tuple[list[str], dict[str, str]]: |
| 535 | + """ |
| 536 | +    Build the argv command list and subprocess environment for training a single nnU-Net model. |
| 537 | +
|
| 538 | + Args: |
| 539 | + config: Configuration name (e.g., "3d_fullres"). |
| 540 | + fold: Cross-validation fold index (0-4). |
| 541 | + gpu_id: Device selector—int, str (MIG UUID), or tuple/list for multi-GPU. |
| 542 | + kwargs: Additional CLI arguments forwarded to nnUNetv2_train. |
| 543 | +
|
| 544 | + Returns: |
| 545 | + Tuple of (cmd, env) where cmd is a list[str] of argv entries and env is a dict[str, str] |
| 546 | + passed to the subprocess. |
| 547 | +
|
| 548 | + Raises: |
| 549 | + ValueError: If gpu_id is an empty tuple or list. |
| 550 | + """ |
| 551 | + env = os.environ.copy() |
| 552 | + device_setting: str = "0" |
| 553 | + num_gpus = 1 |
| 554 | + if isinstance(gpu_id, str): |
| 555 | + device_setting = gpu_id |
| 556 | + num_gpus = 1 |
| 557 | + elif isinstance(gpu_id, (tuple, list)): |
| 558 | + if len(gpu_id) == 0: |
| 559 | + raise ValueError("gpu_id tuple/list cannot be empty") |
533 | 560 | if len(gpu_id) > 1: |
534 | | - gpu_ids_str = "" |
535 | | - for _i in range(len(gpu_id)): |
536 | | - gpu_ids_str += f"{gpu_id[_i]}," |
537 | | - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str[:-1]}" |
538 | | - else: |
539 | | - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id[0]}" |
| 561 | + device_setting = ",".join(str(x) for x in gpu_id) |
| 562 | + num_gpus = len(gpu_id) |
| 563 | + elif len(gpu_id) == 1: |
| 564 | + device_setting = str(gpu_id[0]) |
| 565 | + num_gpus = 1 |
540 | 566 | else: |
541 | | - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" |
542 | | - num_gpus = 1 if isinstance(gpu_id, int) or len(gpu_id) == 1 else len(gpu_id) |
543 | | - |
544 | | - cmd = ( |
545 | | - f"{device_setting} nnUNetv2_train " |
546 | | - + f"{self.dataset_name_or_id} {config} {fold} " |
547 | | - + f"-tr {self.trainer_class_name} -num_gpus {num_gpus}" |
548 | | - ) |
| 567 | + device_setting = str(gpu_id) |
| 568 | + num_gpus = 1 |
| 569 | + env_cuda = env.get("CUDA_VISIBLE_DEVICES") |
| 570 | + if env_cuda is not None and device_setting == "0": |
| 571 | + logger.info(f"Using existing environment variable CUDA_VISIBLE_DEVICES='{env_cuda}'") |
| 572 | + else: |
| 573 | + env["CUDA_VISIBLE_DEVICES"] = device_setting |
| 574 | + |
| 575 | + cmd = [ |
| 576 | + "nnUNetv2_train", |
| 577 | + f"{self.dataset_name_or_id}", |
| 578 | + f"{config}", |
| 579 | + f"{fold}", |
| 580 | + "-tr", |
| 581 | + f"{self.trainer_class_name}", |
| 582 | + "-num_gpus", |
| 583 | + f"{num_gpus}", |
| 584 | + ] |
549 | 585 | if self.export_validation_probabilities: |
550 | | - cmd += " --npz" |
| 586 | + cmd.append("--npz") |
551 | 587 | for _key, _value in kwargs.items(): |
552 | 588 | if _key == "p" or _key == "pretrained_weights": |
553 | | - cmd += f" -{_key} {_value}" |
| 589 | + cmd.extend([f"-{_key}", f"{_value}"]) |
554 | 590 | else: |
555 | | - cmd += f" --{_key} {_value}" |
556 | | - return cmd |
| 591 | + cmd.extend([f"--{_key}", f"{_value}"]) |
| 592 | + return cmd, env |
557 | 593 |
|
558 | 594 | def train( |
559 | 595 | self, |
@@ -637,8 +673,8 @@ def train_parallel_cmd( |
637 | 673 | if _config in ensure_tuple(configs): |
638 | 674 | for _i in range(self.num_folds): |
639 | 675 | the_device = gpu_id_for_all[_index % n_devices] # type: ignore |
640 | | - cmd = self.train_single_model_command(_config, _i, the_device, kwargs) |
641 | | - all_cmds[-1][the_device].append(cmd) |
| 676 | + cmd, env = self.train_single_model_command(_config, _i, the_device, kwargs) |
| 677 | + all_cmds[-1][the_device].append((cmd, env)) |
642 | 678 | _index += 1 |
643 | 679 | return all_cmds |
644 | 680 |
|
@@ -666,19 +702,21 @@ def train_parallel( |
666 | 702 | for gpu_id, gpu_cmd in cmds.items(): |
667 | 703 | if not gpu_cmd: |
668 | 704 | continue |
| 705 | + cmds_for_log = [shlex.join(cmd) for cmd, _ in gpu_cmd] |
669 | 706 | logger.info( |
670 | 707 | f"training - stage {s + 1}:\n" |
671 | | - f"for gpu {gpu_id}, commands: {gpu_cmd}\n" |
| 708 | + f"for gpu {gpu_id}, commands: {cmds_for_log}\n" |
672 | 709 | f"log '.txt' inside '{os.path.join(self.nnunet_results, self.dataset_name)}'" |
673 | 710 | ) |
674 | 711 | for stage in all_cmds: |
675 | 712 | processes = [] |
676 | 713 | for device_id in stage: |
677 | 714 | if not stage[device_id]: |
678 | 715 | continue |
679 | | - cmd_str = "; ".join(stage[device_id]) |
| 716 | + cmd_str = "; ".join(shlex.join(cmd) for cmd, _ in stage[device_id]) |
| 717 | + env = stage[device_id][0][1] |
680 | 718 | logger.info(f"Current running command on GPU device {device_id}:\n{cmd_str}\n") |
681 | | - processes.append(subprocess.Popen(cmd_str, shell=True, stdout=subprocess.DEVNULL)) |
| 719 | + processes.append(subprocess.Popen(cmd_str, shell=True, env=env, stdout=subprocess.DEVNULL)) |
682 | 720 | # finish this stage first |
683 | 721 | for p in processes: |
684 | 722 | p.wait() |
@@ -779,7 +817,7 @@ def predict( |
779 | 817 | part_id: int = 0, |
780 | 818 | num_processes_preprocessing: int = -1, |
781 | 819 | num_processes_segmentation_export: int = -1, |
782 | | - gpu_id: int = 0, |
| 820 | + gpu_id: int | str = 0, |
783 | 821 | ) -> None: |
784 | 822 | """ |
785 | 823 | Use this to run inference with nnU-Net. This function is used when you want to manually specify a folder containing |
@@ -813,9 +851,14 @@ def predict( |
813 | 851 | num_processes_preprocessing: out-of-RAM issues. |
814 | 852 | num_processes_segmentation_export: Number of processes used for segmentation export. |
815 | 853 | More is not always better. Beware of out-of-RAM issues. |
816 | | - gpu_id: which GPU to use for prediction. |
| 854 | + gpu_id: GPU device index (int) or MIG UUID (str) for prediction. |
| 855 | + If CUDA_VISIBLE_DEVICES is already set and gpu_id is 0, the existing |
| 856 | + environment variable is preserved. |
817 | 857 | """ |
818 | | - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" |
| 858 | + if "CUDA_VISIBLE_DEVICES" in os.environ and gpu_id in {0, "0"}: |
| 859 | + logger.info(f"Predict: Using existing CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}") |
| 860 | + else: |
| 861 | + os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" |
819 | 862 |
|
820 | 863 | from nnunetv2.inference.predict_from_raw_data import nnUNetPredictor |
821 | 864 |
|
|
0 commit comments