Skip to content

Commit ede67f7

Browse files
NickCao and claude committed
[Enhancement] Support RunAI Model Streamer for diffusion weight loading
Add enable_runai_streamer flag to OmniDiffusionConfig so diffusion models can use the runai_model_streamer library for streaming safetensors weights, matching the support already available in the LLM weight loading path. Co-authored-by: Claude <noreply@anthropic.com> Signed-off-by: Nick Cao <ncao@redhat.com>
1 parent d2b9f9f commit ede67f7

File tree

3 files changed

+16
-1
lines changed

3 files changed

+16
-1
lines changed

vllm_omni/diffusion/data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@ class OmniDiffusionConfig:
412412
# Parallel weight loading (for faster diffusion model startup)
413413
enable_multithread_weight_load: bool = True
414414
num_weight_load_threads: int = 4
415+
enable_runai_streamer: bool = False
415416

416417
# Enable sleep mode
417418
enable_sleep_mode: bool = False

vllm_omni/diffusion/model_loader/diffusers_loader.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
filter_files_not_needed_for_inference,
2525
maybe_download_from_modelscope,
2626
multi_thread_safetensors_weights_iterator,
27+
runai_safetensors_weights_iterator,
2728
safetensors_weights_iterator,
2829
)
2930
from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -180,13 +181,20 @@ def _get_weights_iterator(self, source: "ComponentSource") -> Generator[tuple[st
180181
)
181182

182183
od_config = self.od_config
184+
use_runai = use_safetensors and od_config is not None and getattr(od_config, "enable_runai_streamer", False)
183185
use_multithread = (
184186
use_safetensors
185187
and od_config is not None
186188
and getattr(od_config, "enable_multithread_weight_load", False)
187189
and self.load_config.safetensors_load_strategy != "torchao"
188190
)
189-
if use_multithread:
191+
if use_runai:
192+
sorted_hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
193+
weights_iterator = runai_safetensors_weights_iterator(
194+
sorted_hf_weights_files,
195+
self.load_config.use_tqdm_on_load,
196+
)
197+
elif use_multithread:
190198
num_threads = getattr(od_config, "num_weight_load_threads", 4)
191199
# Keep deterministic shard order before passing to vLLM helper.
192200
sorted_hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)

vllm_omni/entrypoints/cli/serve.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,12 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu
294294
default=4,
295295
help="Number of threads for parallel weight loading (default: 4).",
296296
)
297+
omni_config_group.add_argument(
298+
"--enable-runai-streamer",
299+
action="store_true",
300+
default=False,
301+
help="Use RunAI Model Streamer for loading diffusion safetensors weights.",
302+
)
297303

298304
# diffusion model offload parameters
299305
omni_config_group.add_argument(

0 commit comments

Comments (0)