fix(diffusers): use correct tensor name prefixes for SDXL text encoders

fewtarius · fewtarius · commit f6ae11169012 · 2026-02-16T17:15:28.000-05:00
Problem:
When loading SDXL models in diffusers directory format, the text encoders were
loaded with the prefixes "te." and "te.1.", which don't match the tensor names
expected by the model graph. The model expects "cond_stage_model.transformer."
for clip_l and "cond_stage_model.1.transformer." for clip_g.

This caused "tensor not in model file" errors for all text encoder tensors
when loading SDXL diffusers models.

Solution:
- Changed text_encoder prefix from "te." to "cond_stage_model.transformer."
- Changed text_encoder_2 prefix from "te.1." to "cond_stage_model.1.transformer."
- These prefixes now match what's used when loading separate clip_l/clip_g files
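- Added early return in get_sd_version() as soon as SDXL is detected (a second
  text encoder is seen while is_unet is already set), plus a post-loop fallback
  for when the early return is not reached, to prevent later components (VAE)
  from overriding the version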
- Added version caching in both get_sd_version() and the StableDiffusionGGML
  loading code to prevent re-detection from changing an already-detected SDXL
  version

Testing:
- SDXL diffusers models now load successfully
- SD 1.5 models continue to work (regression tested)
- All text encoder tensors are found and loaded correctly

Files changed:
- src/model.cpp: Updated diffusers text encoder prefixes and SDXL detection logic
- src/stable-diffusion.cpp: Added version caching to preserve SDXL detection
diff --git a/src/model.cpp b/src/model.cpp
@@ -655,11 +655,11 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
         LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
         // return false;
     }
-    if (!init_from_safetensors_file(clip_path, "te.")) {
+    if (!init_from_safetensors_file(clip_path, "cond_stage_model.transformer.")) {
         LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
         // return false;
     }
-    if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
+    if (!init_from_safetensors_file(clip_g_path, "cond_stage_model.1.transformer.")) {
         LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
     }
     return true;
@@ -1028,6 +1028,11 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 
 SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
+    // Return cached version if already detected as SDXL in earlier component
+    if (version_ == VERSION_SDXL || version_ == VERSION_SDXL_INPAINT || version_ == VERSION_SDXL_PIX2PIX) {
+        LOG_DEBUG("Returning cached SDXL version");
+        return version_;
+    }
 
     bool has_multiple_encoders = false;
     bool is_unet               = false;
@@ -1089,8 +1094,10 @@ SDVersion ModelLoader::get_sd_version() {
                 tensor_storage.name.find("cond_stage_model.1") != std::string::npos ||
                 tensor_storage.name.find("te.1") != std::string::npos) {
                 has_multiple_encoders = true;
+                // Return SDXL immediately to prevent later components from overriding
                 if (is_unet) {
-                    is_xl = true;
+                    LOG_DEBUG("Detected SDXL (multiple text encoders in UNET model)");
+                    return VERSION_SDXL;
                 }
             }
             if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
@@ -1122,6 +1129,11 @@ SDVersion ModelLoader::get_sd_version() {
             input_block_weight = tensor_storage;
         }
     }
+    
+    // Ensure SDXL is detected even if early return was not reached
+    if (has_multiple_encoders && is_unet) {
+        is_xl = true;
+    }
     if (is_wan) {
         LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels);
         if (patch_embedding_channels == 184320 && !has_img_emb) {
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
@@ -326,7 +326,13 @@ class StableDiffusionGGML {
 
         model_loader.convert_tensors_name();
 
-        version = model_loader.get_sd_version();
+        // SDXL_FIX: Don't overwrite if already detected as SDXL in earlier component
+        SDVersion detected_version = model_loader.get_sd_version();
+        if (version != VERSION_SDXL && version != VERSION_SDXL_INPAINT && version != VERSION_SDXL_PIX2PIX) {
+            version = detected_version;
+        } else {
+            LOG_INFO("Keeping previous SDXL version, detected version: %s", model_version_to_str[detected_version]);
+        }
         if (version == VERSION_COUNT) {
             LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
             return false;
@@ -335,6 +341,7 @@ class StableDiffusionGGML {
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
 
         LOG_INFO("Version: %s ", model_version_to_str[version]);
+        
         ggml_type wtype               = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
                                             ? (ggml_type)sd_ctx_params->wtype
                                             : GGML_TYPE_COUNT;