Skip to content

Commit f6ae111

Browse files
committed
fix(diffusers): use correct tensor name prefixes for SDXL text encoders
Problem: When loading SDXL models in diffusers directory format, the text encoders were loaded with the prefixes "te." and "te.1.", which do not match the tensor names expected by the model graph. The model expects "cond_stage_model.transformer." for clip_l and "cond_stage_model.1.transformer." for clip_g. This caused "tensor not in model file" errors for all text encoder tensors when loading SDXL diffusers models.

Solution:
- Changed the text_encoder prefix from "te." to "cond_stage_model.transformer."
- Changed the text_encoder_2 prefix from "te.1." to "cond_stage_model.1.transformer."
- These prefixes now match the ones used when loading separate clip_l/clip_g files
- Added an early return in get_sd_version() when SDXL is detected, to prevent later components (VAE) from overriding the version
- Added version caching to prevent re-detection from changing the SDXL version

Testing:
- SDXL diffusers models now load successfully
- SD 1.5 models continue to work (regression tested)
- All text encoder tensors are found and loaded correctly

Files changed:
- model.cpp: Updated diffusers text encoder prefixes and SDXL detection logic
- stable-diffusion.cpp: Added version caching to preserve SDXL detection
1 parent 636d3cb commit f6ae111

File tree

2 files changed

+23
-4
lines changed

2 files changed

+23
-4
lines changed

src/model.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -655,11 +655,11 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
655655
LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
656656
// return false;
657657
}
658-
if (!init_from_safetensors_file(clip_path, "te.")) {
658+
if (!init_from_safetensors_file(clip_path, "cond_stage_model.transformer.")) {
659659
LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
660660
// return false;
661661
}
662-
if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
662+
if (!init_from_safetensors_file(clip_g_path, "cond_stage_model.1.transformer.")) {
663663
LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
664664
}
665665
return true;
@@ -1028,6 +1028,11 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
10281028

10291029
SDVersion ModelLoader::get_sd_version() {
10301030
TensorStorage token_embedding_weight, input_block_weight;
1031+
// Return cached version if already detected as SDXL in earlier component
1032+
if (version_ == VERSION_SDXL || version_ == VERSION_SDXL_INPAINT || version_ == VERSION_SDXL_PIX2PIX) {
1033+
LOG_DEBUG("Returning cached SDXL version");
1034+
return version_;
1035+
}
10311036

10321037
bool has_multiple_encoders = false;
10331038
bool is_unet = false;
@@ -1089,8 +1094,10 @@ SDVersion ModelLoader::get_sd_version() {
10891094
tensor_storage.name.find("cond_stage_model.1") != std::string::npos ||
10901095
tensor_storage.name.find("te.1") != std::string::npos) {
10911096
has_multiple_encoders = true;
1097+
// Return SDXL immediately to prevent later components from overriding
10921098
if (is_unet) {
1093-
is_xl = true;
1099+
LOG_DEBUG("Detected SDXL (multiple text encoders in UNET model)");
1100+
return VERSION_SDXL;
10941101
}
10951102
}
10961103
if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
@@ -1122,6 +1129,11 @@ SDVersion ModelLoader::get_sd_version() {
11221129
input_block_weight = tensor_storage;
11231130
}
11241131
}
1132+
1133+
// Ensure SDXL is detected even if early return was not reached
1134+
if (has_multiple_encoders && is_unet) {
1135+
is_xl = true;
1136+
}
11251137
if (is_wan) {
11261138
LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels);
11271139
if (patch_embedding_channels == 184320 && !has_img_emb) {

src/stable-diffusion.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,13 @@ class StableDiffusionGGML {
326326

327327
model_loader.convert_tensors_name();
328328

329-
version = model_loader.get_sd_version();
329+
// SDXL_FIX: Don't overwrite if already detected as SDXL in earlier component
330+
SDVersion detected_version = model_loader.get_sd_version();
331+
if (version != VERSION_SDXL && version != VERSION_SDXL_INPAINT && version != VERSION_SDXL_PIX2PIX) {
332+
version = detected_version;
333+
} else {
334+
LOG_INFO("Keeping previous SDXL version, detected version: %s", model_version_to_str[detected_version]);
335+
}
330336
if (version == VERSION_COUNT) {
331337
LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
332338
return false;
@@ -335,6 +341,7 @@ class StableDiffusionGGML {
335341
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
336342

337343
LOG_INFO("Version: %s ", model_version_to_str[version]);
344+
338345
ggml_type wtype = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
339346
? (ggml_type)sd_ctx_params->wtype
340347
: GGML_TYPE_COUNT;

0 commit comments

Comments
 (0)