 # limitations under the License.

 import re
+from typing import List

 import torch

 logger = logging.get_logger(__name__)


+def swap_scale_shift(weight):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
 def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", block_slice_pos=5):
     # 1. get all state_dict_keys
     all_keys = list(state_dict.keys())
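A quick illustration of the new module-level `swap_scale_shift` helper (a minimal sketch, not part of the diff; the toy tensor shapes are assumptions): modulation weights stored as `[shift, scale]` halves along dim 0 are reordered to the `[scale, shift]` layout used on the diffusers side.

```python
# Minimal sketch of swap_scale_shift: the first half of dim 0 ("shift") and the
# second half ("scale") trade places. Shapes chosen only for the demo.
import torch

def swap_scale_shift(weight):
    shift, scale = weight.chunk(2, dim=0)
    return torch.cat([scale, shift], dim=0)

w = torch.cat([torch.zeros(3, 4), torch.ones(3, 4)], dim=0)  # rows 0-2 "shift", rows 3-5 "scale"
swapped = swap_scale_shift(w)
assert torch.equal(swapped[:3], torch.ones(3, 4))   # scale half now comes first
assert torch.equal(swapped[3:], torch.zeros(3, 4))  # shift half moved to the back
```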
@@ -299,7 +306,9 @@ def _convert_text_encoder_lora_key(key, lora_name):
         key_to_replace = "lora_te2_"

     diffusers_name = key.replace(key_to_replace, "").replace("_", ".")
+
     diffusers_name = diffusers_name.replace("text.model", "text_model")
+    diffusers_name = diffusers_name.replace("position.embedding", "position_embedding")
     diffusers_name = diffusers_name.replace("self.attn", "self_attn")
     diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
     diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
@@ -313,6 +322,7 @@ def _convert_text_encoder_lora_key(key, lora_name):
     # Be aware that this is the new diffusers convention and the rest of the code might
     # not utilize it yet.
     diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+
     return diffusers_name

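For context, a small sketch (the input key is a made-up example) of why the new `position.embedding` replacement is needed: the blanket `_` to `.` substitution also splits legitimate module names that contain underscores, and they have to be stitched back together afterwards.

```python
# Illustrative only: the underscore-to-dot pass breaks "position_embedding",
# and the new replacement line restores it.
key = "lora_te1_text_model_embeddings_position_embedding"

diffusers_name = key.replace("lora_te1_", "").replace("_", ".")
# -> "text.model.embeddings.position.embedding"
diffusers_name = diffusers_name.replace("text.model", "text_model")
diffusers_name = diffusers_name.replace("position.embedding", "position_embedding")
print(diffusers_name)  # "text_model.embeddings.position_embedding"
```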
@@ -341,7 +351,8 @@ def _convert_to_ai_toolkit(sds_sd, ait_sd, sds_key, ait_key):

     # scale weight by alpha and dim
     rank = down_weight.shape[0]
-    alpha = sds_sd.pop(sds_key + ".alpha").item()  # alpha is scalar
+    default_alpha = torch.tensor(rank, dtype=down_weight.dtype, device=down_weight.device, requires_grad=False)
+    alpha = sds_sd.pop(sds_key + ".alpha", default_alpha).item()  # alpha is scalar
     scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here

     # calculate scale_down and scale_up to keep the same value. if scale is 4, scale_down is 2 and scale_up is 2
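A minimal sketch of what the new default does (the tensor shape and key name are assumptions for illustration): sd-scripts LoRAs are applied with an `alpha / rank` factor, so the converter folds that factor into the weights; when a checkpoint ships without `.alpha` entries, defaulting alpha to the rank leaves the weights untouched.

```python
# Illustrative only: defaulting alpha to the rank yields a neutral scale of 1.0
# when the ".alpha" tensor is missing from the checkpoint.
import torch

sds_sd = {}                               # pretend this checkpoint has no ".alpha" entries
down_weight = torch.randn(8, 64)          # a rank-8 LoRA "down" matrix (shape assumed)
rank = down_weight.shape[0]

default_alpha = torch.tensor(rank, dtype=down_weight.dtype, device=down_weight.device)
alpha = sds_sd.pop("some_key.alpha", default_alpha).item()
scale = alpha / rank                      # 1.0 -> nothing gets rescaled
print(scale)
```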
@@ -362,7 +373,10 @@ def _convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
     sd_lora_rank = down_weight.shape[0]

     # scale weight by alpha and dim
-    alpha = sds_sd.pop(sds_key + ".alpha")
+    default_alpha = torch.tensor(
+        sd_lora_rank, dtype=down_weight.dtype, device=down_weight.device, requires_grad=False
+    )
+    alpha = sds_sd.pop(sds_key + ".alpha", default_alpha)
     scale = alpha / sd_lora_rank

     # calculate scale_down and scale_up
@@ -516,10 +530,62 @@ def _convert_sd_scripts_to_ai_toolkit(sds_sd):
             f"transformer.single_transformer_blocks.{i}.norm.linear",
         )

+    # TODO: alphas.
+    if any("final_layer" in k for k in sds_sd):
+        for lora_key in ["lora_A", "lora_B"]:
+            orig_lora_key = "lora_down" if lora_key == "lora_A" else "lora_up"
+            # Notice the swap.
+            ait_sd[f"norm_out.linear.{lora_key}.weight"] = swap_scale_shift(
+                sds_sd.pop(f"lora_unet_final_layer_adaLN_modulation_1.{orig_lora_key}.weight")
+            )
+            ait_sd[f"proj_out.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_final_layer_linear.{orig_lora_key}.weight"
+            )
+
+    if any("guidance_in" in k for k in sds_sd):
+        for lora_key in ["lora_A", "lora_B"]:
+            orig_lora_key = "lora_down" if lora_key == "lora_A" else "lora_up"
+            ait_sd[f"time_text_embed.guidance_embedder.linear_1.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_guidance_in_in_layer.{orig_lora_key}.weight"
+            )
+            ait_sd[f"time_text_embed.guidance_embedder.linear_2.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_guidance_in_out_layer.{orig_lora_key}.weight"
+            )
+
+    if any("img_in" in k for k in sds_sd):
+        for lora_key in ["lora_A", "lora_B"]:
+            orig_lora_key = "lora_down" if lora_key == "lora_A" else "lora_up"
+            ait_sd[f"x_embedder.{lora_key}.weight"] = sds_sd.pop(f"lora_unet_img_in.{orig_lora_key}.weight")
+
+    if any("txt_in" in k for k in sds_sd):
+        for lora_key in ["lora_A", "lora_B"]:
+            orig_lora_key = "lora_down" if lora_key == "lora_A" else "lora_up"
+            ait_sd[f"context_embedder.{lora_key}.weight"] = sds_sd.pop(f"lora_unet_txt_in.{orig_lora_key}.weight")
+
+    if any("time_in" in k for k in sds_sd):
+        for lora_key in ["lora_A", "lora_B"]:
+            orig_lora_key = "lora_down" if lora_key == "lora_A" else "lora_up"
+            ait_sd[f"time_text_embed.timestep_embedder.linear_1.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_time_in_in_layer.{orig_lora_key}.weight"
+            )
+            ait_sd[f"time_text_embed.timestep_embedder.linear_2.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_time_in_out_layer.{orig_lora_key}.weight"
+            )
+
+    if any("vector_in" in k for k in sds_sd):
+        for lora_key in ["lora_A", "lora_B"]:
+            orig_lora_key = "lora_down" if lora_key == "lora_A" else "lora_up"
+            ait_sd[f"time_text_embed.text_embedder.linear_1.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_vector_in_in_layer.{orig_lora_key}.weight"
+            )
+            ait_sd[f"time_text_embed.text_embedder.linear_2.{lora_key}.weight"] = sds_sd.pop(
+                f"lora_unet_vector_in_out_layer.{orig_lora_key}.weight"
+            )
+
     remaining_keys = list(sds_sd.keys())
     te_state_dict = {}
     if remaining_keys:
-        if not all(k.startswith("lora_te") for k in remaining_keys):
+        if not all(k.startswith(("lora_te", "lora_te1")) for k in remaining_keys):
             raise ValueError(f"Incompatible keys detected: \n\n{', '.join(remaining_keys)}")
         for key in remaining_keys:
             if not key.endswith("lora_down.weight"):
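In other words, a summary sketch derived from the hunk above (shown only for the `lora_A`/`lora_down` direction): the new blocks perform straightforward key renames from the sd-scripts layout to the diffusers layout, for example:

```python
# Illustrative only: a few of the sd-scripts -> diffusers renames added above.
renames = {
    "lora_unet_final_layer_linear.lora_down.weight": "proj_out.lora_A.weight",
    "lora_unet_img_in.lora_down.weight": "x_embedder.lora_A.weight",
    "lora_unet_txt_in.lora_down.weight": "context_embedder.lora_A.weight",
    "lora_unet_time_in_in_layer.lora_down.weight": "time_text_embed.timestep_embedder.linear_1.lora_A.weight",
    "lora_unet_vector_in_out_layer.lora_down.weight": "time_text_embed.text_embedder.linear_2.lora_A.weight",
}
```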
@@ -680,10 +746,59 @@ def _convert(original_key, diffusers_key, state_dict, new_state_dict):
     if has_peft_state_dict:
         state_dict = {k: v for k, v in state_dict.items() if k.startswith("transformer.")}
         return state_dict
+
     # Another weird one.
     has_mixture = any(
         k.startswith("lora_transformer_") and ("lora_down" in k or "lora_up" in k or "alpha" in k) for k in state_dict
     )
+
+    # ComfyUI.
+    state_dict = {k.replace("diffusion_model.", "lora_unet."): v for k, v in state_dict.items()}
+    state_dict = {k.replace("text_encoders.clip_l.transformer.", "lora_te."): v for k, v in state_dict.items()}
+    has_t5xxl = any(k.startswith("text_encoders.t5xxl.transformer.") for k in state_dict)
+    if has_t5xxl:
+        logger.info(
+            "T5-xxl keys found in the state dict, which are currently unsupported. We will filter them out. "
+            "Open an issue if this is a problem - https://github.com/huggingface/diffusers/issues/new."
+        )
+        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("text_encoders.t5xxl.transformer.")}
+
+    any_diffb_keys = any("diff_b" in k and k.startswith(("lora_unet.", "lora_te.")) for k in state_dict)
+    if any_diffb_keys:
+        logger.info(
+            "`diff_b` keys found in the state dict which are currently unsupported. "
+            "So, we will filter out those keys. Open an issue if this is a problem - "
+            "https://github.com/huggingface/diffusers/issues/new."
+        )
+        state_dict = {k: v for k, v in state_dict.items() if "diff_b" not in k}
+
+    any_norm_diff_keys = any("norm" in k and "diff" in k for k in state_dict)
+    if any_norm_diff_keys:
+        logger.info(
+            "Normalization diff keys found in the state dict which are currently unsupported. "
+            "So, we will filter out those keys. Open an issue if this is a problem - "
+            "https://github.com/huggingface/diffusers/issues/new."
+        )
+        state_dict = {k: v for k, v in state_dict.items() if "norm" not in k and "diff" not in k}
+
+    limit_substrings = ["lora_down", "lora_up"]
+    if any("alpha" in k for k in state_dict):
+        limit_substrings.append("alpha")
+
+    state_dict = {
+        _custom_replace(k, limit_substrings): v
+        for k, v in state_dict.items()
+        if k.startswith(("lora_unet.", "lora_te."))
+    }
+
+    if any("text_projection" in k for k in state_dict):
+        logger.info(
+            "`text_projection` keys found in the state_dict which are unexpected. "
+            "So, we will filter out those keys. Open an issue if this is a problem - "
+            "https://github.com/huggingface/diffusers/issues/new."
+        )
+        state_dict = {k: v for k, v in state_dict.items() if "text_projection" not in k}
+
     if has_mixture:
         return _convert_mixture_state_dict_to_diffusers(state_dict)

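A minimal sketch of what the new ComfyUI branch does to a few keys before the rest of the converter runs (the key names below are made up for the example): the ComfyUI prefixes are remapped to the sd-scripts-style `lora_unet.`/`lora_te.` prefixes, and unsupported T5-XXL keys are dropped.

```python
# Illustrative only: prefix remapping plus filtering of unsupported key families,
# mirroring the new ComfyUI handling above.
state_dict = {
    "diffusion_model.double_blocks.0.img_attn.qkv.lora_down.weight": 0,
    "text_encoders.clip_l.transformer.text_model.encoder.layers.0.self_attn.q_proj.lora_down.weight": 0,
    "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.q.lora_down.weight": 0,
}

state_dict = {k.replace("diffusion_model.", "lora_unet."): v for k, v in state_dict.items()}
state_dict = {k.replace("text_encoders.clip_l.transformer.", "lora_te."): v for k, v in state_dict.items()}
# T5-XXL keys are filtered out because they are not supported yet.
state_dict = {k: v for k, v in state_dict.items() if not k.startswith("text_encoders.t5xxl.transformer.")}

print(sorted(state_dict))
# ['lora_te.text_model.encoder.layers.0.self_attn.q_proj.lora_down.weight',
#  'lora_unet.double_blocks.0.img_attn.qkv.lora_down.weight']
```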
@@ -798,6 +913,23 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
     return new_state_dict


+def _custom_replace(key: str, substrings: List[str]) -> str:
+    pattern = "(" + "|".join(re.escape(sub) for sub in substrings) + ")"
+
+    match = re.search(pattern, key)
+    if match:
+        start_sub = match.start()
+        if start_sub > 0 and key[start_sub - 1] == ".":
+            boundary = start_sub - 1
+        else:
+            boundary = start_sub
+        left = key[:boundary].replace(".", "_")
+        right = key[boundary:]
+        return left + right
+    else:
+        return key.replace(".", "_")
+
+
 def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
     converted_state_dict = {}
     original_state_dict_keys = list(original_state_dict.keys())
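To show how `_custom_replace` behaves end to end (a self-contained sketch that simply restates the helper from the hunk above, with an example key assumed for illustration): it rewrites the dotted prefix of a key into underscores but leaves everything from the first `lora_down`/`lora_up`/`alpha` substring onwards untouched.

```python
# Illustrative only: dots before the matched substring become underscores,
# while the ".lora_down.weight" suffix is preserved as-is.
import re
from typing import List

def _custom_replace(key: str, substrings: List[str]) -> str:
    pattern = "(" + "|".join(re.escape(sub) for sub in substrings) + ")"
    match = re.search(pattern, key)
    if match:
        start_sub = match.start()
        boundary = start_sub - 1 if start_sub > 0 and key[start_sub - 1] == "." else start_sub
        return key[:boundary].replace(".", "_") + key[boundary:]
    return key.replace(".", "_")

key = "lora_unet.double_blocks.0.img_attn.qkv.lora_down.weight"
print(_custom_replace(key, ["lora_down", "lora_up"]))
# lora_unet_double_blocks_0_img_attn_qkv.lora_down.weight
```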
@@ -806,11 +938,6 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
     inner_dim = 3072
     mlp_ratio = 4.0

-    def swap_scale_shift(weight):
-        shift, scale = weight.chunk(2, dim=0)
-        new_weight = torch.cat([scale, shift], dim=0)
-        return new_weight
-
     for lora_key in ["lora_A", "lora_B"]:
         ## time_text_embed.timestep_embedder <- time_in
         converted_state_dict[